In [None]:
import pandas as pd
import numpy as np
import glob
from scipy.special import digamma, gamma
from numpy.random import dirichlet
import glob
from tqdm import tqdm

# Experimentation

In [None]:
# we start by loading an example term_sentence matrix
ts_matrix_list = sorted(
    glob.glob('/home/joao/Thesis/ts_matrices_original/*.p'))
this_file = ts_matrix_list[0]

In [None]:
term_sentence_document_matrix = pd.read_pickle(this_file)

#from it we create the term_document matrix and the term-sentence matrix
B = term_sentence_document_matrix.copy()
B_keys = B[['this_file_name', 'sentence_order', 'word_count']]
B.drop(
    columns=['this_file_name', 'sentence_order', 'word_count'],
    inplace=True)
B = B.fillna(0).as_matrix().transpose()
B = B / B.sum(axis=0)

# and from the term_sentence_document_matrix we create the term_document_matrix
A = term_sentence_document_matrix.fillna(0).drop(
    columns=['sentence_order']).groupby(
        'this_file_name').sum().as_matrix().transpose()

# I'll allow 40 topics to be present at any time, so
K = 14
alfa_v = np.random.rand(A.shape[1], K)
alfa_u = np.random.rand(B.shape[1], K)

alfa_v[:, :] = alfa_v[0, 0]
alfa_u[:, :] = alfa_v[0, 0] * K / B.shape[1]
# we now create samples from dirichlet
# we must then start by these matrices randomly UsxK: and VdxK
U = np.random.rand(B.shape[1], K)
V = np.random.rand(A.shape[1], K)
# normalizing each column of U and V
U = U / U.sum(axis=0)

V = V / V.sum(axis=1)[:, np.newaxis]

# since we don't know what alfa is supposed to mean, we also randomly initialize it

In [None]:
# we then define the update equations:
def update_U(A, B, U, V, alfa_u):
    C = A / np.matmul(np.matmul(B, U), V.transpose())
    new_U = np.multiply(U, np.matmul(np.matmul(B.transpose(), C), V)) + alfa_u
    return new_U / new_U.sum(axis=0)


def update_V(A, B, U, V, alfa_v):
    C = A / np.matmul(np.matmul(B, U), V.transpose())
    new_V = np.multiply(V, np.matmul(np.matmul(C.transpose(), B), U)) + alfa_v
    return new_V / new_V.sum(axis=1)[:, np.newaxis]

In [None]:
diff_u = 1000
U_old = U
it = 0
while (diff_u > 0.000000001):
    U = update_U(A, B, U, V, alfa_u)
    V = update_V(A, B, U, V, alfa_v)
    if (it % 50 == 0):
        diff_u = np.abs(U_old - U).sum()
        U_old = U
    it += 1
print(it, diff_u)

In [None]:
# loading all raw sentences
sentence_banks = sorted(glob.glob('/home/joao/Thesis/sentence_bank/*.p'))

In [None]:
sentences = pd.read_pickle(sentence_banks[0])

In [None]:
# sentence selection process:
# most likely topic to represent a document:
probabilities = pd.Series(V.flatten())
probabilities = probabilities.sort_values(ascending=False)
topics = []
for i in probabilities.unique():
    topics.extend(np.where(V == i)[1])
#     print(i,np.where(V == i)[1])

In [None]:
U_df = pd.DataFrame(U)
selected_sentences_index = []
selected_sentences = []
total_words = 0
for topic in topics:
    subset = U_df.iloc[:, topic].copy()
    subset.sort_values(ascending=False, inplace=True)
    subset = subset[~subset.index.isin(selected_sentences_index)]
    selected_sentence = sentences.sentence[subset.index[0]]
    total_words += len(selected_sentence.split(' '))
    if (total_words < 400):
        selected_sentences_index.append(subset.index[0])
        selected_sentences.append(selected_sentence)
    else:
        break

In [None]:
abstract = ''
for i in selected_sentences:
    abstract += i

In [None]:
abstract

# Implementation

In [None]:
import pandas as pd
import numpy as np
import glob
from scipy.special import digamma, gamma
from numpy.random import dirichlet
import glob
from tqdm import tqdm
from joblib import Parallel, delayed


# wrapping it all into a function:
# we then define the update equations:
def update_U(A, B, U, V, alfa_u):
    C = A / np.matmul(np.matmul(B, U), V.transpose())
    new_U = np.multiply(U, np.matmul(np.matmul(B.transpose(), C), V)) + alfa_u
    return new_U / new_U.sum(axis=0)


def update_V(A, B, U, V, alfa_v):
    C = A / np.matmul(np.matmul(B, U), V.transpose())
    new_V = np.multiply(V, np.matmul(np.matmul(C.transpose(), B), U)) + alfa_v
    return new_V / new_V.sum(axis=1)[:, np.newaxis]


def summarize_FGB(filename):
    # we start by loading an example term_sentence matrix
    this_file = filename

    term_sentence_document_matrix = pd.read_pickle(this_file)

    #from it we create the term_document matrix and the term-sentence matrix
    B = term_sentence_document_matrix.copy()
    B_keys = B[['this_file_name', 'sentence_order', 'word_count']]
    B.drop(
        columns=['this_file_name', 'sentence_order', 'word_count'],
        inplace=True)
    B = B.fillna(0).as_matrix().transpose()
    B = B / B.sum(axis=0)

    # and from the term_sentence_document_matrix we create the term_document_matrix
    A = term_sentence_document_matrix.fillna(0).drop(
        columns=['sentence_order', 'word_count']).groupby(
            'this_file_name').sum().as_matrix().transpose()
    #     return A,B
    # I'll allow 40 topics to be present at any time, so
    K = 20
    alfa_v = np.random.rand(A.shape[1], K)
    alfa_u = np.random.rand(B.shape[1], K)

    alfa_v[:, :] = alfa_v[0, 0]
    alfa_u[:, :] = alfa_v[0, 0] * K / B.shape[1]
    # we must then start by these matrices randomly UsxK: and VdxK
    U = np.random.rand(B.shape[1], K)
    V = np.random.rand(A.shape[1], K)
    # normalizing each column of U and V
    U = U / U.sum(axis=0)

    V = V / V.sum(axis=1)[:, np.newaxis]

    diff_u = 1000
    U_old = U
    it = 0
    while (diff_u > 0.00001):

        U = update_U(A, B, U, V, alfa_u)
        V = update_V(A, B, U, V, alfa_v)
        if (it % 50 == 0):
            diff_u = np.abs(U_old - U).sum()
            U_old = U
        it += 1
    print(it, diff_u)
    # loading all raw sentences
    sentence_bank_file = '/home/joao/Thesis/sentence_bank/' + this_file.split(
        '/')[-1]
    sentences = pd.read_pickle(sentence_bank_file)
    # sentence selection process:
    # most likely topic to represent a document:
    probabilities = pd.Series(V.flatten())
    probabilities = probabilities.sort_values(ascending=False)
    topics = []
    for i in probabilities.unique():
        topics.extend(np.where(V == i)[1])
    U_df = pd.DataFrame(U)
    selected_sentences_index = []
    selected_sentences = []
    total_words = 0
    for topic in topics:
        subset = U_df.iloc[:, topic].copy()
        subset.sort_values(ascending=False, inplace=True)
        subset = subset[~subset.index.isin(selected_sentences_index)]
        selected_sentence = sentences.sentence[subset.index[0]]
        total_words += len(selected_sentence.split(' '))
        if (total_words < 400):
            selected_sentences_index.append(subset.index[0])
            selected_sentences.append(selected_sentence)
        else:
            break
    abstract = ''
    for i in selected_sentences:
        abstract += i
    return abstract, filename.split('/')[-1]

In [None]:
%%time
ts_matrix_list = sorted(
    glob.glob('/home/joao/Thesis/ts_matrices_original/*.p'))
summaries = []
files = []

In [None]:
results = Parallel(
    n_jobs=8, verbose=11)(delayed(summarize_FGB)(ts_matrix_file)
                          for ts_matrix_file in ts_matrix_list)

In [None]:
filenames = []
abstracts = []
for i in results:
    abstracts.append(i[0])
    filenames.append(i[1])
results_df = pd.DataFrame({'filename': filenames, 'abstracts': abstracts})

In [None]:
results_df.to_pickle('/home/joao/Thesis/FGB/results.p')

In [None]:
results_a = pd.read_pickle('/home/joao/Thesis/FGB/results.p')

In [None]:
this_file = ts_matrix_list[11]
sentence_bank_file = '/home/joao/Thesis/sentence_bank/' + this_file.split(
    '/')[-1]
A, B = summarize_FGB(ts_matrix_list[11])
B_df = pd.DataFrame(B)
sentence_bank = pd.read_pickle(sentence_bank_file)

In [None]:
B_df.loc[:, B_df.sum() == 0].notnull().sum()

In [None]:
sentence_bank.sentence[[295, 301]]

In [None]:
pd.read_pickle(this_file).dropna(how='all').shape

In [None]:
results_a

# Execution time : 3 minutes

# Applying to the test dataset:

In [14]:
import pandas as pd
import numpy as np
import glob
from scipy.special import digamma, gamma
from numpy.random import dirichlet
import glob
from tqdm import tqdm
from joblib import Parallel, delayed


# wrapping it all into a function:
# we then define the update equations:
def update_U(A, B, U, V, alfa_u):
    C = A / np.matmul(np.matmul(B, U), V.transpose())
    new_U = np.multiply(U, np.matmul(np.matmul(B.transpose(), C), V)) + alfa_u
    return new_U / new_U.sum(axis=0)


def update_V(A, B, U, V, alfa_v):
    C = A / np.matmul(np.matmul(B, U), V.transpose())
    new_V = np.multiply(V, np.matmul(np.matmul(C.transpose(), B), U)) + alfa_v
    return new_V / new_V.sum(axis=1)[:, np.newaxis]


def summarize_FGB_test(filename):
    # we start by loading an example term_sentence matrix
    this_file = filename

    term_sentence_document_matrix = pd.read_pickle(this_file)

    #from it we create the term_document matrix and the term-sentence matrix
    B = term_sentence_document_matrix.copy()
    B_keys = B[['this_file_name', 'sentence_order', 'word_count']]
    B.drop(
        columns=['this_file_name', 'sentence_order', 'word_count'],
        inplace=True)
    B = B.fillna(0).as_matrix().transpose()
    B = B / B.sum(axis=0)

    # and from the term_sentence_document_matrix we create the term_document_matrix
    A = term_sentence_document_matrix.fillna(0).drop(
        columns=['sentence_order', 'word_count']).groupby(
            'this_file_name').sum().as_matrix().transpose()
    #     return A,B
    # I'll allow 15 topics to be present at any time, so
    K = 1
    alfa_v = np.random.rand(A.shape[1], K)
    alfa_u = np.random.rand(B.shape[1], K)

    alfa_v[:, :] = alfa_v[0, 0]
    alfa_u[:, :] = alfa_v[0, 0] * K / B.shape[1]
    # we must then start by these matrices randomly UsxK: and VdxK
    U = np.random.rand(B.shape[1], K)
    V = np.random.rand(A.shape[1], K)
    # normalizing each column of U and V
    U = U / U.sum(axis=0)

    V = V / V.sum(axis=1)[:, np.newaxis]

    diff_u = 1000
    U_old = U
    it = 0
    while (diff_u > 0.00001):
        #         if(it > 4):
        #             break
        U = update_U(A, B, U, V, alfa_u)
        V = update_V(A, B, U, V, alfa_v)
        if (it % 50 == 0):
            diff_u = np.abs(U_old - U).sum()
            U_old = U
        it += 1
    print(filename, it, diff_u)
    # loading all raw sentences
    sentence_bank_file = '/home/joao/Thesis/test_set/sentence_banks/' + this_file.split(
        '/')[-1]
    sentences = pd.read_pickle(sentence_bank_file)
    # sentence selection process:
    # most likely topic to represent a document:
    probabilities = pd.Series(V.flatten())
    probabilities = probabilities.sort_values(ascending=False)
    topics = []
    for i in probabilities.unique():
        topics.extend(np.where(V == i)[1])
    U_df = pd.DataFrame(U)
    selected_sentences_index = []
    selected_sentences = []
    total_words = 0
    for topic in topics:
        subset = U_df.iloc[:, topic].copy()
        subset.sort_values(ascending=False, inplace=True)
        subset = subset[~subset.index.isin(selected_sentences_index)]
        selected_sentence = sentences.sentence[subset.index[0]]
        total_words += len(selected_sentence.split(' '))
        if (total_words < 250):
            selected_sentences_index.append(subset.index[0])
            selected_sentences.append(selected_sentence)
        else:
            break
    abstract = ''
    for i in selected_sentences:
        abstract += i
    return abstract, filename.split('/')[-1]

In [15]:
ts_matrix_list = sorted(
    glob.glob('/home/joao/Thesis/test_set/ts_matrices/*.p'))

In [None]:
%%time
summaries = []
files = []
results = []
results = Parallel(
    n_jobs=8, verbose=11)(delayed(summarize_FGB_test)(ts_matrix_file)
                          for ts_matrix_file in ts_matrix_list)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:    0.4s


# Execution time : 5 minutes 10 seconds

In [11]:
filenames = []
abstracts = []
for i in results:
    abstracts.append(i[0])
    print(i[0])
    filenames.append(i[1])
results_df = pd.DataFrame({'filename': filenames, 'abstracts': abstracts})

Several factors induce the difference: for example, it takes some time for equipments to reach high power, some areas of formation are not completely saturated, micro-fractures in formation exist and there are micro-gaps between cement casing and wellbore.When the pumping pressure exceeds the strength of the formation rock, fractures are induced and propagated into the formation, and then the propping agent is pumped into the fractures to keep them from closing after pumping pressure is released.Fluid-solid coupling elements were used to describe the behavior of formation stress-seepage flow coupling; pore pressure cohesive elements based on damage mechanics were employed to simulate the process of fracture initiation and propagation.However in the model plan, strain assumption was made, the length of the fracture was pointed to be a fixed value and the loading applied on the fracture walls distributed force but not fluid pressure.Fluid-solid coupling elements are used to describe the 

In [None]:
results_df.to_pickle('/home/joao/Thesis/FGB/test_results.p')

# Final Scoring


In [12]:
from rouge import Rouge, FilesRouge
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np
from unidecode import unidecode


def remove_non_ascii(text):
    """This function removes all non-ascii characters from text and 
    replaces them with their closest ascii representation"""
    return unidecode(unicode(text, encoding="utf-8"))


# we then load all summaries and candidate summaries:

scores = []
r1 = []
r2 = []
rl = []
for i in tqdm(results_df.index):
    ground_truth = '/home/joao/Thesis/test_set/abstracts/ground_truths/' + results_df.loc[
        i, 'filename'][:-2] + '.txt'
    rouge = Rouge()
    with open(ground_truth, 'rb') as f:
        ground_truth = f.read()
    ground_truth = remove_non_ascii(ground_truth)
    tmp_scores = rouge.get_scores(
        results_df.loc[i, 'abstracts'], ground_truth, avg=True)
    r2.append(tmp_scores['rouge-2']['f'])
    r1.append(tmp_scores['rouge-1']['f'])
    rl.append(tmp_scores['rouge-l']['f'])

100%|██████████| 69/69 [00:02<00:00, 26.03it/s]


In [13]:
print('r1', np.mean(r1), np.std(r1, ddof=1))
print('r2', np.mean(r2), np.std(r2, ddof=1))
print('rl', np.mean(rl), np.std(rl, ddof=1))

('r1', 0.29537356448505414, 0.06992150949842404)
('r2', 0.0895225750913528, 0.05255140502428644)
('rl', 0.24191560532018608, 0.06727625101821061)


('r1', 0.30312526276606727, 0.07163665233160144)


('r2', 0.09337161516075211, 0.05010419600100166)


('rl', 0.2496758504003376, 0.06976648365116518)
