In [None]:
import pandas as pd
import numpy as np
import glob
from scipy.special import digamma, gamma
from scipy.stats import dirichlet
import glob
from tqdm import tqdm
from joblib import Parallel, delayed

# Testing on the training set

In [None]:
# we then begin the iterative procedure:
def eq_3(U,B,V,Y):
    """Apply one multiplicative update to U and re-normalise its columns.

    The product ``B.U.V^T`` is compared element-wise with ``Y`` (as the ratio
    ``Y / (B.U.V^T)``); that ratio is projected back through ``B^T`` and ``V``
    and multiplied element-wise into the current ``U``.  Every column of the
    result is then divided by its own sum, so each column of the returned
    matrix adds up to 1.

    NOTE(review): the name suggests this implements equation (3) of the method
    being reproduced - confirm against the reference.
    """
    reconstruction = np.matmul(np.matmul(B, U), V.T)
    ratio = Y / reconstruction
    unnormalised = np.dot(np.dot(B.T, ratio), V) * U
    column_totals = unnormalised.sum(axis = 0)
    return unnormalised / column_totals
def eq_4(U,B,V,Y,alfa):
    """Apply one update to V and pass it through the Dirichlet adjustment.

    The transposed element-wise ratio ``(Y / (B.U.V^T))^T`` is multiplied by
    ``B.U`` and then element-wise by the current ``V``; the resulting matrix is
    handed to ``dirichlet_adjustment`` together with ``alfa``, whose output is
    returned as the new V.

    NOTE(review): the name suggests this implements equation (4) of the method
    being reproduced - confirm against the reference.
    """
    # B.U is needed both for the reconstruction and for the projection below.
    BU = np.matmul(B, U)
    ratio_t = (Y / np.matmul(BU, V.T)).T
    X = np.dot(ratio_t, BU) * V
    return dirichlet_adjustment(X, alfa)
def dirichlet_adjustment(X,alfa):
    """Turn the matrix X into row-normalised weights using digamma functions.

    ``alfa`` (a 1-D array with one entry per column of X) is added to every row
    of X.  Each entry then becomes ``exp(digamma(entry) - digamma(row total))``,
    and finally every row is divided by its sum so that it adds up to 1.
    """
    # Broadcasting adds alfa to each row without materialising a repeated copy.
    shifted = X + alfa
    row_totals = shifted.sum(axis = 1, keepdims = True)
    weights = np.exp(digamma(shifted) - digamma(row_totals))
    return weights / weights.sum(axis = 1, keepdims = True)

def get_B(alfa):
    """Return prod(gamma(alfa_k)) / gamma(sum(alfa_k)) over the entries of ``alfa``."""
    gamma_of_each = gamma(alfa)
    gamma_of_total = gamma(np.sum(alfa))
    return np.prod(gamma_of_each) / gamma_of_total


In [None]:
# Paths of all pickled matrices (*.p) in the ts_matrices_original folder, in sorted order.
ts_matrix_list = glob.glob('/home/joao/Thesis/ts_matrices_original/*.p')
ts_matrix_list.sort()

def get_u_v_matrices(this_file, sentence_bank_dir = '/home/joao/Thesis/sentence_bank/',
                     max_words = 400, n_topics = 20, tol = 0.001, max_iter = None,
                     seed = None):
    """Fit the U/V factorisation for one pickled matrix and build an extractive summary.

    Parameters
    ----------
    this_file : str
        Path of a pickled DataFrame holding the key columns 'this_file_name',
        'sentence_order' and 'word_count'; every other column is treated as a
        term column (one row per sentence, per the original comments).
    sentence_bank_dir : str
        Directory holding a pickle with the same file name as ``this_file``;
        that pickle must expose a ``sentence`` column.
    max_words : int
        Word budget of the summary; selection stops at the first sentence that
        would bring the running total to ``max_words`` or more.
    n_topics : int
        Number of columns (topics) of U and V.
    tol : float
        The iteration stops once the largest row-wise absolute change of U
        between two consecutive checkpoints (taken every 100 iterations) is
        not greater than ``tol``.
    max_iter : int or None
        Optional hard cap on the number of iterations; ``None`` (default)
        keeps iterating until the tolerance is met.
    seed : int or None
        When given, a private ``np.random.RandomState(seed)`` drives the random
        initialisation; ``None`` (default) uses numpy's global generator.

    Returns
    -------
    tuple
        ``(abstract, file_name)``: the selected sentences concatenated in
        selection order, and the base name of ``this_file``.
    """
    file_name = this_file.split('/')[-1]

    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    term_sentence_document_matrix = pd.read_pickle(this_file)

    # B: terms in rows, sentences in columns, every column scaled to sum to 1.
    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
    key_columns = ['this_file_name','sentence_order','word_count']
    B = term_sentence_document_matrix.drop(columns = key_columns).fillna(0).values.transpose()
    B = B/B.sum(axis = 0)
    # Y: term totals per document (one column per distinct 'this_file_name').
    Y = (term_sentence_document_matrix.fillna(0)
         .drop(columns = ['sentence_order','word_count'])
         .groupby('this_file_name').sum().values.transpose())

    # Random start: U is (sentences x topics) with columns summing to 1,
    # V is (documents x topics) with rows summing to 1.
    rng = np.random if seed is None else np.random.RandomState(seed)
    U = rng.rand(B.shape[1],n_topics)
    V = rng.rand(Y.shape[1],n_topics)
    U = U/U.sum(axis = 0)
    V = V/V.sum(axis = 1)[:,np.newaxis]
    # The Dirichlet parameter is not estimated here; it is drawn at random as well.
    alfa = rng.rand(n_topics)

    # Alternate the two updates; convergence is only checked every 100 iterations
    # by comparing U with its value at the previous checkpoint.
    check_every = 100
    drift = np.inf
    U_checkpoint = None
    it = 0
    while drift > tol and (max_iter is None or it < max_iter):
        it += 1
        U = eq_3(U,B,V,Y)
        V = eq_4(U,B,V,Y,alfa)
        if it % check_every == 0:
            if U_checkpoint is not None:
                drift = np.max(np.abs(U_checkpoint - U).sum(axis = 1))
            U_checkpoint = U
        if it % 2000 == 0:
            print(it,drift)

    if not sentence_bank_dir.endswith('/'):
        sentence_bank_dir += '/'
    sentences = pd.read_pickle(sentence_bank_dir + file_name)

    # Rank topics by their weight in V, from the largest entry to the smallest;
    # for every distinct weight, collect the column (topic) indices holding it.
    probabilities = pd.Series(V.flatten()).sort_values(ascending = False)
    topics = []
    for weight in probabilities.unique():
        topics.extend(np.where(V == weight)[1])

    # For each topic in that order, take its highest-scoring sentence that has
    # not been used yet, until the word budget is exhausted.
    U_df = pd.DataFrame(U)
    selected_sentences_index = []
    selected_sentences = []
    total_words = 0
    for topic in topics:
        ranking = U_df.iloc[:,topic].sort_values(ascending = False)
        ranking = ranking[~ranking.index.isin(selected_sentences_index)]
        if ranking.empty:
            # Every sentence is already in the summary (fewer sentences than
            # topic slots); the original code raised IndexError here.
            break
        best = ranking.index[0]
        selected_sentence = sentences.sentence[best]
        total_words += len(selected_sentence.split(' '))
        if total_words >= max_words:
            break
        selected_sentences_index.append(best)
        selected_sentences.append(selected_sentence)

    # NOTE(review): sentences are joined without a separator, as in the original;
    # confirm the sentence bank keeps trailing whitespace/punctuation.
    abstract = ''.join(selected_sentences)
    return abstract,file_name


In [None]:
%%time
# NOTE(review): `results` is not read by any cell shown in this notebook; the
# summaries are collected in `result` below.
results = list()

# Run get_u_v_matrices on every path in ts_matrix_list, one joblib task per
# file; n_jobs = -1 asks joblib to use all available cores.
result = Parallel(n_jobs = -1, verbose = 11)(
    delayed(get_u_v_matrices)(matrix_path) for matrix_path in ts_matrix_list
)


CPU times: user 281 ms, sys: 42.6 ms, total: 324 ms
Wall time: 22min 1s


In [None]:
# Every entry of `result` is the (abstract, file name) pair returned by
# get_u_v_matrices; split the pairs into two parallel lists and tabulate them.
abstracts = [pair[0] for pair in result]
filenames = [pair[1] for pair in result]
results_df = pd.DataFrame({'filename':filenames,'abstracts':abstracts})

In [None]:
# Display the table of generated abstracts and their source file names.
results_df

In [None]:
# Save the table of generated abstracts to disk as a pickle.
results_df.to_pickle('/home/joao/Thesis/BSTM/results.p')

# Adapting it and running for the test set

In [None]:
import pandas as pd
import numpy as np
import glob
from scipy.special import digamma, gamma
from scipy.stats import dirichlet
import glob
from tqdm import tqdm
from joblib import Parallel, delayed

In [None]:
# we then begin the iterative procedure:
def eq_3(U,B,V,Y):
    """Apply one multiplicative update to U and re-normalise its columns.

    The product ``B.U.V^T`` is compared element-wise with ``Y`` (as the ratio
    ``Y / (B.U.V^T)``); that ratio is projected back through ``B^T`` and ``V``
    and multiplied element-wise into the current ``U``.  Every column of the
    result is then divided by its own sum, so each column of the returned
    matrix adds up to 1.

    NOTE(review): the name suggests this implements equation (3) of the method
    being reproduced - confirm against the reference.
    """
    reconstruction = np.matmul(np.matmul(B, U), V.T)
    ratio = Y / reconstruction
    unnormalised = np.dot(np.dot(B.T, ratio), V) * U
    column_totals = unnormalised.sum(axis = 0)
    return unnormalised / column_totals
def eq_4(U,B,V,Y,alfa):
    """Apply one update to V and pass it through the Dirichlet adjustment.

    The transposed element-wise ratio ``(Y / (B.U.V^T))^T`` is multiplied by
    ``B.U`` and then element-wise by the current ``V``; the resulting matrix is
    handed to ``dirichlet_adjustment`` together with ``alfa``, whose output is
    returned as the new V.

    NOTE(review): the name suggests this implements equation (4) of the method
    being reproduced - confirm against the reference.
    """
    # B.U is needed both for the reconstruction and for the projection below.
    BU = np.matmul(B, U)
    ratio_t = (Y / np.matmul(BU, V.T)).T
    X = np.dot(ratio_t, BU) * V
    return dirichlet_adjustment(X, alfa)
def dirichlet_adjustment(X,alfa):
    """Turn the matrix X into row-normalised weights using digamma functions.

    ``alfa`` (a 1-D array with one entry per column of X) is added to every row
    of X.  Each entry then becomes ``exp(digamma(entry) - digamma(row total))``,
    and finally every row is divided by its sum so that it adds up to 1.
    """
    # Broadcasting adds alfa to each row without materialising a repeated copy.
    shifted = X + alfa
    row_totals = shifted.sum(axis = 1, keepdims = True)
    weights = np.exp(digamma(shifted) - digamma(row_totals))
    return weights / weights.sum(axis = 1, keepdims = True)

def get_B(alfa):
    """Return prod(gamma(alfa_k)) / gamma(sum(alfa_k)) over the entries of ``alfa``."""
    gamma_of_each = gamma(alfa)
    gamma_of_total = gamma(np.sum(alfa))
    return np.prod(gamma_of_each) / gamma_of_total
    

In [None]:
# Paths of all pickled matrices (*.p) in the test_set/ts_matrices folder, in sorted order.
ts_matrix_list = glob.glob('/home/joao/Thesis/test_set/ts_matrices/*.p')
ts_matrix_list.sort()

def get_u_v_matrices(this_file, sentence_bank_dir = '/home/joao/Thesis/test_set/sentence_banks/',
                     max_words = 250, n_topics = 20, tol = 0.001, max_iter = None,
                     seed = None):
    """Fit the U/V factorisation for one pickled matrix and build an extractive summary.

    Parameters
    ----------
    this_file : str
        Path of a pickled DataFrame holding the key columns 'this_file_name',
        'sentence_order' and 'word_count'; every other column is treated as a
        term column (one row per sentence, per the original comments).
    sentence_bank_dir : str
        Directory holding a pickle with the same file name as ``this_file``;
        that pickle must expose a ``sentence`` column.
    max_words : int
        Word budget of the summary; selection stops at the first sentence that
        would bring the running total to ``max_words`` or more.
    n_topics : int
        Number of columns (topics) of U and V.
    tol : float
        The iteration stops once the largest row-wise absolute change of U
        between two consecutive checkpoints (taken every 100 iterations) is
        not greater than ``tol``.
    max_iter : int or None
        Optional hard cap on the number of iterations; ``None`` (default)
        keeps iterating until the tolerance is met.
    seed : int or None
        When given, a private ``np.random.RandomState(seed)`` drives the random
        initialisation; ``None`` (default) uses numpy's global generator.

    Returns
    -------
    tuple
        ``(abstract, file_name)``: the selected sentences concatenated in
        selection order, and the base name of ``this_file``.
    """
    file_name = this_file.split('/')[-1]

    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    term_sentence_document_matrix = pd.read_pickle(this_file)

    # B: terms in rows, sentences in columns, every column scaled to sum to 1.
    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
    key_columns = ['this_file_name','sentence_order','word_count']
    B = term_sentence_document_matrix.drop(columns = key_columns).fillna(0).values.transpose()
    B = B/B.sum(axis = 0)
    # Y: term totals per document (one column per distinct 'this_file_name').
    Y = (term_sentence_document_matrix.fillna(0)
         .drop(columns = ['sentence_order','word_count'])
         .groupby('this_file_name').sum().values.transpose())

    # Random start: U is (sentences x topics) with columns summing to 1,
    # V is (documents x topics) with rows summing to 1.
    rng = np.random if seed is None else np.random.RandomState(seed)
    U = rng.rand(B.shape[1],n_topics)
    V = rng.rand(Y.shape[1],n_topics)
    U = U/U.sum(axis = 0)
    V = V/V.sum(axis = 1)[:,np.newaxis]
    # The Dirichlet parameter is not estimated here; it is drawn at random as well.
    alfa = rng.rand(n_topics)

    # Alternate the two updates; convergence is only checked every 100 iterations
    # by comparing U with its value at the previous checkpoint.
    check_every = 100
    drift = np.inf
    U_checkpoint = None
    it = 0
    while drift > tol and (max_iter is None or it < max_iter):
        it += 1
        U = eq_3(U,B,V,Y)
        V = eq_4(U,B,V,Y,alfa)
        if it % check_every == 0:
            if U_checkpoint is not None:
                drift = np.max(np.abs(U_checkpoint - U).sum(axis = 1))
            U_checkpoint = U
        if it % 2000 == 0:
            print(it,drift)

    if not sentence_bank_dir.endswith('/'):
        sentence_bank_dir += '/'
    sentences = pd.read_pickle(sentence_bank_dir + file_name)

    # Rank topics by their weight in V, from the largest entry to the smallest;
    # for every distinct weight, collect the column (topic) indices holding it.
    probabilities = pd.Series(V.flatten()).sort_values(ascending = False)
    topics = []
    for weight in probabilities.unique():
        topics.extend(np.where(V == weight)[1])

    # For each topic in that order, take its highest-scoring sentence that has
    # not been used yet, until the word budget is exhausted.
    U_df = pd.DataFrame(U)
    selected_sentences_index = []
    selected_sentences = []
    total_words = 0
    for topic in topics:
        ranking = U_df.iloc[:,topic].sort_values(ascending = False)
        ranking = ranking[~ranking.index.isin(selected_sentences_index)]
        if ranking.empty:
            # Every sentence is already in the summary (fewer sentences than
            # topic slots); the original code raised IndexError here.
            break
        best = ranking.index[0]
        selected_sentence = sentences.sentence[best]
        total_words += len(selected_sentence.split(' '))
        if total_words >= max_words:
            break
        selected_sentences_index.append(best)
        selected_sentences.append(selected_sentence)

    # NOTE(review): sentences are joined without a separator, as in the original;
    # confirm the sentence bank keeps trailing whitespace/punctuation.
    abstract = ''.join(selected_sentences)
    return abstract,file_name

In [None]:
# Paths of all pickled matrices (*.p) in the test_set/ts_matrices folder, in sorted order.
ts_matrix_list = glob.glob('/home/joao/Thesis/test_set/ts_matrices/*.p')
ts_matrix_list.sort()

# Run get_u_v_matrices on every path, one joblib task per file; n_jobs = -1
# asks joblib to use all available cores.
result = Parallel(n_jobs = -1, verbose = 11)(
    delayed(get_u_v_matrices)(matrix_path) for matrix_path in ts_matrix_list
)

# Execution time: 2 minutes 42 seconds

In [None]:
# Every entry of `result` is the (abstract, file name) pair returned by
# get_u_v_matrices; split the pairs into two parallel lists and tabulate them.
abstracts = [pair[0] for pair in result]
filenames = [pair[1] for pair in result]
results_df = pd.DataFrame({'filename':filenames,'abstracts':abstracts})

In [None]:
# Print every generated abstract followed by blank lines, for a quick visual check.
# print(x) with a single argument behaves the same on Python 2 and Python 3,
# whereas the former `print i` statement is a syntax error on Python 3.
for abstract in results_df.abstracts:
    print(abstract)
    print('\n\n\n')

In [None]:
# Save the table of generated abstracts to disk as a pickle.
results_df.to_pickle('/home/joao/Thesis/BSTM/test_results.p')

# Scoring:

In [None]:
from rouge import Rouge,FilesRouge
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np
from unidecode import unidecode

def remove_non_ascii(text):
    """Replace every non-ASCII character of ``text`` with its closest ASCII representation.

    ``text`` may be a UTF-8 encoded byte string (it is decoded first) or an
    already-decoded text string (passed to ``unidecode`` unchanged).  The
    previous ``unicode(text, encoding="utf-8")`` call only exists on Python 2
    and raised TypeError there when ``text`` was already decoded.
    """
    # `bytes` is an alias of `str` on Python 2, so this also covers Python 2 byte strings.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    return unidecode(text)
# Score every generated abstract against its ground-truth abstract with ROUGE.

# NOTE(review): total_scores and scores are never filled by the code shown here.
total_scores = []
scores = []
# Per-document F-scores for ROUGE-1, ROUGE-2 and ROUGE-L.
r1 = []
r2 = []
rl = []

# The scorer does not depend on the document, so build it once instead of once per iteration.
rouge = Rouge()
for i in tqdm(results_df.index):
    # File names end in '.p' (see the glob pattern); drop those two characters
    # to obtain the base name of the matching ground-truth text file.
    ground_truth_path = '/home/joao/Thesis/test_set/abstracts/ground_truths/'+ results_df.loc[i,'filename'][:-2]+'.txt'
    with open(ground_truth_path,'rb') as f:
        ground_truth = remove_non_ascii(f.read())
    tmp_scores = rouge.get_scores(results_df.loc[i,'abstracts'],ground_truth, avg = True)
    r1.append(tmp_scores['rouge-1']['f'])
    r2.append(tmp_scores['rouge-2']['f'])
    rl.append(tmp_scores['rouge-l']['f'])


In [None]:
# Report the mean and the sample standard deviation (ddof = 1) of each ROUGE F-score list.
for label, values in (('r1', r1), ('r2', r2), ('rl', rl)):
    print(label,np.mean(values),np.std(values,ddof = 1))

('r1', 0.3095457283893854, 0.064077106163413)

('r2', 0.0969094899709795, 0.052156640692281465)

('rl', 0.25649784851398894, 0.06272087587320421)
