# Creating TS Matrices

In [None]:
import pandas as pd
import re
import nltk.data
from nltk.probability import FreqDist
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import glob
import tqdm
from bs4 import BeautifulSoup
from joblib import Parallel,delayed
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')


In [None]:
def get_ts_matrix(f,filename):
    """Build the term-sentence (TS) matrix for one DUC2002-format document.

    Args:
        f: An open file (or raw markup) in the DUC2002 format; it is handed
            straight to BeautifulSoup.
        filename: Path of the document. Only its last '/'-separated component
            is kept, and it labels the rows of both returned frames.

    Returns:
        A ``(ts_matrix, sentences_df)`` tuple. ``ts_matrix`` has one row per
        kept sentence and one column per non-stopword token holding that
        token's count in the sentence, plus the ``this_file_name`` and
        ``sentence_order`` columns. ``sentences_df`` holds the kept sentences
        in the columns ``sentence``, ``filename`` and ``sentence_order``.

    Raises:
        AttributeError: If the document has no ``<text>`` element.
    """
    soup = BeautifulSoup(f,'html.parser')
    text = soup.find('text').text.replace('\n',' ')
    this_file = filename.split("/")[-1]

    # Split the text into sentences and tokenize each one, keeping only the
    # sentences that contain at least one alphabetic character.
    tokenizer = WordPunctTokenizer()
    sentences = []
    tokenized = []
    for sentence in sent_detector.tokenize(text):
        if any(c.isalpha() for c in sentence):
            stripped = sentence.strip()
            sentences.append(stripped)
            tokenized.append(tokenizer.tokenize(stripped))

    # The sentence column is named up front: building the frame positionally
    # and renaming three columns afterwards fails with a length mismatch when
    # no sentence was kept (the frame then only has the two added columns).
    sentences_df = pd.DataFrame({'sentence': sentences})
    sentences_df['filename'] = this_file
    sentences_df['sentence_order'] = sentences_df.index.values

    # An empty frame carrying the full sorted vocabulary goes first so the
    # concatenated matrix ends up with one column per distinct token.
    vocabulary = sorted({token for sentence_tokens in tokenized
                         for token in sentence_tokens})
    accumulator = [pd.DataFrame(columns = vocabulary)]
    for i, sentence_tokens in enumerate(tokenized):
        fdist = FreqDist(sentence_tokens)
        counts = pd.DataFrame.from_dict(dict(fdist), orient = 'index').reset_index()
        counts.columns = ['word','count']
        # Pivot the (word, count) pairs into a single row: one column per word.
        row = pd.pivot_table(counts, columns = 'word')
        row.index = [i]
        accumulator.append(row)
    ts_matrix = pd.concat(accumulator, ignore_index = True)

    # Drop the stopword columns (compared case-insensitively).
    stop_words = set(stopwords.words('english'))
    kept_columns = ts_matrix.columns[~ts_matrix.columns.str.lower().isin(stop_words)]
    ts_matrix = ts_matrix[kept_columns]

    # Tag every row with its source file and its position in that file.
    ts_matrix['this_file_name'] = this_file
    ts_matrix['sentence_order'] = ts_matrix.index.values

    # resulting input matrix: Term-sentence
    return(ts_matrix,sentences_df)

In [None]:
# Every entry directly under the DUC2002 docs/ folder, in sorted order; each
# one is later passed to generate_ts_matrices_for_all_files as a directory.
directories_list = sorted(glob.glob('/home/joao/Thesis/DUC2002/DUC2002_Summarization_Documents/docs/*'))

def generate_ts_matrices_for_all_files(directory):
    """Build and pickle the combined TS matrix and sentence bank of a directory.

    Every file directly under ``directory`` is run through ``get_ts_matrix``.
    The per-file matrices are stacked, restricted to purely alphabetic term
    columns plus the two bookkeeping columns, and rows whose remaining term
    counts sum to zero are removed from both the matrix and the sentence bank.
    A ``word_count`` column (space-separated pieces of each kept sentence) is
    added to the matrix before both frames are pickled.

    Args:
        directory: Path of the directory whose files are processed; its last
            '/'-separated component names the output pickles.

    Returns:
        Always 0; the results are the two pickle files written to disk.
    """
    cluster_name = directory.split('/')[-1]
    meta_columns = ['this_file_name','sentence_order']

    per_file_matrices = []
    per_file_sentences = []
    for filename in sorted(glob.glob(directory + '/*')):
        with open(filename,'rb') as f:
            matrix, sentences = get_ts_matrix(f,filename)
        per_file_matrices.append(matrix)
        per_file_sentences.append(sentences)

    final_ts_matrix = pd.concat(per_file_matrices, ignore_index = True)
    # Keep only columns whose name is fully alphabetic; the bookkeeping
    # columns contain underscores, so they are re-appended explicitly.
    alphabetic_terms = list(final_ts_matrix.columns[final_ts_matrix.columns.str.isalpha()])
    final_ts_matrix = final_ts_matrix[alphabetic_terms + meta_columns]
    sentence_bank = pd.concat(per_file_sentences, ignore_index = True)

    # Rows left without any term count are dropped from both frames, which
    # share the same row labels at this point.
    term_totals = final_ts_matrix.drop(columns = meta_columns).fillna(0).sum(axis = 1)
    rows_to_drop = final_ts_matrix.index[term_totals == 0]
    sentence_bank = sentence_bank.drop(index = rows_to_drop).reset_index(drop = True)
    final_ts_matrix = final_ts_matrix.drop(index = rows_to_drop).reset_index(drop = True)

    final_ts_matrix['word_count'] = sentence_bank.sentence.str.split(' ').str.len()

    sentence_bank.to_pickle('/home/joao/Thesis/sentence_bank/'+cluster_name+'.p')
    # NOTE(review): the document matrix is written under titles/, while the
    # dataset-characterisation cell reads from ts_matrices_original/ --
    # confirm this output directory is the intended one.
    final_ts_matrix.to_pickle('/home/joao/Thesis/titles/'+cluster_name+'.p')
    return 0

In [None]:
# Process every directory in a separate joblib task, using all available cores.
results = Parallel(n_jobs = -1, verbose = 11)(
    delayed(generate_ts_matrices_for_all_files)(directory)
    for directory in directories_list
)

# Total Run time: 84 seconds

# Preprocessing the abstract ground truths

In [None]:
import pandas as pd
import numpy as np
import re 
import glob
from tqdm import tqdm

# Collect, for every abstract directory that contains a file matching '400*',
# the abstract text (with each markup tag replaced by the marker 'AAAAAA'),
# the directory name without its last character, and that last character
# upper-cased.
directory_lists = sorted(glob.glob('/home/joao/Thesis/DUC2002/extracts_abstracts/d*'))
summaries = []
files_list = []
people_list = []
for i in tqdm(directory_lists):
    files = glob.glob(i+'/400*')
    if not files:
        continue
    print(i)
    with open(files[0],'rb') as f:
        # Decode once at the I/O boundary: the str pattern below cannot be
        # applied to raw bytes on Python 3. Undecodable bytes are replaced
        # instead of aborting the whole run.
        a = f.read().decode('utf-8', errors='replace')

    # Every tag becomes a marker that is turned into a line break later on.
    d = re.sub('<[^>]+>', 'AAAAAA', a)
    directory_name = i.split('/')[-1]
    this_summary = directory_name[:-1]
    this_person = directory_name[-1].upper()
    this_file = '/home/joao/Thesis/simplified_abstracts/'+this_summary
    summaries.append(d)
    files_list.append(this_summary)
    people_list.append(this_person)
#         with open(this_file,'wb') as f:
#             f.write(d)
# we then save this file 

In [None]:
# One row per collected abstract.
summaries_df = pd.DataFrame({'summaries':summaries,'files':files_list,'person':people_list})

# File identifier: '<person>.<name without its first and last character>'.
summaries_df['files'] = summaries_df['person'] + '.' + summaries_df['files'].str[1:-1]

# Show one summary before the clean-up ...
print(summaries_df['summaries'][1])

# Clean-up pipeline, in order: strip leading/trailing 'A' characters (strip
# treats its argument as a set of characters), collapse double newlines, drop
# existing '\r\n' pairs, turn the remaining tag markers into '\r\n', trim
# surrounding whitespace and collapse doubled '\r\n'.
summaries_df['summaries'] = (
    summaries_df['summaries']
    .str.strip('AAAAAA')
    .str.replace('\n\n','\n')
    .str.replace('\r\n', '')
    .str.replace('AAAAAA','\r\n')
    .str.strip()
    .str.replace('\r\n\r\n','\r\n')
)

# ... and the same summary after it, separated by blank lines.
print('\r\n\r\n')
print(summaries_df['summaries'][1])

In [None]:
# Display the summary stored under index label 0.
print(summaries_df.summaries[0])

In [None]:
# Write every summary to its own 'abstract.<files>.txt' file.
for i in summaries_df.index:
    this_file = summaries_df.loc[i,:]
    this_filename = '/home/joao/Thesis/simplified_abstracts/abstract.' + this_file.files + '.txt'
    content = this_file.summaries
    if not isinstance(content, bytes):
        # The file is opened in binary mode, so text has to be encoded first;
        # writing a str to a 'wb' handle raises TypeError on Python 3.
        # Byte strings are written unchanged.
        content = u'{}'.format(content).encode('utf-8')
    with open(this_filename,'wb') as f:
        f.write(content)

In [None]:
# Eliminating the leading spaces and commas: start by listing every file
# under simplified_abstracts/.
a = glob.glob('/home/joao/Thesis/simplified_abstracts/*')

In [None]:
# Take the first path returned by the glob (glob order is not sorted here).
this_file = a[0]

In [None]:
# Load the raw bytes of the selected abstract for inspection.
with open(this_file,'rb') as f:
    this_abstract = f.read()

# We must then preprocess the titles as well:

In [None]:
import pandas as pd
import re
import nltk.data
from nltk.probability import FreqDist
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import glob
from tqdm import tqdm
from bs4 import BeautifulSoup
from joblib import Parallel,delayed
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')


def get_ts_matrix(f,filename):
    """Build the term-sentence (TS) matrix for the title of one DUC2002 document.

    The title text is taken from the first element found among ``<head>``,
    ``<hl>``, ``<headline>`` and ``<ti>``, tried in that order.

    Args:
        f: An open file (or raw markup) in the DUC2002 format; it is handed
            straight to BeautifulSoup.
        filename: Path of the document. Only its last '/'-separated component
            is kept, and it labels the rows of the returned matrix.

    Returns:
        The TS matrix of the title: one row per kept title sentence, one
        column per non-stopword token holding its count in that sentence,
        plus the ``this_file_name`` and ``sentence_order`` columns. An empty
        ``DataFrame`` is returned when none of the title elements exists.
    """
    a = BeautifulSoup(f,'html.parser')
    # NOTE(review): dumps the whole parsed document on every call; looks like
    # leftover debugging output -- confirm before removing.
    print(a.prettify())

    # Try the known title elements in priority order and stop at the first hit.
    title_tag = None
    for tag_name in ('head', 'hl', 'headline', 'ti'):
        title_tag = a.find(tag_name)
        if title_tag:
            break
    if not title_tag:
        print('\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n')
        print('no title in file')
        print('\n\n\n\n\n\n\n\n\n\\n\n\n\n\n\n')
        return pd.DataFrame()
    text = title_tag.text
    this_file = filename.split("/")[-1]

    # Split the title into sentences and tokenize each one, keeping only the
    # sentences that contain at least one alphabetic character.
    tokenizer = WordPunctTokenizer()
    tokenized = [tokenizer.tokenize(sentence.strip())
                 for sentence in sent_detector.tokenize(text)
                 if any(c.isalpha() for c in sentence)]

    # An empty frame carrying the full sorted vocabulary goes first so the
    # concatenated matrix ends up with one column per distinct token.
    vocabulary = sorted({token for sentence_tokens in tokenized
                         for token in sentence_tokens})
    accumulator = [pd.DataFrame(columns = vocabulary)]
    for i, sentence_tokens in enumerate(tokenized):
        fdist = FreqDist(sentence_tokens)
        counts = pd.DataFrame.from_dict(dict(fdist), orient = 'index').reset_index()
        counts.columns = ['word','count']
        # Pivot the (word, count) pairs into a single row: one column per word.
        row = pd.pivot_table(counts, columns = 'word')
        row.index = [i]
        accumulator.append(row)
    ts_matrix = pd.concat(accumulator, ignore_index = True)

    # Drop the stopword columns (compared case-insensitively).
    stop_words = set(stopwords.words('english'))
    ts_matrix = ts_matrix[ts_matrix.columns[~ts_matrix.columns.str.lower().isin(stop_words)]]

    # Tag every row with its source file and its position in the title.
    ts_matrix['this_file_name'] = this_file
    ts_matrix['sentence_order'] = ts_matrix.index.values

    # resulting input matrix: Term-sentence
    return(ts_matrix)



In [None]:
# Every entry directly under the DUC2002 docs/ folder, in sorted order.
directories_list = sorted(glob.glob('/home/joao/Thesis/DUC2002/DUC2002_Summarization_Documents/docs/*'))
def generate_ts_matrices_for_all_files(directory):
    """Build, pickle and return the combined title TS matrix of a directory.

    Every file directly under ``directory`` is run through ``get_ts_matrix``.
    The results are stacked, restricted to purely alphabetic term columns
    plus the two bookkeeping columns, and rows whose remaining term counts
    sum to zero are removed.

    Args:
        directory: Path of the directory whose files are processed; its last
            '/'-separated component names the output pickle.

    Returns:
        The combined matrix, which is also pickled under ``titles/``.
    """
    meta_columns = ['this_file_name','sentence_order']

    title_matrices = []
    for filename in sorted(glob.glob(directory + '/*')):
        with open(filename,'rb') as f:
            title_matrices.append(get_ts_matrix(f,filename))
    final_ts_matrix = pd.concat(title_matrices, ignore_index = True)

    # Keep only columns whose name is fully alphabetic; the bookkeeping
    # columns contain underscores, so they are re-appended explicitly.
    alphabetic_terms = list(final_ts_matrix.columns[final_ts_matrix.columns.str.isalpha()])
    final_ts_matrix = final_ts_matrix[alphabetic_terms + meta_columns]

    # Drop the rows that are left without any term count.
    term_totals = final_ts_matrix.drop(columns = meta_columns).fillna(0).sum(axis = 1)
    rows_to_drop = final_ts_matrix.index[term_totals == 0]
    final_ts_matrix = final_ts_matrix.drop(index = rows_to_drop).reset_index(drop = True)

    this_directory = directory.split('/')[-1]
    final_ts_matrix.to_pickle('/home/joao/Thesis/titles/'+this_directory+'.p')
    return final_ts_matrix

In [None]:
# Sequential run over every directory with a progress bar; the returned
# matrices are discarded, only the pickles written by the function remain.
for i in tqdm(directories_list):
    generate_ts_matrices_for_all_files(i)

In [None]:
# Same processing as a joblib fan-out: one task per directory, all cores.
results = Parallel(n_jobs = -1, verbose = 11)(
    delayed(generate_ts_matrices_for_all_files)(directory)
    for directory in directories_list
)

In [None]:
# Run the pipeline on the first directory only and keep its return value.
intermediate = generate_ts_matrices_for_all_files(directories_list[0])

In [None]:
# Parse one document and extract the text of its <head> element on one line.
# NOTE(review): intermediate is the return value of
# generate_ts_matrices_for_all_files, so intermediate[1] does not look like a
# file path -- presumably left over from an earlier version that returned
# file names; confirm before re-running this cell.
with open(intermediate[1],'rb') as f:
    a = BeautifulSoup(f,'html.parser')
    text = a.find('head').text.replace('\n',' ')

# Characterizing the dataset

In [None]:
import pandas as pd
import glob
import numpy as np
# Sorted paths of everything under ts_matrices_original/ whose name ends in 'p'.
filelist = sorted(glob.glob('/home/joao/Thesis/ts_matrices_original/*p'))

In [None]:
# Accumulators for the dataset statistics; total_words is declared but never
# filled by this cell.
total_words = []
avg_sentences = []
avg_word_sentence = []
avg_docs = []
avg_word_doc = []
for filename in filelist:
    tmp = pd.read_pickle(filename)
    # Sentences per document: number of word_count entries for each file name.
    avg_sentences.append(tmp.groupby('this_file_name')['word_count'].count())
    # Mean number of words per sentence over the whole pickle.
    avg_word_sentence.append(tmp['word_count'].mean())
    # Number of distinct documents in the pickle.
    avg_docs.append(len(tmp['this_file_name'].unique()))
    # Words per document: word_count summed for each file name.
    avg_word_doc.append(tmp.groupby('this_file_name')['word_count'].sum())
    # NOTE(review): this break stops after the first pickle, so the statistics
    # describe a single file only -- confirm whether that is intended.
    break

In [None]:
# Means, in order: sentences per document, words per sentence, documents per
# pickle and words per document.
print(np.mean(avg_sentences),np.mean(avg_word_sentence),np.mean(avg_docs),np.mean(avg_word_doc))

(30.833333333333332, 19.886486486486486, 6.0, 613.1666666666666)