In [22]:
from sklearn.datasets import load_files
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd
import math

In [21]:
new_articles = pd.read_csv('news_articles.csv')
new_articles.drop('Unnamed: 0', axis = 1, inplace = True)
print(new_articles.head())

articles = list(new_articles.document)
len(articles)

                                            document
0  Cambodian leader Hun Sen on Friday rejected op...
1  King Norodom Sihanouk has declined requests to...
2  Cambodia 's two-party opposition asked the Asi...
3  Cambodia 's ruling party responded Tuesday to ...
4  Cambodia 's leading opposition party ruled out...


10

In [8]:
sentences = []
for art in new_articles['document']:
    sentences.append(sent_tokenize(art))
sentences = [y for x in sentences for y in x]

In [14]:
def tokenize_docs(articles):
    sentences = []
    for art in articles['document']:
        sentences.append(sent_tokenize(art))
    sentences = [y for x in sentences for y in x]
    return sentences

def frequency_matrix(sentences):
    '''Takes in a list of sentences.
    
    Tokenizing text by word, counting words, and returning a frequency table.'''
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    
    freq_matrix = {}
    for sent in sentences:
        freqtable = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = stemmer.stem(word)
            if word not in stop_words:
                if word in freqtable:
                    freqtable[word] += 1
                else:
                    freqtable[word] = 1
        freq_matrix[sent] = freqtable
   # print(freqtable.items())
    return freq_matrix

freq_mat = frequency_matrix(tokenize_docs(new_articles))

In [17]:
def tf_matrix(freq_matrix):
    tf_matrix = {}
    for sent, ftable in freq_matrix.items():
        tf_table = {}
        sent_length = len(ftable)
        for word, wcount in ftable.items():
            tf_table[word] = wcount / sent_length
        tf_matrix[sent] = tf_table
    return tf_matrix

tf_mat = tf_matrix(freq_mat)

In [20]:
def doc_per_word_table(freq_matrix):
    '''
    Takes in frequency matrix.
    Returns table of words and number of documents they each appear in.
    '''
    dpw_table = {}
    for sent, ftable in freq_matrix.items():
        for word, wcount in ftable.items():
            if word in dpw_table:
                dpw_table[word] += 1
            else:
                dpw_table[word] = 1
    return dpw_table
dpw = doc_per_word_table(freq_mat)

In [38]:
def idf_matrix(freq_matrix, dpw_table, num_docs):
    ''' Takes in frequency matrix, doc per word table, and number of documents to be summarized.
    Returns IDF matrix.'''
    idf_matrix = {}
    for sent, ftable in freq_matrix.items():
        idf_table = {}
        for word in ftable.keys():
            idf_table[word] = math.log(num_docs/dpw_table[word])
        idf_matrix[sent] = idf_table
    
    return idf_matrix
idf_mat = idf_matrix(freq_mat, dpw, len(articles))

In [39]:
def tf_idf_matrix(tf_matrix, idf_matrix):
    '''
    Multiples two matrices and returns a matrix with tf-idf values.
    '''
    tf_idf_matrix = {}
    for (sent1, ftable1), (sent2, ftable2) in zip(tf_matrix.items(), idf_matrix.items()):
        tf_idf_table = {}
        for (word1, freq1), (word2, freq2) in zip(ftable1.items(), ftable2.items()):
            tf_idf_table[word1] = freq1 * freq2
        tf_idf_matrix[sent1] = tf_idf_table
    return tf_idf_matrix
tfidf_mat = tf_idf_matrix(tf_mat, idf_mat)

In [43]:
def sentence_score(tf_idf_matrix):
    '''Creates a dictionary of scores for each sentence.'''
    sent_scores = {}

    for sent, ftable in tf_idf_matrix.items():
        score = 0
        sent_num_words = len(sent)
        for word, value in ftable.items():
            score += value
        sent_scores[sent] = score / sent_num_words
    return sent_scores
sentscores = sentence_score(tfidf_mat)

In [52]:
def summary(sent_scores):
    '''Calculate average sentence score and print sentences at least 1.3 times greater than the average'''
    sum_scores = 0
    for sentence in sent_scores:
        sum_scores += sent_scores[sentence]
    average_score = sum_scores / len(sent_scores)
    #print(average_score)
    
    summary = ''
    for sentence in sent_scores:
        if sentence in sent_scores and sent_scores[sentence] > 1.3 * (.01 + average_score):
            summary += " " + sentence
    return summary

summary = summary(sentscores)
print(summary)

 They have demanded a thorough investigation into their election complaints as a precondition for their cooperation in getting the national assembly moving and a new government formed . ` ` But the ruling party refuses to negotiate unless it is able to threaten its negotiating partners with arrest or worse .
' ` ` Only those who want to prolong the anarchy and instability prevent efforts to set up a new government,' ' Hun Sen said in a televised speech marking the anniversary of the 1991 Paris Peace Accords . International monitors said the election was relatively free and fair . Negotiations so far have proved fruitless except for the opening of parliament after a Sept. 22 summit led by the king . Sihanouk is reportedly set to fly to Beijing next month to receive medical treatment from his Chinese doctors . The 75-year-old monarch suffers from a variety ailments and periodically makes extended trips to Beijing . He was diagnosed with colon cancer in 1993 , but it has since gone into r