# reading data from dataset

In [3]:
#  read files from cnn and dailymail and add them to a dictionary where key is file name and values are list of 
#  sentences in that file.
from os import listdir,getcwd
from os.path import isfile, join
def readCorpus(datasetDirectoryName):
    docSummaryDict = {}
    docSentencesDict = {}
    summaryline = False
    # datasetDirectoryName = ['cnn', 'dailymail']
    for dirName in datasetDirectoryName:
        filepath = join(getcwd(), dirName,'stories')
        allFiles = [f for f in listdir(filepath) if not (f.startswith('.')) and isfile(join(filepath, f))]
        for file in allFiles:
            filename = join(filepath, file)
            if 'cnn' in filename:
                fileKey = 'cnn_' + file
            else:
                fileKey = 'dailymail_' + file
            for line in open(filename,'r'): 
                if line != '\n':
                    if '@highlight' in line:
                        summaryline = True
                        continue
                    if not summaryline:
                        if fileKey in docSentencesDict:
                            docSentencesDict[fileKey].append(line.strip())
                        else:
                            docSentencesDict[fileKey] = [line.strip()]
                    else:
                        summaryline = False
                        if fileKey in docSummaryDict:
                            docSummaryDict[fileKey].append(line.strip())
                        else:
                            docSummaryDict[fileKey] = [line.strip()]
    return docSentencesDict, docSummaryDict 

# Term Doc Frequency dictionary

In [4]:
#  funciton to get term-document frequency
#  docSentencesDict should be stemmed words dictionary
#  returns you count of words appear in how many documents for idf
from collections import Counter
from nltk import word_tokenize
def getTermDocfrequency(docSentencesDict):
    termDocFreqDict = Counter()
#     each k is a unique file and v is list of sentences
    for k in docSentencesDict:
        v = docSentencesDict[k]
        fileSet = None
        for line in v:
            lineSet = {word for word in word_tokenize(line)}
#             print(lineSet)
            if fileSet is None:
                fileSet = lineSet
            else:
                fileSet = fileSet | lineSet
        for word in fileSet:
            termDocFreqDict[word] += 1
    return termDocFreqDict        

# remove stop words

In [5]:
# function to read stop words from a text file
def readStopWords(filename):
    text = []
    if(filename.endswith('txt')):
        file = open(filename, 'r')
        for line in file:
            text.append(line.strip())
        return text
    else:
        return None;


from nltk import word_tokenize
def removeStopWords(docSentencesDict):
    returnDocSentencesDict = {}
#     returnText = ''
 #  list of stopWords ( removed following words from list -> eight, eleven, fifteen, first, five, forty, four, nine, 
#   one, six, sixty, twelve, twenty, two, ten, )
# http://xpo6.com/download-stop-word-list/
    stopWordsFile = 'stop-word-list.txt'
    stopWords = readStopWords(stopWordsFile)
    if stopWords is None:
        raise Exception('Couldn\'t parse the given file. Stop Words list is empty. Please provide a text file to parse.')
#     print(stopWords)
    for file in docSentencesDict:
        modifiedTextArr = []
        textArr = docSentencesDict[file]
        for text in textArr:
            modifiedText = ''
            words = word_tokenize(text)
            for word in words:
                if word not in stopWords:
                    modifiedText += word + ' '
            modifiedTextArr.append(modifiedText)
        returnDocSentencesDict[file] = modifiedTextArr
    return returnDocSentencesDict     

In [2]:
# print(removeStopWords('This is four times better than previous solution'))

This four times better previous solution 


# Stemming

In [6]:
# nltk.download()
# creating stemmed text and stemmedTextDict(for term frequency)
from nltk.stem import PorterStemmer
from nltk import word_tokenize
def getStemmedText(docSentencesDict):
    returnedDocSentencesDict = {}
    stemmedWordFrequencyDict = {}
    ps = PorterStemmer()
    for file in docSentencesDict:
        modifiedTextArr = []
        textArr = docSentencesDict[file]
        for text in textArr:
            stemmedText = ''
#         text = docSentencesDict[file]
        
            words = word_tokenize(text)
            for word in words:
                stemmedWord = ps.stem(word)
                stemmedText += stemmedWord + ' '
                if(stemmedWordFrequencyDict.get(stemmedWord, None) == None):
                    stemmedWordFrequencyDict[stemmedWord] =1
                else:
                    stemmedWordFrequencyDict[stemmedWord] += 1 
            modifiedTextArr.append(stemmedText)
        returnedDocSentencesDict[file] = modifiedTextArr
    return returnedDocSentencesDict, stemmedWordFrequencyDict

In [5]:
# print(getStemmedText('python pythonly pythoning pythoned jatin sankar. jatin Sankar python'))

# Data set downloaded using following link
# https://cs.nyu.edu/~kcho/DMQA/

# Computing Lex Rank Scores

In [7]:
from nltk import word_tokenize
from collections import Counter
from math import pow, log, sqrt
from nltk import word_tokenize
def idfModifiedCosine(sentence1,sentence2, totalNumberOfDocs, termDocfreqDict):
#     print('sentence1 is ' + sentence1)
#     print('sentence2 is ' + sentence2)
    sent1TFDict = Counter()
    sent2TFDict = Counter()
    idfDict = {}
    wordsInbothSentences = None
    sentence1Arr = word_tokenize(sentence1)
    sentence2Arr = word_tokenize(sentence2)
    for word in sentence1Arr:
        sent1TFDict[word] += 1
        if wordsInbothSentences is None:
            wordsInbothSentences = {word}
        else:
            wordsInbothSentences = wordsInbothSentences | {word}
        if word not in idfDict:
#             print('calculate idf for word : ' +  word + ' ' +  str(termDocfreqDict[word]))
            idfDict[word] = log(totalNumberOfDocs / termDocfreqDict[word])
    for word in sentence2Arr:
        sent2TFDict[word] += 1
        if wordsInbothSentences is None:
            wordsInbothSentences = {word}
        else:
            wordsInbothSentences = wordsInbothSentences | {word}
        if word not in idfDict:
#             print('calculate idf for word : ' +  word + ' ' +  str(termDocfreqDict[word]))
            idfDict[word] = log(totalNumberOfDocs / termDocfreqDict[word])
    num  = 0
    denSent1 = 0
    denSent2 = 0
    for word in wordsInbothSentences:
        num += sent1TFDict.get(word,0) * sent2TFDict.get(word,0) * pow(idfDict[word],2)
        denSent1 += pow(sent1TFDict.get(word,0) * idfDict[word],2)
        denSent2 += pow(sent2TFDict.get(word,0) * idfDict[word],2)
#     print('denSent1 is ' + str(denSent1))
#     print('denSent2 is ' + str(denSent2))
    return num / (sqrt(denSent1) * sqrt(denSent2))

In [8]:
#  implementing matrix Product
import numpy as np
def matrixProduct(matA, matB):
    return np.matmul(matA, matB)

In [9]:
import numpy as np
def matrixTranspose(mat):
    return np.transpose(mat)

In [10]:
import numpy as np
def matrixDifference(mat1,mat2):
    return np.subtract(mat1,mat2)

In [11]:
from math import pow
# http://www.personal.soton.ac.uk/jav/soton/HELM/workbooks/workbook_30/30_4_matrx_norms.pdf
def calculateEucledeanNorm(mat):
    val = 0
    for row in mat:
        for col in range(len(row)):
            val += pow(row[col],2)
    return pow(val,1/2)

In [12]:
#  implementing power method
def powerMethod(cosineMatrix, N, tolerance):
    cosineMatrixTranspose = matrixTranspose(cosineMatrix)
    initializeP = [[(1/N) for x in range(N)] for y in range(N)]
    t = 0
    currentP = None
    while True:
        if currentP is None:
            oldP = initializeP
        else:
            oldP = currentP
        t += 1
        currentP = matrixProduct(cosineMatrixTranspose,oldP)
        differenceMatrix = matrixDifference(currentP, oldP)
        difference = calculateEucledeanNorm(differenceMatrix)      
        if difference < tolerance:
            return currentP      

In [7]:
# # split Text into sentences
# def SplitTextIntoSentences(text):
#     updatedText = ''
#     for i in range(1,len(text)):
#         if text[i-1] == '.' and text[i] == ' ':
#             updatedText += '\n'
#         else:
#             updatedText += text[i]
#     sentences = updatedText.split('\n')
#     return sentences

In [13]:
#  Lex Rank
#  need to check from where do we get tolerance, threshold
#  idf-modified-cosine
#  We might get divide by zero error, need to think how to avoid that.

def lexRank(sentences,threshold, tolerance, totalNumberOfDocs, termDocfreqDict):
    n = len(sentences)
    cosineMatrix = [[0 for x in range(n)] for y in range(n)]
    degree = [1 for x in range(n)]
    for i in range(0,n):
        for j in range(0,n):
            cosineMatrix[i][j] = idfModifiedCosine(sentences[i],sentences[j], totalNumberOfDocs, termDocfreqDict)
            if cosineMatrix[i][j] > threshold:
                cosineMatrix[i][j] = 1
                degree[i] += 1
            else:
                cosineMatrix[i][j] = 0
    for i in range(0,n):
        for j in range(0,n):
            cosineMatrix[i][j] = cosineMatrix[i][j]/degree[i]
    L = powerMethod(cosineMatrix, n, tolerance)
    return L, degree

# Commands to run Lex Rank

In [None]:
# docSentencesDict = readCorpus(['Test_cnn', 'Test_dailymail'])
# #  preprocessing
# removedStopWordsDocSentencesDict = removeStopWords(docSentencesDict)
# # stemmedWordFrequencyDict is not need in our implementation
# stemmedDocSentencesDict, stemmedWordFrequencyDict = getStemmedText(removedStopWordsDocSentencesDict)
# termDocFrequencyDict = getTermDocfrequency(stemmedDocSentencesDict)
# totalDocs = len(docSentencesDict)
# # testFile will be array of stemmed sentences in a test file
# # testFile = 
#  threshold is in range of [0.1 - 0.3]
#  very high thresholds may lose almost all of the information in a similarity matrix
# threshold = 0.1
#  tolerance is also consider as damping factor, range [0.1 - 0.2], research paper has taken it as 0.85
# tolerance = 0.1
# L = lexRank(testFile,threshold, tolerance, totalDocs, termDocFrequencyDict)

In [None]:
# Questions are
# 1) threshold value
# 2) how will I convert L returned by lex rank to readable summary

# Testing the code on first 5 files of cnn and dailymail

In [22]:
testDocSentencesDict, testDocSummaryDict = readCorpus(['Test_cnn', 'Test_dailymail'])
testDocSentencesDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story']

['By',
 'Ryan Gorman',
 'PUBLISHED:',
 '16:25 EST, 8 September 2013',
 '|',
 'UPDATED:',
 '04:49 EST, 9 September 2013',
 'A fan who caused outrage by appearing to taunt an alleged sexual assault victim on live television Saturday has explained his actions.',
 'A sign held up by a University of Michigan student behind the hosts of ESPN’s College Gameday pregame show that said ‘Hi Lizzy Seeberg’ sparked outrage and appeared to taunt a girl who accused a University of Notre Dame football player of sexual assaulting her Aug 31, 2010 only to have the allegations appear to have been ignored every step of the way.',
 'Ms Seeberg, 19, was a freshman at St Mary’s College, a small women’s only Catholic college literally across the street from the mighty Notre Dame.',
 "I did it: The person who held up the 'Hi Lizzy Seeberg' sign wrote a letter to Deadspin detailing his reasons behind the offensive gesture",
 'Unable to live with the aftermath of the alleged sexual assault and Notre Dame’s refus

In [15]:
testDocSummaryDict

{'cnn_0a0a4c90d59df9e36ffec4ba306b4f20f3ba4acb.story': ['Dean Obeidallah: A movie or TV show can educate or (mis)educate you',
  'Obeidallah: Two new films about hot issues are firing up both the left and right',
  'Senators slammed "Zero Dark Thirty," and energy industry attacked "Promised Land"',
  'Obeidallah: What does Hollywood want? To make money, of course'],
 'cnn_0a0aa464d262b903f44b0f8eaa67f13dd1946cfd.story': ['Math geeks and others celebrate Pi Day every March 14',
  'Pi, or roughly 3.14, is the ratio of circumference to diameter of a circle',
  'The Pi Day holiday idea started at the Exploratorium museum in San Francisco',
  'Albert Einstein was also born on March 14'],
 'cnn_0a0adc84ccbf9414613e145a3795dccc4828ddd4.story': ['MH370 families hold sit-in outside the Malaysian Embassy in Beijing',
  'Relatives marched from their hotel after request to meet Malaysian ambassador failed',
  'More than once in recent weeks Malaysian authorities have not shown up for talks with re

In [16]:
testRemovedStopWordsDocSentencesDict = removeStopWords(testDocSentencesDict)
# testRemovedStopWordsDocSentencesDict

In [17]:
testStemmedDocSentencesDict, testStemmedWordFrequencyDict = getStemmedText(testRemovedStopWordsDocSentencesDict)
# testStemmedDocSentencesDict

In [18]:
testTermDocFrequencyDict = getTermDocfrequency(testStemmedDocSentencesDict)
# for i in testTermDocFrequencyDict:
#     if testTermDocFrequencyDict[i] == 0:
#         print(i) 

In [19]:
testTotalDocs = len(testDocSentencesDict)
# testTotalDocs

In [20]:
testFile = testStemmedDocSentencesDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story']
# testFile

In [21]:
#  threshold is in range of [0.1 - 0.3]
#  very high thresholds may lose almost all of the information in a similarity matrix
threshold = 0.1
#  tolerance is also consider as damping factor, range [0.1 - 0.2], research paper has taken it as 0.85
tolerance = 0.1
L, degree = lexRank(testFile,threshold, tolerance, testTotalDocs, testTermDocFrequencyDict)
print(degree)
print(L)

[2, 2, 2, 3, 2, 2, 3, 6, 36, 31, 15, 31, 5, 29, 8, 11, 6, 15, 7, 7, 11, 25, 20, 22, 10, 19, 27, 9, 26, 9, 22, 25, 36, 20, 18, 4, 24, 22, 23, 9, 11, 20, 18, 21, 27, 34, 22, 4, 34, 14, 9, 17, 21, 4, 28, 20]
[[ 0.00223214  0.00223214  0.00223214 ...,  0.00223214  0.00223214
   0.00223214]
 [ 0.00223214  0.00223214  0.00223214 ...,  0.00223214  0.00223214
   0.00223214]
 [ 0.00223214  0.00223214  0.00223214 ...,  0.00223214  0.00223214
   0.00223214]
 ..., 
 [ 0.00266344  0.00266344  0.00266344 ...,  0.00266344  0.00266344
   0.00266344]
 [ 0.02237334  0.02237334  0.02237334 ...,  0.02237334  0.02237334
   0.02237334]
 [ 0.01532171  0.01532171  0.01532171 ...,  0.01532171  0.01532171
   0.01532171]]


In [38]:
i=0
for row in L:
#     for col in row:
#         print(str(col)) + ('  ')
    print(str(row[0]) + '    ' + str(i))
    i += 1
#     print('\n')

0.00223214285714    0
0.00223214285714    1
0.00223214285714    2
0.00529100529101    3
0.00223214285714    4
0.00223214285714    5
0.00529100529101    6
0.00468070642306    7
0.0297699977457    8
0.0252193332511    9
0.0121864644853    10
0.0252267874421    11
0.00344407495949    12
0.0239974370511    13
0.00648438705022    14
0.00868490119059    15
0.0050185702546    16
0.0123047152782    17
0.0051611345926    18
0.00594245255039    19
0.00839433957475    20
0.0196451157558    21
0.0153217077772    22
0.0170494624164    23
0.00803096563537    24
0.0157341491407    25
0.0213325865694    26
0.00699766826522    27
0.0209259338627    28
0.00736143451619    29
0.0170494624164    30
0.0204710936616    31
0.029564724658    32
0.0153217077772    33
0.0148544971176    34
0.0026977932114    35
0.020298005326    36
0.0181123897966    37
0.0178742082288    38
0.00689157965386    39
0.0086885386593    40
0.0153217077772    41
0.0148725529708    42
0.0174938024643    43
0.0213840849302    44
0.027

In [52]:
maxVal = 0
for row in L:
    if row[0] > maxVal:
        maxVal = row[0]    

In [61]:
cutOff = 20
cutOffVal = ((100 - cutOff)/100) * maxVal
print(cutOffVal)
i = 0
for row in L:
    if row[0] > cutOffVal:
        print(testDocSentencesDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story'][i])
    i += 1

0.0238159981966
A sign held up by a University of Michigan student behind the hosts of ESPN’s College Gameday pregame show that said ‘Hi Lizzy Seeberg’ sparked outrage and appeared to taunt a girl who accused a University of Notre Dame football player of sexual assaulting her Aug 31, 2010 only to have the allegations appear to have been ignored every step of the way.
Ms Seeberg, 19, was a freshman at St Mary’s College, a small women’s only Catholic college literally across the street from the mighty Notre Dame.
Unable to live with the aftermath of the alleged sexual assault and Notre Dame’s refusal to even acknowledge her repeated attempts to engage the school in any manner, Ms Seeberg was found dead nine days later – the devout Catholic had overdosed on antidepressants prescribed to treat her anxiety and depression.
Having seemingly dissipated into the ether of time, Lizzy Seeberg’s name, and alleged plight, weren’t on the minds of many people Saturday morning until a sign saying ‘Hi 

In [62]:
testDocSummaryDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story']

['Despite widespread outrage, he claims the sign was meant to bring attention to her case, and that of others whose accusations of sexual assault at the University of Notre Dame have been ignored or covered up',
 'Lizzy Seeberg, 19, was a freshman at a small Catholic college across the street from Notre Dame',
 'Ms Seeberg had a history of anxiety and depression, and overdosed on her prescribed medication to kill herself',
 'Notre Dame administration refused for over two years to even acknowledge the accusations, outside of A JOKE made by football coach Brian Kelly',
 'The football player accused of the assault was never suspended, never missed a game and was allowed to attend all practices',
 "The trip was Notre Dame's last to Michigan, the student felt the need to 'stand up for what's right'"]

In [65]:
len(testDocSentencesDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story'])
# sum(testDocSentencesDict.values())

56

In [40]:
testDocSentencesDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story'][9]

'Ms Seeberg, 19, was a freshman at St Mary’s College, a small women’s only Catholic college literally across the street from the mighty Notre Dame.'

In [41]:
testDocSentencesDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story'][11]

'Unable to live with the aftermath of the alleged sexual assault and Notre Dame’s refusal to even acknowledge her repeated attempts to engage the school in any manner, Ms Seeberg was found dead nine days later – the devout Catholic had overdosed on antidepressants prescribed to treat her anxiety and depression.'

In [42]:
testDocSentencesDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story'][13]

'Having seemingly dissipated into the ether of time, Lizzy Seeberg’s name, and alleged plight, weren’t on the minds of many people Saturday morning until a sign saying ‘Hi Lizzy Seeberg’ appeared behind College Gameday co-host Desmond Howard while discussing the Notre Dame-Michigan game being played later that day on UM’s campus.'

In [47]:
testDocSentencesDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story'][32]

'Ms Seeberg’s story took months to be made public after she accused a Notre Dame football player of the sexual assault. The allegations were detailed in a police report obtained by National Catholic Reporter.'

In [48]:
testDocSentencesDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story'][45]

"The same day the Chicago Tribune made Ms Seeberg’s allegations and subsequent suicide public – almost three months after the fact – Notre Dame football coach joked during a media conference call ‘that he didn't know the Tribune could afford all the reporters who were peppering him with questions about the case,’ according to NCR. Coach Kelly did not even suspend the player in question while the accusations were looked into, he played in every game, and was part of every single practice."

In [43]:
testDocSentencesDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story'][48]

'Ms Seeberg’s case isn’t the first one of females accusing Notre Dame football players of sexual assault, rape or other improprieties.'

In [44]:
testDocSummaryDict['dailymail_0a0a733db965c3fdf9bc2895104a1ef884a3d593.story']

['Despite widespread outrage, he claims the sign was meant to bring attention to her case, and that of others whose accusations of sexual assault at the University of Notre Dame have been ignored or covered up',
 'Lizzy Seeberg, 19, was a freshman at a small Catholic college across the street from Notre Dame',
 'Ms Seeberg had a history of anxiety and depression, and overdosed on her prescribed medication to kill herself',
 'Notre Dame administration refused for over two years to even acknowledge the accusations, outside of A JOKE made by football coach Brian Kelly',
 'The football player accused of the assault was never suspended, never missed a game and was allowed to attend all practices',
 "The trip was Notre Dame's last to Michigan, the student felt the need to 'stand up for what's right'"]