## 0.0 Import modules

In [1]:
## NLP modules
import gensim
from gensim.models.doc2vec import Doc2Vec
import nltk
import textblob
from textblob import TextBlob
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords ##Note you'll need to download NLTK and corpuses
from spacy.en import English ##Note you'll need to install Spacy and download its dependencies
parser = English()

## Other Python modules
import itertools
from operator import itemgetter
import re
import string
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline

## Graph module
import networkx as nx

## Machine learning & text vectorizer modules
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer



## 1.0 Functions

### 1.1 Text pre-processing functions

In [2]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    
    # replace twitter @mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@MENTION", text)
    text = re.sub('[^a-zA-Z ]','',text)
    # replace HTML symbols
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    
    # lowercase
    text = text.lower()
#     text = str(TextBlob(text).correct())
    return text

# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):
    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

## Tokenizer specific for Doc2Vec where each word is important, so stop words are not removed.
def doc2vec_tokenizeText(corpus):
    punctuation = """.,?!:;(){}[]"""
    corpus = [z.lower().replace('\n','') for z in corpus]
    corpus = [z.replace('<br />', ' ') for z in corpus]

    #treat punctuation as individual words
    for c in punctuation:
        corpus = [z.replace(c, ' %s '%c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus

### 1.2 Similarity measure functions

In [3]:
def cos_sim(text1,text2):
    tfvectorizer = TfidfVectorizer(tokenizer=tokenizeText)
    arrays = tfvectorizer.fit_transform([text1,text2]).A
    num = (arrays[0]*arrays[1]).sum()
    denom1 = np.sqrt((arrays[0]**2).sum())
    denom2 = np.sqrt((arrays[1]**2).sum())
    return num/(denom1*denom2)

def similarity(string1,string2):
    w1 = tokenizeText(cleanText(string1))
    w2 = tokenizeText(cleanText(string2))
    score = 0
    for w in w1:
        if w in w2:
            score += 1
        else:
            continue
    for w in w2:
        if w in w1:
            score += 1
        else:
            continue
    return score/(len(w1)+len(w2))

def lDistance(firstString, secondString):
    "Function to find the Levenshtein distance between two words/sentences - gotten from http://rosettacode.org/wiki/Levenshtein_distance#Python"
    if len(firstString) > len(secondString):
        firstString, secondString = secondString, firstString
    distances = range(len(firstString) + 1)
    for index2, char2 in enumerate(secondString):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(firstString):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1], distances[index1+1], newDistances[-1])))
        distances = newDistances
    return 1/(distances[-1]+1)


### 1.3 Graph building functions

In [4]:
def buildGraph(nodes,weight):
    "nodes - list of hashables that represents the nodes of the graph"
    gr = nx.Graph() #initialize an undirected graph
    gr.add_nodes_from(nodes)
    nodePairs = list(itertools.combinations(nodes, 2))

    #add edges to the graph (weighted by Levenshtein distance)
    for pair in nodePairs:
        firstString = pair[0]
        secondString = pair[1]
        if weight == 'cosine':
            edge_weight = cos_sim(firstString, secondString)
        if weight == 'similarity':
            edge_weight = similarity(firstString, secondString)
        if weight == 'ldistance':
            edge_weight = lDistance(firstString, secondString)
        gr.add_edge(firstString, secondString, weight=edge_weight)
    
    return gr


### 1.4 Summarizer functions

In [9]:

def draw_graph(text,weight='cosine'):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens,weight=weight)
    nx.draw_networkx(graph)

def textrank_summarizer(text,weight='cosine',num_sen = 5, pos_score = False):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens,weight=weight)

    calculated_page_rank = nx.pagerank(graph, weight='weight')
    
    #Create position scorelist
    #Scores are caluclated such that page rank score is increased by 10% if its the first or last sentence
    #Sentences in the middle of the document are not given increased scores
    pos = np.array(list(range(len(sentenceTokens))))
#     score_array = np.array(1+(abs((pos+0.5) - len(pos)/2)/max((pos+0.5) - len(pos)/2)/10))
#     score_dict = {}
#     for i in range(len(sentenceTokens)):
#         score_dict[sentenceTokens[i]] = score_array[i] 

#     #Adjusts page rank score for position
#     score_adj_page_rank = {k : v * score_dict[k] for k, v in calculated_page_rank.items() if k in score_dict}
    
    #most important sentences in ascending order of importance
#     if pos_score == True:
#         sentences = sorted(score_adj_page_rank, key=score_adj_page_rank.get,reverse=True)
#     else:
#         sentences = sorted(calculated_page_rank, key=calculated_page_rank.get,reverse=True)
  
    #return a word summary
    pos_dict = {}
    for i in range(len(sentenceTokens)):
        pos_dict[sentenceTokens[i]] = pos[i] 

    combined = {k : [v, pos_dict[k]] for k, v in calculated_page_rank.items() if k in pos_dict}

    listlist = []
    for k, v in combined.items():
        listlist.append((k,v[0],v[1]))

    listlist.sort(key=lambda x: x[1],reverse=True)

    summarysentences = listlist[0:num_sen]

    summarysentences.sort(key=lambda x: x[2],reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
        summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())

    return summary



def doc2vec_summarizer(text,num_sen=5):

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())

    LabeledSentence = gensim.models.doc2vec.LabeledSentence

    sentences = doc2vec_tokenizeText(sentenceTokens)

    def labelizeReviews(reviews, label_type):
        labelized = []
        for i,v in enumerate(reviews):
            label = '%s_%s'%(label_type,i)
            labelized.append(LabeledSentence(v, [label]))
        return labelized

    sentences = labelizeReviews(sentences,'train')

    model = Doc2Vec(min_count=1, window=10, size=500, sample=1e-4, workers=8)
    model.build_vocab(sentences)
    for epoch in range(1000):
        model.train(sentences)


    docvec = []
    for i in range(len(model.docvecs)):
        docvec.append(model.docvecs[i])

    kmeans = KMeans(n_clusters=1)

    kmeans.fit(docvec)

    distance = pairwise_distances(kmeans.cluster_centers_, docvec)

    pos = np.array(list(range(len(sentenceTokens))))

    listlist = [list(x) for x in zip(sentenceTokens,distance.tolist()[0],pos)]

    listlist.sort(key=lambda x: x[1],reverse=False)

    ## Sort by sentence order
    summarysentences = listlist[0:num_sen]

    summarysentences.sort(key=lambda x: x[2],reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
        summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())

    return summary
        
def lsa_summarizer(text,num_sen=5):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())

    tfvectorizer = TfidfVectorizer(tokenizer=tokenizeText)
    sparse = tfvectorizer.fit_transform(sentenceTokens).A
    lsa = TruncatedSVD(n_components=1)
    concept = lsa.fit_transform(sparse)

    pos = np.array(list(range(len(sentenceTokens))))    
    
    listlist = [list(x) for x in zip(sentenceTokens,concept,pos)]

    listlist.sort(key=lambda x: x[1],reverse=True)

    summarysentences = listlist[0:num_sen]

    summarysentences.sort(key=lambda x: x[2],reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
        summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())

    return summary

In [10]:
experitext = "The boy pet the dog. Boy petted the dog. The dog got petted boy. Dog said woof woof. Boy said come here boy. Dog did not go to boy. Cat is fat. Bear is big. Pig is cute. The cavs won the game. Warriors still have a chance. Penguins escaped the sharks!"

In [11]:
print(experitext)

The boy pet the dog. Boy petted the dog. The dog got petted boy. Dog said woof woof. Boy said come here boy. Dog did not go to boy. Cat is fat. Bear is big. Pig is cute. The cavs won the game. Warriors still have a chance. Penguins escaped the sharks!


In [12]:
textrank_summarizer(experitext, weight='cosine')



'The boy pet the dog. Boy petted the dog. The dog got petted boy. Boy said come here boy. Dog did not go to boy.'

In [13]:
textrank_summarizer(experitext, weight='similarity')



'The boy pet the dog. Boy petted the dog. The dog got petted boy. Boy said come here boy. Dog did not go to boy.'

In [14]:
lsa_summarizer(experitext)

'The boy pet the dog. Boy petted the dog. The dog got petted boy. Boy said come here boy. Dog did not go to boy.'

In [16]:
doc2vec_summarizer(experitext)

'The boy pet the dog. Boy petted the dog. The dog got petted boy. Dog said woof woof. Pig is cute.'

## 2.0 Sample runs

Sample Text

In [364]:
text2 = '\n\nBRUSSELS – The “Brexiteers” – those who want Britain to leave the European Union – argue that their goal would be virtually cost-free and have no effect on the United Kingdom’s global trade. They are wrong. On June 23, when voters in Britain cast their ballots in the referendum on the question, they need to consider what is actually involved in leaving the EU – and how the free-trade benefits they now enjoy (and take for granted) could be maintained after Brexit. Start with the basics. Leaving the EU means that the UK would exit the EU’s Customs Union, which is the basis for cross-border free trade among the EU’s 28 members (and establishes a common external tariff vis-à-vis third countries). It also means exit from the Single Market – the basis for the free movement of goods and services among EU members. By definition, non-members of the EU cannot belong to the Single Market.\n\nWill Brexit Destroy Britain and Europe?\n\nPhilippe Legrain weighs the views of Joschka Fischer, Richard Haass, Joseph Nye, and others on what Britain’s withdrawal from the EU would mean for both sides.\n\nSo what would happen next? During the two-year period before Britain’s withdrawal takes final effect, there would be UK-EU negotiations on many points – sovereignty, the legal order, immigration, finances, and economic matters. The assumption is that a crucial goal for Britain would be to negotiate a trading relationship as close as possible to the free-trade relationships that exist today.That is easier said than done. The best result would be if all players agreed to maintain the free trade already achieved, with the UK setting a new external tariff on a duty-free basis, applicable to all comers. This is what happened in the 1970s after Britain and Denmark left the European Free Trade Association: Free-trade agreements were negotiated among EFTA members and between them and the EU (or the EEC as it was then known). But Brexiteers should realize that there is no guarantee that this would happen again – and, in any case, there would be complications. While this solution would be good for the 45% of British exports that are sold in EU markets, it would reduce protection for British industries to zero. Under the rules of the World Trade Organization, the same import duties must be applied to all WTO participants – which means that if Britain’s imports from the EU are duty-free, its imports from the rest of the world must be, too. The alternative would be for UK exporters to accept the EU’s common external tariff, and for the UK to create its own import tariff, applied to all imports, including from the EU. Because the common tariff is at a relatively low level on industrial and fishery products, this might not be an insuperable barrier for UK exports, and it would allow some flexibility in protecting UK companies from imports. The potential pitfall is that any British tariff increase above the EU level would expose the UK to claims for compensation from third countries in the WTO. The bigger question that the Brexiteers need to answer is how to secure a high level of access to the EU’s internal market. This is vitally important for Britain’s service industries, particularly for the City of London’s exports of financial services. There is only one precedent for non-EU members being able to negotiate access to the internal market equivalent to that enjoyed by EU members. This is the European Economic Area agreement concluded with the EU by Norway, Iceland, and Liechtenstein in 1992.In the view of many observers, including me, access to the Single Market through the EEA is no longer available. But what if we’re wrong? The point is that such a deal would go against all the instincts (and rhetoric) of the Brexiteers, because it would mean accepting the EU’s “four freedoms”: not just the free movement of goods, services, and capital – but of people, too. That would be hard to square with the Brexit objective of “controlling our borders.” The Brexiteers would also blanch at Britain’s obligation under an EEA-type agreement to continue contributing to the EU budget.Of course, there could be specific arrangements for particular sectors. But it seems unlikely that such arrangements would be possible in financial services and the major professional services (including doctors, architects, and lawyers), which are important for Britain’s competitors in Europe. Indeed, it is possible that the EU would adopt a tough mercantilist stance: If you want privileged access, you should stay in the club. The final consequence of Brexit is that the UK would lose its free-trade arrangements with third countries under the many trade agreements that the EU has signed since 2000. Replacing these agreements with bilateral deals would take time. There is no guarantee that the EU would agree to an interim continuation of free trade, and it seems certain that UK exports would face higher tariffs than its former EU partners in those third countries (placing British exporters at a competitive disadvantage).\n\nIt’s a big stretch for the Brexiteers to ask us to believe that the world’s most prominent leaders, European or not, have all got the impact of Brexit wrong. In the trade debate, we have had the International Monetary Fund, US President Barack Obama, and the OECD – quite apart from the UK Treasury – telling Britons that a vote to leave would be bad for the economy. It is not much of a reply to argue that the OECD is “in the pay of the EU” or that Obama is anti-British because he had a Kenyan father in British colonial days.Economic forecasting is an uncertain science. But when almost all projections point in the same direction – that Brexit would be hugely damaging to the UK – it is time to decide what is credible and what is not.\n\n'

In [428]:
textrank_summarizer(text2,weight='cosine')



'Leaving the EU means that the UK would exit the EU’s Customs Union, which is the basis for cross-border free trade among the EU’s 28 members (and establishes a common external tariff vis-à-vis third countries).It also means exit from the Single Market – the basis for the free movement of goods and services among EU members.This is what happened in the 1970s after Britain and Denmark left the European Free Trade Association: Free-trade agreements were negotiated among EFTA members and between them and the EU (or the EEC as it was then known).The alternative would be for UK exporters to accept the EU’s common external tariff, and for the UK to create its own import tariff, applied to all imports, including from the EU.There is no guarantee that the EU would agree to an interim continuation of free trade, and it seems certain that UK exports would face higher tariffs than its former EU partners in those third countries (placing British exporters at a competitive disadvantage).'

In [384]:
lsa_summarizer(text2)

'Leaving the EU means that the UK would exit the EU’s Customs Union, which is the basis for cross-border free trade among the EU’s 28 members (and establishes a common external tariff vis-à-vis third countries).It also means exit from the Single Market – the basis for the free movement of goods and services among EU members.This is what happened in the 1970s after Britain and Denmark left the European Free Trade Association: Free-trade agreements were negotiated among EFTA members and between them and the EU (or the EEC as it was then known).The alternative would be for UK exporters to accept the EU’s common external tariff, and for the UK to create its own import tariff, applied to all imports, including from the EU.The point is that such a deal would go against all the instincts (and rhetoric) of the Brexiteers, because it would mean accepting the EU’s “four freedoms”: not just the free movement of goods, services, and capital – but of people, too.'

In [378]:
doc2vec_summarizer(text2)

'On June 23, when voters in Britain cast their ballots in the referendum on the question, they need to consider what is actually involved in leaving the EU – and how the free-trade benefits they now enjoy (and take for granted) could be maintained after Brexit.The assumption is that a crucial goal for Britain would be to negotiate a trading relationship as close as possible to the free-trade relationships that exist today.That is easier said than done.The point is that such a deal would go against all the instincts (and rhetoric) of the Brexiteers, because it would mean accepting the EU’s “four freedoms”: not just the free movement of goods, services, and capital – but of people, too.That would be hard to square with the Brexit objective of “controlling our borders.” The Brexiteers would also blanch at Britain’s obligation under an EEA-type agreement to continue contributing to the EU budget.Of course, there could be specific arrangements for particular sectors.There is no guarantee th

import PyPDF2

from PyPDF2 import PdfFileWriter, PdfFileReader

input1 = PdfFileReader(open("C:/Users/kennd/Downloads/documentation.pdf", "rb"))

pages = input1.getNumPages()

text = ""
for i in range(pages):
    text += input1.getPage(i).extractText()
    text = " ".join(text.replace(u"\xa0", u" ").strip().split())



In [8]:
import numpy as np

from pylatex import Document, Section, Subsection, Tabular, Math, TikZ, Axis,Plot, Figure, Package, Matrix
from pylatex.utils import italic
import os

doc = Document()

with doc.create(Section('The simple stuff')):
    doc.append('Some regular text and some')
    doc.append(italic('italic text. '))
    doc.append('\nAlso some crazy characters: $&#{}')
    with doc.create(Subsection('Math that is incorrect')):
        doc.append(Math(data=['2*3', '=', 9]))

    with doc.create(Subsection('Table of something')):
        with doc.create(Tabular('rc|cl')) as table:
            table.add_hline()
            table.add_row((1, 2, 3, 4))
            table.add_hline(1, 2)
            table.add_empty_row()
            table.add_row((4, 5, 6, 7))

a = np.array([[100, 10, 20]]).T
M = np.matrix([[2, 3, 4],
               [0, 0, 1],
               [0, 0, 2]])

with doc.create(Section('The fancy stuff')):
    with doc.create(Subsection('Correct matrix equations')):
        doc.append(Math(data=[Matrix(M), Matrix(a), '=', Matrix(M * a)]))
                
doc.generate_pdf('full',compiler='pdflatex')

FileNotFoundError: [WinError 2] The system cannot find the file specified