# Advanced: Text Processing in Matrices

## Load Natural Language Toolkit for Parsing

In [1]:
! pip install nltk
import nltk

# Enter 'd' for Download, then 'punkt', and then 'q' for quit
nltk.download()


Collecting nltk
  Downloading nltk-3.2.2.tar.gz (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 914kB/s ta 0:00:01
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk ... [?25l- \ | done
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/42/b5/27/718985cd9719e8a44a405d264d98214c7a607fb65f3a006f28
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.2.2
[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> punkt
    Downloading package punkt to /home/jovyan/nltk_data...
      Unzipping t

True

## Import text files into dictionary

As a "corpus" we fetched some data from Wikipedia, based on currently
trendy (2/18/2017) topics.  Each topic had multiple interpretations, some of which 
we suspected would "intersect" in interesting ways (e.g., Trump/Putin, Cloud/Google, 
Cloud/Climate).  Others had various interpretations (e.g., there are many types of 
Football).  See _Wikipedia.ipynb_ for the original download code.

Selected topics (for which the top-10 matches were returned by Wikipedia) were:

 * Pennsylvania
 * Trump
 * Apple
 * Google
 * Farm
 * Climate
 * Cloud
 * Football
 * Government
 * Putin

*docs* is a dictionary that maps each file name to the full text of that file.

In [2]:
import os

# Map each file name in the 'text' directory to the file's full contents.
docs = {}

for filename in os.listdir('text'):
    # The context manager closes every file handle as soon as it has been
    # read, instead of leaving one open handle per document.
    with open(os.path.join('text', filename)) as text_file:
        docs[filename] = text_file.read()
    print ('Loaded',filename)

print ("All files loaded")

Loaded Google Books.txt
Loaded Apple II series.txt
Loaded Trump fragrances.txt
Loaded Family of Donald Trump.txt
Loaded Pennsylvania Historical and Museum Commission.txt
Loaded Government agency.txt
Loaded Climate model.txt
Loaded Football player.txt
Loaded Pennsylvania Railroad.txt
Loaded Pennsylvania.txt
Loaded Russia under Vladimir Putin.txt
Loaded Cooking apple.txt
Loaded Legal affairs of Donald Trump.txt
Loaded Tag cloud.txt
Loaded Farm.txt
Loaded Climate justice.txt
Loaded Calumet Farm.txt
Loaded Google Hangouts.txt
Loaded Mediterranean climate.txt
Loaded American football.txt
Loaded Outline of Pennsylvania.txt
Loaded Flag football.txt
Loaded Arrest of Vladimir Putin viral video.txt
Loaded Football team.txt
Loaded Farm Aid.txt
Loaded Pennsylvania Regions.txt
Loaded Eric Trump.txt
Loaded Google+.txt
Loaded Climate.txt
Loaded Public image of Vladimir Putin.txt
Loaded Local government.txt
Loaded Stratus cloud.txt
Loaded E-government.txt
Loaded Home Farm F.C..txt
Loaded Football.txt


## Other preliminaries to get you started.

The function *has_letter* should be used to filter words based on the presence of a letter.

The set *stopwords* includes words to ignore.

In [70]:
import nltk
from nltk.stem.porter import *
import re
import numpy as np

"""
# Returns True if the input (string) parameter has
# any sort of letter in it, else returns False.
"""
def has_letter(x):
    """Return True if the string ``x`` contains at least one ASCII letter.

    Parameters
    ----------
    x : str
        The token to inspect.

    Returns
    -------
    bool
        True when any character of ``x`` is in a-z or A-Z, else False.
    """
    # re.search scans the whole string, including past newlines. The earlier
    # '.*[a-zA-Z].*' pattern with re.match missed letters that followed a
    # newline, because '.' does not match '\n'.
    return re.search('[a-zA-Z]', x) is not None

# Stopwords are words we will ignore for search
# purposes, because they are too common to be useful.
# One entry is read per line of stopwords.txt, with surrounding
# whitespace removed; the file is closed as soon as it has been read.
with open('stopwords.txt') as stop_file:
    stopwords = {line.strip() for line in stop_file}

# The NLTK parser breaks apostrophe-s into a separate "word"
# so we'll want to add it to the list... Though it's technically
# not a stop word in the traditional sense.
stopwords.add("'s")

# Use this as the maximum number of words we will index.
# It is the column count of the term-count matrix and the length of every
# query vector built later in this notebook.
# NOTE(review): 18174 looks like the exact vocabulary size of this corpus
# (see the commented-out len(lexicon) line after the indexing loop) -- confirm.
MAX_WORDS = 18174

# Stemmer applied to every indexed token and to every query token.
# NOTE(review): mode='ORIGINAL_ALGORITHM' presumably selects Porter's
# original algorithm rather than NLTK's extended variant -- confirm in the
# NLTK documentation.
stemmer = PorterStemmer(mode='ORIGINAL_ALGORITHM')

In [56]:
# Display the stop-word set loaded above as a sanity check.
stopwords

{"'s",
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 "can't",
 'cannot',
 'could',
 "couldn't",
 'did',
 "didn't",
 'do',
 'does',
 "doesn't",
 'doing',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 "hadn't",
 'has',
 "hasn't",
 'have',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 "here's",
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 "how's",
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 "let's",
 'me',
 'more',
 'most',
 "mustn't",
 'my',
 'myself',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'ought',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'same',
 "shan't",
 'she',
 "she'd",
 "she'll",
 "she's",
 'shoul

# Your Code Goes Here!

Note that you may want to read more about TF*IDF scoring at:

* http://nlp.stanford.edu/IR-book/html/htmledition/term-frequency-and-weighting-1.html
* https://en.wikipedia.org/wiki/Tf%E2%80%93idf

In [17]:
import pandas as pd

In [71]:
# lexicon[k] is the stemmed word counted in column k of doc_vectors.
lexicon = []
# Not used by any of the cells shown in this notebook.
inverse_lexicon = []
# One row per document, one column per lexicon slot, holding raw term counts.
# np is used directly: the pd.np alias is deprecated and was removed in
# pandas 2.0.
doc_vectors = np.zeros((len(docs), MAX_WORDS))

In [72]:

# Build the lexicon and the raw term-count matrix in one pass over the corpus.
#
# * Tokens are lower-cased, filtered with has_letter / stopwords, then stemmed.
# * A stem seen for the first time is given the next free column, in order of
#   first occurrence, until MAX_WORDS columns are in use. Going through
#   set() here would make the column order change from one kernel run to the
#   next.
# * word_index mirrors lexicon as a stem -> column dictionary so that each
#   token costs one dictionary lookup instead of a scan of the whole list.
#
# NOTE: if the corpus has fewer than MAX_WORDS distinct stems, the columns
# from len(lexicon) onwards stay all-zero.
word_index = {word: col for col, word in enumerate(lexicon)}

for row, filename in enumerate(docs):
    tokens = (tok.lower() for tok in nltk.word_tokenize(docs[filename]))
    stems = [stemmer.stem(tok) for tok in tokens
             if has_letter(tok) and tok not in stopwords]

    # Start from a clean row so re-running this cell does not double-count.
    doc_vectors[row, :] = 0

    for stem in stems:
        col = word_index.get(stem)
        if col is None and len(lexicon) < MAX_WORDS:
            # First sighting of this stem and there is still room for it.
            col = len(lexicon)
            lexicon.append(stem)
            word_index[stem] = col
        if col is not None:
            doc_vectors[row, col] += 1

In [83]:
# Inverse document frequency per lexicon column:
#     idf[k] = log10(number of documents / number of documents containing term k)
# Columns that no document uses get an IDF of 0 instead of dividing by zero,
# which would give inf here and NaN once multiplied by a zero count.
doc_freq = (doc_vectors > 0).sum(axis=0)
idf_values = np.zeros(MAX_WORDS)
term_seen = doc_freq > 0
idf_values[term_seen] = np.log10(float(len(docs)) / doc_freq[term_seen])
# Kept as a plain list, as before, so later cells see the same type.
idf = list(idf_values)

In [84]:
# Inspect the per-term IDF weights computed in the previous cell.
idf

[1.9912260756924949,
 0.71247247473966591,
 1.9912260756924949,
 0.84509804001425681,
 0.66900678095857558,
 0.3010299956639812,
 1.9912260756924949,
 1.5141048209728325,
 0.81513481663681364,
 0.91204482964486999,
 0.94983339053426974,
 0.94983339053426974,
 1.9912260756924949,
 1.9912260756924949,
 0.36797678529459443,
 0.37844221897275931,
 0.87728272338565816,
 1.146128035678238,
 1.9912260756924949,
 0.42302435162549989,
 0.84509804001425681,
 1.9912260756924949,
 1.5141048209728325,
 1.3891660843645326,
 0.54406804435027567,
 1.6901960800285136,
 0.76077715431422099,
 1.5141048209728325,
 0.25883231586952637,
 1.2130748253088512,
 0.78710609303657009,
 0.5288280777935388,
 0.35775762011290835,
 1.9912260756924949,
 0.84509804001425681,
 1.9912260756924949,
 0.44715803134221921,
 0.41144247907568471,
 0.87728272338565816,
 0.71247247473966591,
 0.47271213581460736,
 1.9912260756924949,
 1.2922560713564761,
 1.9912260756924949,
 0.51410482097283239,
 0.59328606702045716,
 0.3099848

In [152]:
def search(vectors, idf, query, num_results):
    """Score every document against ``query`` using TF*IDF cosine similarity.

    Parameters
    ----------
    vectors : 2-D array-like
        Raw term counts, one row per document and one column per term.
    idf : sequence of float
        IDF weight for each column of ``vectors``.
    query : str
        Free-text query; it is vectorised with ``create_query_vector``.
    num_results : int
        Currently unused. Kept so that existing calls keep working.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vectors), 1)``. Entry ``k`` is the cosine
        similarity between document ``k`` and the query, or 0.0 when either
        of the two weighted vectors has zero length.
    """
    if len(vectors) == 0:
        return np.zeros((0, 1))

    idf = np.asarray(idf, dtype=float)
    # A non-finite IDF (from a term with zero document frequency) would turn
    # every score into NaN; such a term carries no information, so weight it 0.
    idf = np.where(np.isfinite(idf), idf, 0.0)

    weighted_query = create_query_vector(query) * idf
    weighted_docs = np.asarray(vectors, dtype=float) * idf

    dots = weighted_docs.dot(weighted_query)
    norms = np.linalg.norm(weighted_docs, axis=1) * np.linalg.norm(weighted_query)

    # Divide only where the denominator is non-zero; an empty document or a
    # query with no known terms scores 0 instead of NaN.
    cos_sim = np.zeros((len(weighted_docs), 1))
    comparable = norms > 0
    cos_sim[comparable, 0] = dots[comparable] / norms[comparable]
    return cos_sim

In [153]:
def create_query_vector(query):
    """Turn a free-text query into a term-count vector over the lexicon.

    The query is tokenised with ``nltk.word_tokenize``. Each token is
    lower-cased, dropped if it is a stop word, and stemmed with the
    notebook-level ``stemmer``. Stems that are not in ``lexicon`` are ignored.

    Parameters
    ----------
    query : str
        The search text.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``MAX_WORDS``. Entry ``k`` is the number of query
        tokens whose stem is ``lexicon[k]``.
    """
    q_vec = np.zeros(MAX_WORDS)
    for token in nltk.word_tokenize(query):
        word = token.lower()
        if word in stopwords:
            continue
        # Stem once and look the stem up once.
        stem = stemmer.stem(word)
        try:
            column = lexicon.index(stem)
        except ValueError:
            # The stem never occurred in the indexed corpus.
            continue
        q_vec[column] += 1
    return q_vec
            

In [154]:
# Cosine similarity of every document against the query. The last argument
# (number of results) is accepted by search() but not used by it, so one
# score per document is returned.
search(doc_vectors,idf, 'Apple Steve jobs' , 10)

array([[ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.30491143],
       [ 0.        ],
       [ 0.00527645],
       [ 0.00808595],
       [ 0.        ],
       [ 0.00561584],
       [ 0.        ],
       [ 0.33702934],
       [ 0.00449055],
       [ 0.        ],
       [ 0.        ],
       [ 0.33007157],
       [ 0.00553675],
       [ 0.01549771],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.00926239],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.00983386],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.27519078],
       [ 0.        ],
       [ 0.        ],
       [ 0.00628831],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.00600249],
       [ 0.        ],
       [ 0.40090371],
       [ 0

In [144]:
# Scratch check: build the query vector on its own so it can be inspected.
q_vec=create_query_vector('Apple Steve jobs')

In [126]:
# Number of lexicon columns the query touched, i.e. distinct query stems
# that are present in the lexicon.
sum(q_vec>0)

3

In [148]:
# Sum of the IDF-weighted query-term counts.
sum(q_vec*idf)

2.3504289366795845

In [136]:
# Hand-made example: cosine similarity between two small count vectors,
# using the same dot / (norm * norm) formula as search().
d=[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]
q=[2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
np.dot(d,q)/(np.linalg.norm(d)*np.linalg.norm(q))

0.42640143271122083