In [1]:
import spacy
import numpy as np
nlp = spacy.load('en_core_web_sm')

In [2]:
L = list(nlp.vocab.strings)

In [3]:
numWords = len(L)
print(numWords)

84780


In [7]:
w_2_i_mappings = dict(zip(L, np.arange(numWords)))
i_2_w_mappings = dict(zip(np.arange(numWords), L))

### Creating One Hot Vector for words

In [13]:
def oneHotVector(word, W2I, numWords):
    oneHotVector = np.zeros(numWords, dtype= int)
    oneHotVector[w_2_i_mappings[word]] = 1   
    return oneHotVector
    f

In [14]:
v = oneHotVector('game', w_2_i_mappings, numWords)

## Term Frequency for Document Representations Implementations

In [17]:
doc = 'How are you today? I know most of the time how you feel?'
tokens = [token.text for token in nlp(doc) ]

In [20]:
v = np.zeros(numWords)
for token in tokens:
    v += oneHotVector(token, w_2_i_mappings, numWords)

In [23]:
v[w_2_i_mappings['?']]

2.0

### TFIDF for Document Representations Implementation Reading Corpus

In [24]:
from sklearn.datasets import fetch_20newsgroups as getData

In [25]:
corpus = getData(subset = 'train', remove =('headers', 'footers', 'qoutes'))

In [26]:
docs = corpus.data


In [28]:
len(docs)

11314

In [29]:
print(docs[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [32]:
document_frequency = np.zeros(numWords)

for term in L:
    document_frequency_term = 0
    for doc in docs[:100]:
        if term in doc:
            document_frequency_term += 1
    document_frequency[w_2_i_mappings[term]] = document_frequency_term
    

In [34]:
N = 100

In [38]:
Idf = np.log10(N/(document_frequency+1))

In [58]:
doc =  'How are you today? I am fine'
v = np.zeros(numWords)
for token in nlp(doc):
    v = oneHotVector(token.text, w_2_i_mappings, numWords)

In [59]:
tf = np.log10(v+1)
tfidf = tf*Idf
tfidf

array([0., 0., 0., ..., 0., 0., 0.])

In [61]:
np.sum(tfidf != 0)

1

In [64]:
from sklearn.datasets import fetch_20newsgroups as getData
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB as NB

In [65]:
def loadCorpus():
    corpus = getData(subset = 'train', 
                     remove = ('headers', 'footers', 'quotes'),
                    categories = ['alt.atheism',
                                 'comp.graphics',
                                 'soc.religion.christian'])
    return corpus

In [66]:
corpus = loadCorpus()

In [69]:
def buildTFIDModel(docs):
    
    cv = CountVectorizer()
    ct = cv.fit(docs)
    counts = ct.transform(docs)
    
    tfidf = TfidfTransformer().fit(counts)
    return ct, tfidf

In [70]:
ct, tfidf = buildTFIDModel(corpus.data)

In [73]:
def computeTFIDFFeatures(docs, ct, tfidf):
    counts = ct.transform(docs)
    xF = tfidf.transform(counts)
    return xF.toarray()

In [74]:
xF = computeTFIDFFeatures(corpus.data, ct, tfidf)

In [80]:
docs_new = ['God loves everyone',
           'OpenGL works fast',
           'No one is there']

xF_new = computeTFIDFFeatures(docs_new, ct, tfidf)

In [84]:
clf = NB().fit(xF, corpus.target)

In [88]:
predicted = clf.predict(xF_new)

In [92]:
print(corpus.target_names[predicted[2]])

alt.atheism
