In [None]:
import warnings
import spacy
import autocorrect
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short sentences, one document each.
# NOTE(review): 'playedd' looks like a deliberate misspelling that motivates
# the spelling-correction step further down — confirm before "fixing" it.
documents = [
    'He playedd baseball',
    'He plays football',
    'He had a sandwich'
]

# lowercase=False keeps the original casing of every token.
toDTM = CountVectorizer(lowercase=False)
# Learn the vocabulary from `documents` and densify the resulting count matrix.
DTM = toDTM.fit_transform(documents).todense()
# Vocabulary terms, used as the column labels of the table below.
words = toDTM.get_feature_names()

# One row per document, one column per term; the last expression displays it.
summary = pd.DataFrame(DTM, columns = words, index = documents)
summary

### Reducing the number of columns in a Document Term Matrix:
* Normalization and lemmatization
* Spelling correction
* Setting vocabulary size thresholds

In [None]:
#create the processing function
def process(document):
    """
    Lemmatize a document with the module-level spaCy pipeline ``nlp``.

    ``nlp`` is looked up when the function is called, so the result depends
    on whichever pipeline (and ``make_doc`` hook) is bound to that name at
    that moment.

    Parameters
    ----------
    document: str
        The document we want to process

    Returns
    -------
    str
        The lemma of every token in the document, joined by single spaces.
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so this is the
    # same conversion as unicode(document) without the Python-2-only name.
    text = type(u'')(document)

    #create spacy object
    spacy_doc = nlp(text, parse=False, entity=False)

    #grab the lemma for each token in the document and join them to a string
    return " ".join(token.lemma_ for token in spacy_doc)


#Create the Custom tokenizer
class SpellTokenizer(object):
    """
    Callable that tokenizes text and spell-corrects every token.

    Intended to be assigned to ``nlp.make_doc``: it tokenizes the text with
    the pipeline's tokenizer, passes each token through ``autocorrect.spell``
    and builds a new ``Doc`` from the corrected words.

    Parameters
    ----------
    nlp:
        The spaCy pipeline whose ``vocab`` and ``tokenizer`` are used.
    """

    def __init__(self, nlp):
        self.vocab = nlp.vocab
        # Keep the tokenizer of the pipeline we were given. Reading the global
        # `nlp` at call time would pair this vocab with a different pipeline's
        # tokenizer once the global name is rebound.
        self.tokenizer = nlp.tokenizer

    def __call__(self, text):
        # type(u'') is `unicode` on Python 2 and `str` on Python 3.
        doc = self.tokenizer(type(u'')(text))
        words = [autocorrect.spell(token.orth_) for token in doc]
        return spacy.tokens.Doc(self.vocab, words = words)

#create a language model that uses the custom tokenizer
nlp = spacy.load('en')
# Replace the pipeline's document-construction hook so every text is
# spell-corrected by SpellTokenizer before further processing.
nlp.make_doc = SpellTokenizer(nlp)        
    
#pass in the process function to sklearns vectorizer
# `process` runs on each raw document before the vectorizer tokenizes it.
toDTM = CountVectorizer(preprocessor=process 
                        , min_df = 0. #minimum document frequency; per scikit-learn docs a float is a fraction of documents
                       )

# Rebuild the document-term matrix, now from the processed text.
DTM = toDTM.fit_transform(documents).todense()
words = toDTM.get_feature_names()

# Rows are still labelled with the ORIGINAL (unprocessed) documents.
summary = pd.DataFrame(DTM, columns = words, index = documents)
summary

### Retrieving Documents in a DTM

0) Encode documents as a DTM

1) Encode the query

2) Compute the similarity (or distance) between the query vector and each document row of the DTM

3) Pick the best match: argmax for similarities, argmin for distances

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def getMostSimilarSentence(query, documents):
    """
    Return the document that is closest to `query` by cosine similarity.

    A CountVectorizer (with `process` as its preprocessor) is fitted on
    `documents`; the query is encoded with the same vocabulary and scored
    against every document vector.

    Parameters
    ----------
    query:     (string) text to look up
    documents: (sequence of strings) candidate documents, indexed by position

    Returns
    -------
    most similar document (string)
    """
    #fit the vectorizer on the candidates and densify their term counts
    vectorizer = CountVectorizer(preprocessor=process)
    document_vectors = vectorizer.fit_transform(documents).todense()

    #encode the query and score it against every document
    scores = computeSimilarities(encodeQuery(query, vectorizer), document_vectors)

    #return the document with the highest score
    return documents[getMostSimilarIdx(scores)]

def encodeQuery(query, vectorizer):
    """
    Encode a single query with an already fitted vectorizer.

    The query is wrapped in a one-element list, passed to
    ``vectorizer.transform`` and the result is densified with ``.todense()``.
    """
    return vectorizer.transform([query]).todense()

def computeSimilarities(query_vector, dtm):
    """
    Cosine similarity of one query vector against every row of a DTM.

    Parameters
    ----------
    query_vector: 2-D array-like with a single row (the encoded query)
    dtm:          2-D array-like, one row per document

    Returns
    -------
    1-D array with one similarity per row of `dtm`, in row order
    """
    # Plain ndarrays: `.todense()` hands us np.matrix objects, which are not
    # accepted everywhere.
    query_array = np.asarray(query_vector)
    dtm_array = np.asarray(dtm)

    # Compare the query with the documents directly. Stacking everything and
    # computing all pairwise similarities builds an (n+1) x (n+1) matrix only
    # to read a single row of it.
    return cosine_similarity(query_array, dtm_array)[0]

def getMostSimilarIdx(similarities):
    """Return the position of the largest entry in `similarities`."""
    best_idx = np.argmax(similarities)
    return best_idx
    
# Misspelled on purpose (presumably): the lookup only succeeds if `process`
# spell-corrects the query before it is vectorized.
query = 'Foootball'
getMostSimilarSentence(query, documents)

In [None]:
#make labels 0/1 about sports
# Labels follow the row order of `summary`: baseball -> 1, football -> 1,
# sandwich -> 0. They are keyed by the actual index values because pandas
# aligns on the index: a key that differs from the row label by even one
# character (e.g. 'played' vs 'playedd') leaves that row's label as NaN.
about_sports = dict(zip(summary.index, [1, 1, 0]))
summary['about_sports'] = pd.Series(about_sports)

# Features are the term-count columns only; the label column is excluded.
X = summary[words].values
y = summary['about_sports']

#regress labels on elements of DTM
from sklearn.linear_model import LogisticRegression
# A very large C means almost no regularization.
model = LogisticRegression(C = 100000.)
model.fit(X, y)

#show coefficients
pd.Series(model.coef_[0], index = words).sort_values()

In [None]:
# Repeats the previous cell's fit (same X, y and C), stored under `mod`
# instead of `model`.
from sklearn.linear_model import LogisticRegression
mod = LogisticRegression(C = 100000.)
# Term-count columns as features, the 0/1 sports label as target.
X = summary[words].values
y = summary['about_sports']

mod.fit(X, y)
# Coefficient per term, smallest first.
pd.Series(mod.coef_[0], index = words).sort_values()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Term-by-term similarity: each term is represented by its column of counts.
# Select the term columns explicitly, because `summary` also holds the
# 'about_sports' label column, which would add a row that `words` cannot label.
pd.DataFrame(cosine_similarity(summary[words].T.values), index = words, columns = words)

In [None]:
from IPython.display import HTML
# Embed the SVG diagram; the path is relative, so the file is presumably
# expected next to the notebook.
HTML("<img src='UsingEmbeddingsForML.svg' width=650 height=500/>")

In [None]:
import spacy 
# Load a fresh English pipeline. This rebinds the global `nlp`, so the
# SpellTokenizer assigned to `nlp.make_doc` earlier is not part of the
# pipeline used from here on (including inside `process`).
nlp = spacy.load('en')

In [None]:
text = u'Word vectors are fantastic!'
doc = nlp(text)
# Second token of the sentence.
token = doc[1]
# First 25 components of the token's vector. The parenthesized call prints the
# same text on Python 2 and is also valid syntax on Python 3.
print(token.vector[:25])

In [None]:
import numpy as np

# Average the token vectors component-wise. Without axis=0, np.mean collapses
# everything into a single scalar.
average_of_token_vectors = np.mean([token.vector for token in doc], axis=0)
document_vector = doc.vector
# The document vector should equal that average up to float rounding.
# (`all(a - b)` would instead assert that every difference is NON-zero.)
assert np.allclose(average_of_token_vectors, document_vector, atol=1e-6)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline

offline.init_notebook_mode()


#grab word vectors for each word
words = [u'cat',u'dog',u'man',u'woman']
# A real list rather than map(): on Python 3 map() is a lazy iterator, which
# cosine_similarity cannot turn into a 2-D array.
vectors = [nlp(word).vector for word in words]

#create a dataframe of similarities
similarities = cosine_similarity(vectors)
similarity_matrix = pd.DataFrame(similarities, index = words, columns = words)


data = [go.Heatmap( z=similarity_matrix.T.values.tolist()
                   , colorscale='OrRd'
                   ,x = words
                   ,y = words
                  )]


# NOTE(review): offline mode is initialised above, but the figure is rendered
# through plotly.plotly (`py`) — confirm which one is intended.
py.iplot(data)

In [None]:
#grab word vectors for each word
words = [u'cat',u'dog',u'man',u'woman',u'women']
# Must be a real list: it is appended to below, and on Python 3 map() returns
# an iterator with no .append().
vectors = [nlp(word).vector for word in words]

# Estimate a "pluralisation" direction as the mean singular -> plural offset.
plural_men = nlp(u'men').vector - nlp(u'man').vector
plural_dogs = nlp(u'dogs').vector - nlp(u'dog').vector

plural = (plural_men + plural_dogs) / 2

# Apply that offset to 'woman' and add the result as an extra row.
vectors.append(plural + vectors[words.index('woman')])
words.append('Plural Plus Woman')

similarities = cosine_similarity(np.array(vectors))
similarity_matrix = pd.DataFrame(similarities, index = words, columns = words)

data = [go.Heatmap( z=similarity_matrix.T.values.tolist()
                   , colorscale='OrRd'
                   ,x = words
                   ,y = words
                  )]


py.iplot(data)

### Document Term Matrices

In [None]:
from IPython.display import HTML
# Read through a context manager so the file handle is closed right away
# instead of being left to the garbage collector.
with open('dtm.html') as dtm_file:
    dtm_html = dtm_file.read()
HTML(dtm_html)

### The Brown Corpus

In [None]:
import nltk
from sklearn.cross_validation import train_test_split
from collections import Counter
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

def nltk_corpus(corpus_name):
    """
    Return the NLTK corpus named `corpus_name`, downloading it if missing.

    Parameters
    ----------
    corpus_name: str
        Attribute name of the corpus on ``nltk.corpus`` (e.g. 'brown').

    Returns
    -------
    The corpus object obtained from ``nltk.corpus``.
    """
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except LookupError:
        # NLTK signals missing corpus data with LookupError. Catching only
        # that keeps KeyboardInterrupt and genuine bugs from being swallowed.
        nltk.download(corpus_name)
    return corpus

def corpus_to_x_y(corpus):
    """
    Turn a categorized corpus into parallel tuples of texts and labels.

    For every file id, the text is the file's words joined by single spaces
    and the label is the first category listed for that file.

    Returns
    -------
    (x, y): tuple of document strings, tuple of category labels
    """
    pairs = [
        (" ".join(corpus.words(fileid)), corpus.categories(fileid)[0])
        for fileid in corpus.fileids()
    ]
    # Unzip the (text, label) pairs into two parallel tuples.
    x, y = zip(*pairs)
    return x, y


#prepare data for classification
# NOTE: this rebinds `documents`, which held the three toy sentences above.
documents, categories = corpus_to_x_y(nltk_corpus('brown'))
# Fixed seed so the shuffled order (and everything derived from it) is the
# same on every run.
documents, categories = shuffle(documents, categories, random_state=42)
encoder = LabelEncoder()
y = encoder.fit_transform(categories)

#Category Breakdown
c = Counter(categories)
for i in c:
    # Same "<category> <count>" text as `print i, c[i]`, in a form that is
    # valid on both Python 2 and Python 3.
    print("{} {}".format(i, c[i]))

### Classifying documents with the DTM

In [None]:
#Create DTM


from sklearn.feature_extraction.text import CountVectorizer
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the legacy module.
    from sklearn.cross_validation import train_test_split

# Fixed seed so the train/test split is reproducible.
doc_train, doc_test, y_train, y_test = train_test_split(documents, y, test_size = .33, random_state = 42)

#create document term matrix with CountVectorizer
Vectorizer = CountVectorizer(stop_words='english')

#create training and testing DTM
# The vocabulary is learned from the training documents only; the test
# documents are encoded with that same vocabulary.
X_train_dtm = Vectorizer.fit_transform(doc_train).todense()
X_test_dtm = Vectorizer.transform(doc_test).todense()

print("Shape of Document Term Matrix: {}".format(X_train_dtm.shape))

In [None]:
#classify using DTM
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import log_loss 
from sklearn.metrics import classification_report, f1_score, precision_score
import pandas as pd

# Result stores filled in by train_and_validate. Each is a two-level dict:
# feature-set name -> model.__module__ -> value.
f1_scores = {}  # weighted F1 on the test split
models = {}     # the fitted model objects
losses = {}     # log loss on the test split

def train_and_validate(name, model_classes,X_train, X_test, y_train, y_test):
    """
    Fit every model on the training split and record its test-split scores.

    Results go into the module-level dicts ``f1_scores``, ``losses`` and
    ``models`` under ``[name][model.__module__]``. Because the key is the
    defining module, two models from the same module overwrite each other.

    Parameters
    ----------
    name:          label of the feature set (first-level key of the stores)
    model_classes: iterable of estimator INSTANCES (despite the name); each
                   must provide fit, predict and predict_proba
    X_train, X_test: feature matrices for the two splits
    y_train, y_test: target labels for the two splits

    Returns
    -------
    None; the three module-level dicts are updated in place.
    """
    scores_for_name = f1_scores.setdefault(name, {})
    models_for_name = models.setdefault(name, {})
    losses_for_name = losses.setdefault(name, {})

    for model in model_classes:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        probabilities = model.predict_proba(X_test)

        key = model.__module__
        # Tell log_loss which class each probability column stands for.
        # Otherwise it infers the classes from y_test and fails whenever a
        # rare class happens to be missing from the test split.
        losses_for_name[key] = log_loss(y_test, probabilities, labels=model.classes_)
        scores_for_name[key] = f1_score(y_test, predictions, average = 'weighted')
        models_for_name[key] = model
        

In [None]:
# Estimator instances (not classes, despite the name) to compare on the DTM.
# SVC gets probability=True because train_and_validate calls predict_proba.
model_classes = [LogisticRegression()  
              , GaussianNB()
              , SVC(kernel='linear', probability=True)
              , RandomForestClassifier(n_estimators=100)
              , DummyClassifier()]

# Results are stored under the 'DTM' key of f1_scores / losses / models.
train_and_validate('DTM', model_classes, X_train_dtm, X_test_dtm, y_train, y_test)

In [None]:
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go



# Materialise the keys as a list: on Python 3 dict.keys() is a view object,
# not a sequence, while on Python 2 this is the same list as before.
model_names = list(f1_scores['DTM'])

# One bar per model, labelled by the module that defines it.
dtm_f1_trace = go.Bar(
                    y=[f1_scores['DTM'][model] for model in model_names],
                    x=model_names
)

layout = go.Layout(
    barmode='group', title='F1 Scores for Using Document Term Matrices'
)

fig = go.Figure(data=[dtm_f1_trace], layout = layout)
py.iplot(fig, filename='make-subplots')

### Classifying with Word Vectors

In [None]:
#use spacy to get word vectors
import numpy as np

# Keyword flags passed unchanged to both nlp.pipe calls.
pipe_options = dict(n_threads=4, tag=False, parse=False, entity=False)

# One document vector per text, stacked into an array for each split.
X_train_vec = np.array([doc.vector for doc in nlp.pipe(doc_train, **pipe_options)])
X_test_vec = np.array([doc.vector for doc in nlp.pipe(doc_test, **pipe_options)])

In [None]:
# Fresh estimator instances for the word-vector features. Compared with the
# DTM run, LogisticRegression and SVC are given larger C values here.
model_classes =[LogisticRegression(C = 1000.)
              , GaussianNB()
              , SVC(C = 10000., kernel='linear', probability = True)
              , RandomForestClassifier(n_estimators=100)
              , DummyClassifier()]

# Results are stored under the 'Word Vectors' key of the result dicts.
train_and_validate('Word Vectors', model_classes, X_train_vec, X_test_vec, y_train, y_test)

In [None]:
# Module names of the current models; these are the second-level keys that
# train_and_validate used when storing the scores.
model_names = [i.__module__ for i in model_classes]

# F1 per model on the document-term-matrix features.
dtm_f1_trace = go.Bar(
                    y=[f1_scores['DTM'][model] for model in model_names],
                    x=model_names,
                    name = 'Document Term Matrix'
)

# F1 per model on the word-vector features.
vect_f1_trace = go.Bar(
                    y=[f1_scores['Word Vectors'][model] for model in model_names],
                    x=model_names,
                    name = 'Word Vectors'
)

# barmode='group' draws the two traces side by side for each model.
layout = go.Layout(
    barmode='group', title='F1 Scores for Using Document Term Matrices', yaxis=dict(title = 'F1 Score')
)

fig = go.Figure(data=[dtm_f1_trace, vect_f1_trace], layout = layout)
py.iplot(fig, filename='make-subplots')

In [None]:
def text_to_inputs(text):
    """
    Encode one text, or an iterable of texts, for both kinds of model.

    Uses the module-level ``nlp`` pipeline for the vectors and the fitted
    module-level ``Vectorizer`` for the document-term encoding.

    Parameters
    ----------
    text: a single string, or an iterable of strings

    Returns
    -------
    dict with two entries:
        'DTM'         -- result of ``Vectorizer.transform``
        'Word Vector' -- a single vector for a single string, otherwise an
                         array with one vector per text
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    text_type = type(u'')

    # Check for strings explicitly. On Python 3 a str has __iter__, so a plain
    # hasattr() check would treat a single string as a sequence of characters.
    is_single_text = isinstance(text, (bytes, text_type)) or not hasattr(text, '__iter__')

    if is_single_text:
        vec = nlp(text_type(text)).vector
        dtm = Vectorizer.transform([text])
    else:
        vec = np.array([nlp(text_type(t)).vector for t in text])
        dtm = Vectorizer.transform(text)

    return {'DTM':dtm,
            'Word Vector': vec
           }


# A single unseen sentence, wrapped in a list so it is encoded as a batch.
new_docs = ['Obama was in office before Trump']

In [None]:
# Encode the new documents once and pick out both representations.
new_doc_inputs = text_to_inputs(new_docs)
x_vec = new_doc_inputs['Word Vector']
x_dtm = new_doc_inputs['DTM']

# Key under which train_and_validate stored the logistic regression models.
logistic_key = 'sklearn.linear_model.logistic'

# Class probabilities from each logistic regression, one column per category.
vec_test = pd.DataFrame(models['Word Vectors'][logistic_key].predict_proba(x_vec), columns = encoder.classes_)
dtm_test = pd.DataFrame(models['DTM'][logistic_key].predict_proba(x_dtm), columns = encoder.classes_)



In [None]:
# Is the term part of the fitted vocabulary? `vocabulary_` maps every learned
# term to its column index, so membership matches the feature-name list.
'roosevelt' in Vectorizer.vocabulary_

In [None]:
# Categories ordered by ascending word-vector probability. After transposing,
# column 0 holds the probabilities of the single new document. `sort_values`
# replaces the removed DataFrame.sort and matches the rest of the notebook.
order = vec_test.T.sort_values(0).index.values

# NOTE: the variable names are historical. This trace shows the WORD-VECTOR
# model's probabilities...
dtm_f1_trace = go.Bar(
                    y=vec_test.loc[0].loc[order].values,
                    x=order,
                    name = 'Word Vector Predicted'
)

# ...and this one the DOCUMENT-TERM model's probabilities.
vect_f1_trace = go.Bar(
                    y=dtm_test.loc[0].loc[order].values,
                    x=order,
                    name = 'Document Term Predicted'
)

# The bars are predict_proba outputs, so the axis is labelled as a
# probability rather than an F1 score.
layout = go.Layout(
    barmode='group', title='Predicted Classes of "Obama was in office before Trump"', yaxis=dict(title = 'Predicted Probability')
)

fig = go.Figure(data=[dtm_f1_trace, vect_f1_trace], layout = layout)
py.iplot(fig, filename='make-subplots')

In [None]:
#Create the table, keep me hidden
N_WORDS = 10
N_DOCS = 10

# Draw the example terms from the vocabulary of `Vectorizer`, the
# CountVectorizer fitted on the training documents earlier in the notebook.
# replace=False keeps the terms distinct, so the dict built from them below
# always has N_WORDS entries.
random_words = np.random.choice(Vectorizer.get_feature_names(), size=N_WORDS, replace=False)

def gen_random_vec(n, n_values=10):
    """
    Draw `n` random counts from 0 .. n_values-1, heavily skewed towards 0.

    Count k is drawn with probability proportional to 1 / (k + 1) ** 3.

    Parameters
    ----------
    n:        number of counts to draw
    n_values: number of distinct count values (default 10, i.e. counts 0..9)

    Returns
    -------
    numpy array of `n` integers
    """
    # Float arithmetic on purpose: with Python 2 integer division every weight
    # except the first becomes 0, so only the value 0 is ever drawn.
    weights = 1.0 / (np.arange(n_values) + 1.0) ** 3
    # Normalise once so the weights form a probability distribution.
    probabilities = weights / weights.sum()
    return np.random.choice(n_values, p = probabilities, size = n)

# One row of random counts per sampled word.
d = {word: gen_random_vec(N_DOCS) for word in random_words}
dtm_random = pd.DataFrame.from_dict(d, orient = 'index')

# Two-level labels built as plain lists of lists: a constant outer level
# ('Documents' / 'Words') over the existing labels. On Python 3 map() would
# hand pandas a lazy iterator here instead of the list it needs.
c = [['Documents'] * len(dtm_random.columns), list(dtm_random.columns)]
dtm_random.columns  = c

c = [['Words'] * len(dtm_random.index), list(dtm_random.index.values)]
dtm_random.index  = c


# The file is opened in binary mode, so encode the HTML text explicitly.
# Writing the text object directly fails on Python 3.
with open('dtm.html', 'wb') as f:
    f.write(dtm_random.T.to_html().encode('utf-8'))