# In this notebook I've put the various other models I tried out that I discarded; namely, LDA models based on TFIDF and CountVec

In [5]:
import pandas as pd
import numpy as np
import sklearn
import gensim
import nltk
import os
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

Populating the interactive namespace from numpy and matplotlib


In [9]:
df = pd.DataFrame.from_csv("CleanedApril2018.csv")

  """Entry point for launching an IPython kernel.


# The first half will be LDA-TFIDF

In [54]:
from nltk.corpus import stopwords
nltk.download('punkt')
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/JonathonBowyer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
stopword = stopword.union(set(['trump', 'donald', 'us', 'politics', 'j', 'united', 'states']))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/JonathonBowyer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
def  chapter_reader():
    for i in df['keywords']:
        yield (x for x in 
            gensim.utils.tokenize(i, lowercase=True, deacc=True, 
                                  errors="ignore")
            if x not in stopword)

In [56]:
corpus = []
dictionary = gensim.corpora.Dictionary(chapter_reader())
dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=100000)
for values in chapter_reader():
    corpus.append(dictionary.doc2bow(values))

In [58]:
tfidf = gensim.models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]

In [95]:
lda = gensim.models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=5)
lda.print_topics()

[(0,
  '0.008*"program" + 0.008*"tv" + 0.007*"television" + 0.004*"united" + 0.004*"states" + 0.003*"syria" + 0.003*"housing" + 0.003*"international" + 0.003*"relations" + 0.003*"defense"'),
 (1,
  '0.005*"united" + 0.005*"states" + 0.004*"politics" + 0.004*"government" + 0.004*"trump" + 0.003*"elections" + 0.003*"j" + 0.003*"donald" + 0.002*"murders" + 0.002*"new"'),
 (2,
  '0.003*"ny" + 0.003*"us" + 0.003*"trump" + 0.003*"j" + 0.003*"states" + 0.003*"united" + 0.003*"estate" + 0.003*"real" + 0.003*"politics" + 0.003*"government"'),
 (3,
  '0.004*"trump" + 0.004*"states" + 0.004*"united" + 0.004*"politics" + 0.003*"government" + 0.003*"puzzles" + 0.003*"j" + 0.003*"crossword" + 0.003*"donald" + 0.003*"us"'),
 (4,
  '0.006*"crossword" + 0.006*"puzzles" + 0.004*"party" + 0.004*"united" + 0.004*"trade" + 0.004*"states" + 0.004*"j" + 0.004*"international" + 0.004*"trump" + 0.003*"donald"')]

In [96]:
doc_topics = []
for i in chapter_reader():
   bow = dictionary.doc2bow(i)
   doc_topics.append(lda.get_document_topics(bow))

In [97]:
doc_topics

[[(0, 0.012576562),
  (1, 0.94962305),
  (2, 0.012610654),
  (3, 0.012551008),
  (4, 0.012638752)],
 [(0, 0.018580053),
  (1, 0.92584115),
  (2, 0.018403234),
  (3, 0.018755235),
  (4, 0.018420316)],
 [(0, 0.022250505),
  (1, 0.022319848),
  (2, 0.022327194),
  (3, 0.9108502),
  (4, 0.022252236)],
 [(0, 0.025255034),
  (1, 0.025466407),
  (2, 0.02548827),
  (3, 0.025217624),
  (4, 0.8985727)],
 [(0, 0.8658745),
  (1, 0.033451263),
  (2, 0.03363768),
  (3, 0.03334826),
  (4, 0.033688236)],
 [(0, 0.014527261),
  (1, 0.014388927),
  (2, 0.9417702),
  (3, 0.01467715),
  (4, 0.0146364495)],
 [(0, 0.018387519),
  (1, 0.92615604),
  (2, 0.018371372),
  (3, 0.018446423),
  (4, 0.018638598)],
 [(0, 0.015556941),
  (1, 0.015633585),
  (2, 0.93745065),
  (3, 0.01585661),
  (4, 0.015502244)],
 [(0, 0.015468398),
  (1, 0.015623618),
  (2, 0.015491952),
  (3, 0.9379728),
  (4, 0.015443254)],
 [(0, 0.91838056),
  (1, 0.020144394),
  (2, 0.020293659),
  (3, 0.020833133),
  (4, 0.020348292)],
 [(3, 0.9

In [98]:
probs = []
for i in doc_topics:
    row=[]
    for j in i:
        row.append(j[1])
    probs.append(row)

In [100]:
dfLDA = pd.DataFrame(probs)
dfLDA = dfLDA.fillna(0)

In [102]:
y = df["Outcome"]

In [103]:
X_train, X_test, y_train, y_test = train_test_split(dfLDA, y, test_size=0.3, random_state=0)

# Now we'll move to modeling based on these TFIDF Vectors.
# We'll start with TFIDF Naive Bayes

In [104]:
clf = GaussianNB()

In [105]:
clf.fit(X_train, y_train)

GaussianNB(priors=None)

In [106]:
clf.score(X_train, y_train)

0.5751552795031056

In [107]:
clf.score(X_test, y_test)

0.518840579710145

# TFIDF GradientBoosting

In [109]:
gradboost = GradientBoostingClassifier()

In [110]:
n_range = [10, 100, 300]
l_rate = [.001, .01, .1]
depth = [3, 5]
param_grid = dict(n_estimators=n_range, learning_rate=l_rate, max_depth=depth, random_state=[2])

In [111]:
grid = GridSearchCV(gradboost, param_grid, cv=10, scoring='accuracy', pre_dispatch=-1)

In [112]:
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 100, 300], 'learning_rate': [0.001, 0.01, 0.1], 'max_depth': [3, 5], 'random_state': [2]},
       pre_dispatch=-1, refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [113]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.6571428571428571
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'random_state': 2}
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=2, subsample=1.0, verbose=0,
              warm_start=False)


# TFIDF Random Forest

In [115]:
randomforest = RandomForestClassifier()

In [119]:
n_range = [10, 50, 100, 300]
feat_range = [2, 5]
depth = [2, 5]
param_grid = dict(n_estimators=n_range, max_features=feat_range, max_depth=depth)

In [120]:
grid = GridSearchCV(randomforest, param_grid, cv=10, scoring='accuracy', pre_dispatch=-1)

In [121]:
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100, 300], 'max_features': [2, 5], 'max_depth': [2, 5]},
       pre_dispatch=-1, refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [122]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.6571428571428571
{'max_depth': 5, 'max_features': 5, 'n_estimators': 50}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


# Now we'll look at LDA with CountVec

In [38]:
def  chapter_reader():
    for i in df['keywords']:
        yield (x for x in 
            gensim.utils.tokenize(i, lowercase=True, deacc=True, 
                                  errors="ignore")
            if x not in stopword)

In [39]:
corpus = []
dictionary = gensim.corpora.Dictionary(chapter_reader())
dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=100000)
for values in chapter_reader():
    corpus.append(dictionary.doc2bow(values))

In [61]:
lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=10)

In [62]:
lda.print_topics()

[(0,
  '0.015*"states" + 0.015*"united" + 0.012*"politics" + 0.012*"government" + 0.012*"new" + 0.009*"york" + 0.008*"j" + 0.007*"donald" + 0.007*"city" + 0.007*"trump"'),
 (1,
  '0.021*"trump" + 0.017*"program" + 0.017*"tv" + 0.016*"j" + 0.015*"donald" + 0.015*"states" + 0.015*"united" + 0.013*"politics" + 0.012*"government" + 0.011*"elections"'),
 (2,
  '0.012*"media" + 0.012*"politics" + 0.011*"government" + 0.010*"new" + 0.010*"news" + 0.009*"party" + 0.009*"york" + 0.009*"inc" + 0.008*"nyc" + 0.007*"ny"'),
 (3,
  '0.010*"ny" + 0.010*"united" + 0.010*"states" + 0.009*"politics" + 0.009*"restaurant" + 0.008*"government" + 0.008*"manhattan" + 0.006*"international" + 0.006*"donald" + 0.006*"trade"'),
 (4,
  '0.015*"housing" + 0.014*"estate" + 0.014*"real" + 0.013*"ny" + 0.013*"manhattan" + 0.012*"residential" + 0.011*"theater" + 0.011*"j" + 0.008*"politics" + 0.008*"trump"'),
 (5,
  '0.024*"states" + 0.022*"united" + 0.015*"government" + 0.015*"politics" + 0.011*"us" + 0.011*"trump" +

In [63]:
doc_topics = []
for i in chapter_reader():
   bow = dictionary.doc2bow(i)
   doc_topics.append(lda.get_document_topics(bow))

In [64]:
probs = []
for i in doc_topics:
    row=[]
    for j in i:
        row.append(j[1])
    probs.append(row)

In [65]:
dfLDA = pd.DataFrame(probs)
dfLDA = dfLDA.fillna(0)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(dfLDA, y, test_size=0.3, random_state=0)

In [67]:
clf = GaussianNB()

In [68]:
clf.fit(X_train, y_train)

GaussianNB(priors=None)

In [69]:
clf.score(X_train, y_train)

0.5714285714285714

In [70]:
clf.score(X_test, y_test)

0.5420289855072464

In [72]:
gradboost = GradientBoostingClassifier()

In [73]:
n_range = [10, 100, 300]
l_rate = [.001, .01, .1]
depth = [3, 5]
param_grid = dict(n_estimators=n_range, learning_rate=l_rate, max_depth=depth, random_state=[2])

In [74]:
grid = GridSearchCV(gradboost, param_grid, cv=10, scoring='accuracy', pre_dispatch=-1)

In [75]:
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 100, 300], 'learning_rate': [0.001, 0.01, 0.1], 'max_depth': [3, 5], 'random_state': [2]},
       pre_dispatch=-1, refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [76]:
grid.grid_scores_




[mean: 0.53540, std: 0.00299, params: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'random_state': 2},
 mean: 0.53665, std: 0.00531, params: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'random_state': 2},
 mean: 0.61491, std: 0.02810, params: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 300, 'random_state': 2},
 mean: 0.53540, std: 0.00299, params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 10, 'random_state': 2},
 mean: 0.54286, std: 0.02254, params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 100, 'random_state': 2},
 mean: 0.59752, std: 0.03232, params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 300, 'random_state': 2},
 mean: 0.53665, std: 0.00531, params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 10, 'random_state': 2},
 mean: 0.62484, std: 0.02389, params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'random_state': 2},
 mean: 0.62981, std: 0.03947, params: {'learn

In [77]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.639751552795031
{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'random_state': 2}
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=2, subsample=1.0, verbose=0,
              warm_start=False)


In [78]:
print(classification_report(y_test, grid.predict(X_test)))

             precision    recall  f1-score   support

          0       0.57      0.69      0.62       181
          1       0.56      0.43      0.48       164

avg / total       0.56      0.57      0.56       345



In [79]:
grid.score(X_test, y_test)

0.5652173913043478