In [1]:
import numpy as np
import pandas as pd

import xgboost

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import BaggingClassifier

from sklearn.externals import joblib

In [2]:
# https://github.com/bpb27/trump-tweet-archive/tree/master/data/realdonaldtrump
# data gleaned from above on 4/5/17

list_of_dfs = []

for year in range(2009,2018):
    df = pd.read_json('data/realdonaldtrump/%s.json' % year)
    list_of_dfs.append(df)

In [44]:
df = pd.concat(list_of_dfs, axis=0)

In [55]:
df['is_donald'] = df['source'].apply(lambda x: 1 if x == 'Twitter for Android' else 0)

In [56]:
df.is_donald.value_counts()

0    16185
1    14545
Name: source, dtype: int64

In [57]:
df.columns

Index(['created_at', 'favorite_count', 'id_str', 'in_reply_to_user_id_str',
       'is_retweet', 'retweet_count', 'source', 'text', 'is_donald'],
      dtype='object')

In [58]:
df1 = df[df['is_donald'] == 1].copy()
df2 = df[df['is_donald'] == 0].copy()

In [59]:
df1['is_donald'] = df1['text'].apply(lambda x: 0 if x[:2] == "\"@" else 1)

In [61]:
df = pd.concat([df1,df2])

In [64]:
df.shape

(30730, 9)

In [65]:
np.mean(df.is_donald)

0.15203384315001628

In [66]:
xgb = xgboost.XGBClassifier()

In [67]:
df.columns

Index(['created_at', 'favorite_count', 'id_str', 'in_reply_to_user_id_str',
       'is_retweet', 'retweet_count', 'source', 'text', 'is_donald'],
      dtype='object')

In [68]:
tfidf = TfidfVectorizer(stop_words='english', max_features = 2000, ngram_range=(1,2))

In [69]:
tfidf_vector = tfidf.fit_transform(df.text.values)

In [70]:
tfidf_vector

<30730x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 220916 stored elements in Compressed Sparse Row format>

In [71]:
tfidf_vector.todense()

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [72]:
x_train, x_test, y_train, y_test = train_test_split(tfidf_vector.todense(), df.is_donald.values)

In [73]:
def run_model(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)
    print ("Base model score: " + str(np.mean(y_test))[:6])
    print ("Training set score: " + str(model.score(x_train, y_train))[:6])
    print ("Test set score: " + str(model.score(x_test, y_test))[:6])
    predictions = model.predict(x_test)
    print ("\nConfusion Matrix:\n", pd.DataFrame(confusion_matrix(y_test, predictions), 
                                              columns=['predicted_0', 'predicted_1'], index=['is_0', 'is_1']))
    print ("\nClassification Report:\n",classification_report(y_test, predictions))

In [74]:
lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=200, min_samples_leaf=5)
et = ExtraTreesClassifier(min_samples_leaf=2)
gb = GradientBoostingClassifier(max_depth=3, min_samples_leaf=6)
ab = AdaBoostClassifier(n_estimators=10, base_estimator=lr)

In [75]:
model_names = ['xgb', 'lr', 'rf', 'et', 'gb', 'ab']

In [76]:
models = [lr, rf, et]

### xgb, gb, ab


for model in models:
    print ("Model: ", str(model).split("(")[0])
    print ("Hyperparameters: " + str(model)[len(str(model).split("(")[0])+1:-1])
    run_model(x_train, y_train, x_test, y_test, model)
    print ("\n")

Model:  XGBClassifier
Hyperparameters: base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1
Base model score: 0.1509
Training set score: 0.8665
Test set score: 0.8665

Confusion Matrix:
       predicted_0  predicted_1
is_0         6488           35
is_1          990          170

Classification Report:
              precision    recall  f1-score   support

          0       0.87      0.99      0.93      6523
          1       0.83      0.15      0.25      1160

avg / total       0.86      0.87      0.82      7683



Model:  LogisticRegression
Hyperparameters: C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=N

  'precision', 'predicted', average, warn_for)


In [None]:
fo = open('data/test_text.txt', 'r')

In [None]:
fo.name

In [None]:
test_list = fo.readlines()
test_list

In [None]:
final_list = []

for x in test_list:
    final_list.append(x.strip())
    
final_list
    

In [None]:
fo.close()

In [None]:
test_array = np.array(final_list)
test_array

In [None]:
test_array.shape

In [None]:
test_vector = tfidf.transform(test_array)

In [None]:
np.mean(xgb.predict(test_vector))

In [None]:
predictions = xgb.predict(test_vector)
predict_proba = xgb.predict_proba(test_vector)

for x, y in zip(test_array, predict_proba):
    if y[1] > 0.85:
        print(x, y[1], '\n')

In [82]:
estimators = list(zip(model_names, models))
estimators

[('xgb', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
         gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
         min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
         objective='binary:logistic', reg_alpha=0, reg_lambda=1,
         scale_pos_weight=1, seed=0, silent=True, subsample=1)),
 ('lr',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False)),
 ('rf',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=5,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
     

In [83]:
del estimators[0]
estimators


[('lr',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False)),
 ('rf',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=5,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
              verbose=0, warm_start=False)),
 ('et',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=2,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_

In [84]:
del estimators[3]
estimators


[('lr',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False)),
 ('rf',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=5,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
              verbose=0, warm_start=False)),
 ('et',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=2,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_

In [85]:
del estimators[3]
estimators

[('lr',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False)),
 ('rf',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=5,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
              verbose=0, warm_start=False)),
 ('et',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=2,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_

In [86]:
vc = VotingClassifier(estimators, voting='soft', n_jobs=-1)

In [87]:
run_model(x_train, y_train, x_test, y_test, vc)

Base model score: 0.1509
Training set score: 0.9080
Test set score: 0.8861

Confusion Matrix:
       predicted_0  predicted_1
is_0         6450           73
is_1          802          358

Classification Report:
              precision    recall  f1-score   support

          0       0.89      0.99      0.94      6523
          1       0.83      0.31      0.45      1160

avg / total       0.88      0.89      0.86      7683



In [None]:
vc.predict_proba(x_train[:10])

In [88]:
joblib.dump(lr, 'lr_model.pkl')

['lr_model.pkl']

In [89]:
joblib.dump(tfidf, 'tfidf.pkl')

['tfidf.pkl']