In [3]:
import os
# Move the working directory up one level so the relative paths and the
# `lib` imports used by later cells resolve from there.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel moves up one more directory each time.
os.chdir('../')

In [4]:
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.neural_network as nn
import xgboost as xgb
from scipy.stats import randint, uniform
from sklearn.cluster import KMeans
from sklearn.datasets import make_classification
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, silhouette_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

# joblib is used to persist the fitted model at the end of the notebook.
# It is a standalone package with current scikit-learn; older releases only
# shipped a vendored copy under sklearn.externals.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from lib.general_model import general_model
from lib.prediction_module import prediction_prob

%matplotlib inline

In [5]:
# Archetype names keyed by consecutive integer ids 0-11 (list position == id).
# NOTE(review): presumably these ids line up with the label-encoded classes
# produced further down — confirm against the code that consumes this lookup.
twitionary = dict(enumerate([
    'The Magician',
    'The Ruler',
    'The Joker',
    'The Lover',
    'The Explorer',
    'The Rebel',
    'The Nurturer',
    'The Everyperson',
    'The Sage',
    'The Innocent',
    'The Creator',
    'The Hero',
]))

#### Vectorize and Transform Model

Import the pickled data and vectorize the text of each tweet for each user using a term frequency–inverse document frequency vectorizer (TfidfVectorizer). TruncatedSVD is then used to reduce the feature dimensionality to 500 components.

In [6]:
# Load the pickled tweets object; later cells read its 'text' and 'user' entries.
# The path is relative to the current working directory.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
raw_data = pd.read_pickle('data/pickles/tweets_df.pkl')

In [7]:
# Turn the tweet text into a sparse TF-IDF matrix (English stop words removed).
# A float max_df is a document-frequency *proportion* and must lie in
# [0.0, 1.0]. The previous value of 2.0 was out of range: older scikit-learn
# silently treated it as "no upper cut-off", newer releases reject it.
# 1.0 keeps every term, i.e. the same vocabulary, while staying valid.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so test-set term statistics leak into the training features.
tfid_vec = TfidfVectorizer(stop_words='english', decode_error='replace', use_idf=True, max_df=1.0)
tfid = tfid_vec.fit_transform(raw_data['text'])

In [None]:
# joblib.dump(tfid_vec, "tfid.pkl")
# from_pkl_cls = joblib.load("tfid.pkl")

In [None]:
# Project the sparse TF-IDF matrix onto 500 latent components.
# TruncatedSVD's default solver is randomized; pinning random_state makes the
# reduced features (and every score computed from them) reproducible.
svd = TruncatedSVD(n_components=500, n_iter=100, random_state=50)
transformed_X = pd.DataFrame(svd.fit_transform(tfid))

In [None]:
# joblib.dump(svd, "svd.pkl")
# from_pkl_cls = joblib.load("svd.pkl")

In [None]:
# Running total of the variance share captured by the first k SVD components.
cumulative_variance = np.cumsum(svd.explained_variance_ratio_)
# Size the x-axis from the fitted model instead of a hard-coded 500, so the
# plot does not break with a length mismatch if n_components is changed.
x = range(len(cumulative_variance))
plt.plot(x, cumulative_variance, color='black')
plt.ylim(0, 0.3)

plt.xlabel('SVD components')
plt.ylabel('Explained Variance')
plt.show()

#### Label Encode Target and Train Test Split

In [None]:
# Features are the SVD-reduced matrix; the target is the raw 'user' column.
X, y = transformed_X, raw_data['user']

In [None]:
# Encode the target labels as integer class ids.
# The encoder is fitted straight from raw_data['user'] rather than from `y`
# itself: with `y = le.fit_transform(y)` a second run of this cell would refit
# on the already-encoded integers and overwrite le.classes_ with numbers,
# losing the mapping back to the original labels.
le = LabelEncoder()
y = le.fit_transform(raw_data['user'])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=50,
)
# Bundle the four pieces of the split under their own names.
data_dict = dict(
    X_test=X_test,
    X_train=X_train,
    y_test=y_test,
    y_train=y_train,
)

#### Logistic Regression

In [2]:
# Baseline: L2-regularised logistic regression scored on the held-out split.
log_reg = LogisticRegression(penalty='l2', C=1)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
accuracy = log_reg.score(X_test, y_test)
# precision_score needs both the true and the predicted labels. The default
# average='binary' raises for a multiclass target, whereas 'weighted' works
# for any number of classes (per-class precision weighted by class support).
precision = precision_score(y_test, y_pred, average='weighted')
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))

NameError: name 'LogisticRegression' is not defined

#### Decision Trees

In [16]:
# Single decision tree with no depth limit.
# random_state pins the feature permutation used at each split (and therefore
# tie-breaking between equally good splits) so the score is reproducible.
dec_tree = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=3, random_state=50)
dec_tree.fit(X_train, y_train)
accuracy = dec_tree.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 54.49%


#### Random Forest

In [15]:
# Random forest of 50 trees.
# random_state pins the bootstrap samples and per-split feature subsets so
# the reported accuracy is reproducible between runs.
rand_for = RandomForestClassifier(n_estimators=50, min_samples_split=3, criterion='gini', random_state=50)
rand_for.fit(X_train, y_train)
accuracy = rand_for.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 68.27%


#### K Neighbors 

In [14]:
# 5-nearest-neighbour classifier: brute-force search, distance-weighted votes.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier(
    algorithm='brute',
    weights='distance',
    n_neighbors=5,
).fit(X_train, y_train)
accuracy = knn.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 62.09%


#### Extra Trees

In [18]:
# Extremely randomized trees (50 estimators, entropy criterion).
# Split thresholds are drawn at random, so random_state is pinned to make the
# reported accuracy reproducible between runs.
ext_tree = ExtraTreesClassifier(n_estimators=50, criterion='entropy', random_state=50)
ext_tree.fit(X_train, y_train)
accuracy = ext_tree.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 66.91%


#### Naive Bayes

In [17]:
# Gaussian naive Bayes with default settings, scored on the held-out split.
# fit() returns the estimator itself, so construction and fitting are chained.
gnb = GaussianNB().fit(X_train, y_train)
accuracy = gnb.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 44.23%


#### Multi-Layer Perceptron Neural Network 

In [None]:
# Multi-layer perceptron: one hidden layer of 100 logistic units.
# random_state pins weight initialisation and batch shuffling so the trained
# network and its score are reproducible between runs.
mlpc = nn.MLPClassifier(verbose=2, max_iter=2000, activation='logistic', hidden_layer_sizes=(100,), random_state=50)
mlpc.fit(X_train, y_train)

pred = mlpc.predict(X_test)
# accuracy_score yields the same fraction of exact matches that the previous
# zip-and-count loop computed by hand.
accuracy = accuracy_score(y_test, pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

### Determine best model based on accuracy scores 

In [None]:
def evaluate_model(model):
    """Fit ``model`` on the training split and report its test-set performance.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the confusion matrix followed by the classification
    report for the test-set predictions.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.

    Returns
    -------
    The value of ``accuracy_score(y_test, y_pred)`` for the fitted model.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    a = accuracy_score(y_test, y_pred)

    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)

    # Call form of print: identical output on Python 2 for a single argument,
    # whereas the statement form `print cm` is a SyntaxError on Python 3.
    print(cm)
    print(cr)

    return a

# Container for models; initialised empty here.
all_models = {}

In [None]:
# Accuracy (percent) recorded for each model.
# Stored as numbers rather than strings so they can be used directly as bar
# heights and compared or sorted numerically.
# NOTE(review): these values are entered by hand, not computed from the cells
# above, so they can drift from the latest run — confirm before relying on them.
model_scores = {'LogisticRegression' : 73,
                'DecisionTrees' : 55,
                'RandomForrest' : 68,
                'KNeighbors' : 62,
                'ExtraTrees': 66,
                'NaiveBayes' : 44,
                'Neural Network' : 75
               }

In [None]:
# Bar chart of the recorded score for each model.
# Build explicit lists: dict views are not sequences on Python 3, and taking
# labels and heights from the same key list keeps them aligned.
model_names = list(model_scores)
# Coerce to float so the heights are numeric even when the scores are stored
# as strings (strings are not usable as bar magnitudes).
score_values = [float(model_scores[name]) for name in model_names]
positions = range(len(model_names))
plt.bar(positions, score_values, align='center')
plt.xticks(positions, model_names, rotation=45)
plt.xlabel('model', fontsize=18)
plt.ylabel('score', fontsize=18)


plt.show()

### Pickle model for new tweets

In [None]:
# Persist the trained MLP so new tweets can be classified without retraining,
# then load it straight back to confirm the file round-trips.
# Requires `joblib` to be in scope (it was previously never imported, so this
# cell failed with a NameError).
joblib.dump(mlpc, "MLPC.pkl")
from_pkl_cls = joblib.load("MLPC.pkl")
# Sanity check: the reloaded model must reproduce the in-memory model's
# predictions on the held-out split.
assert (from_pkl_cls.predict(X_test) == mlpc.predict(X_test)).all(), "reloaded MLPC.pkl predicts differently from the in-memory model"