In [None]:
import os
# Step the working directory up one level; presumably so the relative
# 'data/...' paths and the `lib` package used below resolve — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves up one more directory each time.
os.chdir('../')

In [None]:
import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import re
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, silhouette_score
from sklearn.datasets import make_classification
from scipy.stats import randint, uniform
from xgboost.sklearn import XGBClassifier
import sklearn.neural_network as nn
from sklearn.cluster import KMeans
from lib.general_model import general_model
from lib.prediction_module import prediction_prob

In [None]:
# NOTE(review): this unpickles 'data/tweets_df.pkl', the same file that is read
# again further down with pd.read_pickle as `raw_data`; `transformer` is not
# referenced again in the visible notebook. Unpickling executes arbitrary code,
# so only load files from a trusted source.
from sklearn.externals import joblib
transformer = joblib.load('data/tweets_df.pkl')

In [None]:
# Display names keyed by integer class id (0-11). Used below to turn predicted
# and true labels into readable names.
# NOTE(review): assumes these ids line up with the LabelEncoder output — confirm.
twitionary = dict(enumerate([
    'Elon Musk',
    'Donald Trump',
    'Ellen Degeneres',
    'Kim Kardashian',
    'Noam Chomsky',
    'Michael Moore',
    'Oprah Winfrey',
    'Katy Perry',
    'Paulo Coelho',
    'Gina Rodriguez',
    'Russell Brand',
    'Barack Obama',
]))

Using Cleaned Data

In [None]:
# Load the pre-cleaned tweets; the cells below read its 'text' and 'user' columns.
cleaned_data = pd.read_pickle('data/cleaned_data.pkl')

In [None]:
# TF-IDF features for the cleaned tweets.
# A float max_df is a *proportion* of documents and must lie in [0.0, 1.0];
# the previous value 2.0 is out of range (rejected by recent scikit-learn) and
# in older releases simply meant "no upper cut-off", which is what 1.0 expresses.
tfid_vec = TfidfVectorizer(stop_words='english', decode_error='replace', use_idf=True, max_df=1.0)
c_tfid = tfid_vec.fit_transform(cleaned_data['text'])

In [None]:
# Reduce the cleaned TF-IDF matrix to 500 latent components.
# TruncatedSVD's default solver is randomized, so pin random_state (same value
# as the train/test split below) to make every downstream score repeatable.
c_svd = TruncatedSVD(n_components=500, n_iter=100, random_state=50)
c_transformed_X = pd.DataFrame(c_svd.fit_transform(c_tfid))
# about 5 minutes

In [None]:
# Cumulative explained variance of the cleaned-data SVD components.
# The x axis is derived from the fitted model instead of a hard-coded 500, so
# the plot stays correct if n_components is changed above.
c_cumulative_variance = np.cumsum(c_svd.explained_variance_ratio_)
x = range(len(c_cumulative_variance))
plt.plot(x, c_cumulative_variance, color='black')
plt.ylim(0, 0.5)

plt.xlabel('Cleaned SVD components')
plt.ylabel('Explained Variance')
plt.show()

Using Raw Data

In [None]:
# Load the uncleaned tweets; the cells below read its 'text' and 'user' columns.
raw_data = pd.read_pickle('data/tweets_df.pkl')

In [None]:
# TF-IDF features for the raw tweets (rebinds `tfid_vec` to a new vectorizer).
# A float max_df is a *proportion* of documents and must lie in [0.0, 1.0];
# the previous value 2.0 is out of range (rejected by recent scikit-learn) and
# in older releases simply meant "no upper cut-off", which is what 1.0 expresses.
tfid_vec = TfidfVectorizer(stop_words='english', decode_error='replace', use_idf=True, max_df=1.0)
tfid = tfid_vec.fit_transform(raw_data['text'])

In [None]:
# Reduce the raw TF-IDF matrix to 500 latent components.
# TruncatedSVD's default solver is randomized, so pin random_state (same value
# as the train/test split below) to make every downstream score repeatable.
svd = TruncatedSVD(n_components=500, n_iter=100, random_state=50)
transformed_X = pd.DataFrame(svd.fit_transform(tfid))
# about 5 minutes

In [None]:
# Cumulative explained variance of the raw-data SVD components.
# The x axis is derived from the fitted model instead of a hard-coded 500, so
# the plot stays correct if n_components is changed above.
cumulative_variance = np.cumsum(svd.explained_variance_ratio_)
x = range(len(cumulative_variance))
plt.plot(x, cumulative_variance, color='black')
plt.ylim(0, 0.5)

plt.xlabel('SVD components')
plt.ylabel('Explained Variance')
plt.show()

#### Raw Data

In [None]:
# Target = the 'user' column of the raw frame; features = SVD projection of the raw TF-IDF matrix.
y = raw_data['user']
X = transformed_X 

In [None]:
# Replace the 'user' values with integer class codes.
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Hold out 30% of the raw-data samples for testing; the fixed random_state keeps
# the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=50)

# Bundle the four pieces under string keys.
data_dict = dict(X_test=X_test,
                 X_train=X_train,
                 y_test=y_test,
                 y_train=y_train)

#### Cleaned Data

In [None]:
# Target = the 'user' column of the cleaned frame; features = SVD projection of the cleaned TF-IDF matrix.
c_y = cleaned_data['user']
c_X = c_transformed_X 

In [None]:
# Replace the 'user' values with integer class codes.
# Rebinds `le`: the encoder fitted on the raw labels is no longer reachable by this name.
le = LabelEncoder()
c_y = le.fit_transform(c_y)

In [None]:
# Hold out 30% of the cleaned-data samples for testing; the fixed random_state
# keeps the partition identical between runs.
c_X_train, c_X_test, c_y_train, c_y_test = train_test_split(
    c_X, c_y, test_size=.3, random_state=50)

# Bundle the four pieces under string keys. This rebinds `data_dict`, replacing
# the raw-data bundle built earlier.
data_dict = dict(X_test=c_X_test,
                 X_train=c_X_train,
                 y_test=c_y_test,
                 y_train=c_y_train)

In [None]:
# kmeans = KMeans(n_clusters=12)
# kmeans.fit(X)

In [None]:
# labels = kmeans.labels_
# centroids = kmeans.cluster_centers_

In [None]:
# silhouette_score(X, labels, metric='euclidean')
# print(classification_report(y, predicted_y))
# print(confusion_matrix(y, predicted_y))

In [None]:
# def k_means_fit_score(X, y, k):
#     print("k: {}".format(k))
#     kmeans = cluster.KMeans(n_clusters=k)
#     kmeans.fit(X)
    
#     labels = kmeans.labels_
#     centroids = kmeans.cluster_centers_
#     predicted_y = np.choose(labels, range(1,k+1))
#     acc = metrics.accuracy_score(y, predicted_y)
#     sil = metrics.silhouette_score(X, labels, metric='euclidean')
#     print("---")
#     print("accuracy: {}".format(acc))
#     print("silhouette score: {}".format(sil))
#     print("classfication report")
#     print(metrics.classification_report(y, predicted_y))
#     print("confusion matrix")
#     print(metrics.confusion_matrix(y, predicted_y))
#     print("\n")
    
#     return acc, sil

#### Logistic Regression

In [None]:
# L2-regularised logistic regression on the raw-data features; report hold-out accuracy.
log_reg = LogisticRegression(penalty='l2', C=1).fit(X_train, y_train)
accuracy = log_reg.score(X_test, y_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# L2-regularised logistic regression on the cleaned-data features; report hold-out accuracy.
c_log_reg = LogisticRegression(penalty='l2', C=1).fit(c_X_train, c_y_train)
c_accuracy = c_log_reg.score(c_X_test, c_y_test)
print("Accuracy: %.2f%%" % (100.0 * c_accuracy))

In [None]:
# Inspect one held-out tweet with the raw-data logistic regression.
# `pred_prob` now holds the per-class probability vector (it used to hold only
# the argmax index, which made the pie-chart cell that follows unusable); the
# winning class is looked up through `classes_` rather than assuming that the
# column position equals the label.
sample_idx = 2050
pred_prob = log_reg.predict_proba(X_test[sample_idx:sample_idx + 1])[0]
predicted_label = log_reg.classes_[pred_prob.argmax()]
# Single-argument print(...) calls behave the same on Python 2 and 3 and match
# the accuracy cells elsewhere in the notebook.
print("The predicted twitter user is:")
print(twitionary[predicted_label])
print('')
print("The real twitter user is:")
print(twitionary[y_test[sample_idx]])

In [None]:
# Pie chart of the logistic-regression class probabilities for test row 2050.
# The probabilities are recomputed here so the cell does not depend on what an
# earlier cell left in `pred_prob` (previously a scalar index, which plt.pie
# cannot draw).
sample_idx = 2050
pred_prob = log_reg.predict_proba(X_test[sample_idx:sample_idx + 1])[0]
# One wedge label per class, in the same order as the probability columns.
labels = [twitionary[label] for label in log_reg.classes_]
plt.pie(pred_prob, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
plt.show()
# True label of the charted row (was row 2000, which did not match the sample above).
actual = y_test[sample_idx]
print("Probability: %.2f%%" % (pred_prob.max() * 100.0))
print(pred_prob.argmax())

In [None]:
# X = raw_data.text
# def clean_text(X):
#     all_words = [rb_words, gr_words, dt_words, bo_words, mm_words, ow_words, em_words, kk_words, kp_words, nc_worwords = X.replace('\(', '').replace('\)', '').replace('@', '').replace('https://.?', '')
# X = re.sub(r"http\S+", "", X)
# X = re.sub("['\''%'':''#'')''('';''\n''/''.''!''*''-''+''$'',''?''~''&''@''``''-''=''--''|''<''>']", "", X)
# stop_words = set(stopwords.words('english'))
# word_tokens = word_tokenize(X.lower())
# filtered_text = [w for w in X if not w in stop_words]

#### Decision Trees

In [None]:
# Unpruned Gini decision tree on the raw-data features; report hold-out accuracy.
dec_tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    min_samples_split=3
).fit(X_train, y_train)
accuracy = dec_tree.score(X_test, y_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# Unpruned Gini decision tree on the cleaned-data features; report hold-out accuracy.
c_dec_tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    min_samples_split=3
).fit(c_X_train, c_y_train)
c_accuracy = c_dec_tree.score(c_X_test, c_y_test)
print("Accuracy: %.2f%%" % (100.0 * c_accuracy))

#### Random Forest

In [None]:
# 50-tree random forest on the raw-data features; report hold-out accuracy.
rand_for = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=3,
    criterion='gini'
).fit(X_train, y_train)
accuracy = rand_for.score(X_test, y_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# 50-tree random forest on the cleaned-data features; report hold-out accuracy.
c_rand_for = RandomForestClassifier(n_estimators=50, min_samples_split=3, criterion='gini')
c_rand_for.fit(c_X_train, c_y_train)
# Score the model that was just fitted on the cleaned data. The previous code
# scored `rand_for` (the raw-data forest) against the cleaned test set, so the
# reported number did not describe `c_rand_for` at all.
c_accuracy = c_rand_for.score(c_X_test, c_y_test)
print("Accuracy: %.2f%%" % (c_accuracy * 100.0))

#### K Neighbors 

In [None]:
# Distance-weighted 5-nearest-neighbours (brute-force search) on the raw-data
# features; report hold-out accuracy.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='brute'
).fit(X_train, y_train)
accuracy = knn.score(X_test, y_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# Distance-weighted 5-nearest-neighbours (brute-force search) on the
# cleaned-data features; report hold-out accuracy.
c_knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='brute'
).fit(c_X_train, c_y_train)
c_accuracy = c_knn.score(c_X_test, c_y_test)
print("Accuracy: %.2f%%" % (100.0 * c_accuracy))

#### Extra Trees

In [None]:
# 50-tree extra-trees ensemble (entropy criterion) on the raw-data features;
# report hold-out accuracy.
ext_tree = ExtraTreesClassifier(n_estimators=50, criterion='entropy').fit(X_train, y_train)
accuracy = ext_tree.score(X_test, y_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# 50-tree extra-trees ensemble (entropy criterion) on the cleaned-data features;
# report hold-out accuracy.
c_ext_tree = ExtraTreesClassifier(n_estimators=50, criterion='entropy').fit(c_X_train, c_y_train)
c_accuracy = c_ext_tree.score(c_X_test, c_y_test)
print("Accuracy: %.2f%%" % (100.0 * c_accuracy))

#### Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings on the raw-data features; report
# hold-out accuracy.
gnb = GaussianNB().fit(X_train, y_train)
accuracy = gnb.score(X_test, y_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# Gaussian naive Bayes with default settings on the cleaned-data features;
# report hold-out accuracy.
c_gnb = GaussianNB().fit(c_X_train, c_y_train)
c_accuracy = c_gnb.score(c_X_test, c_y_test)
print("Accuracy: %.2f%%" % (100.0 * c_accuracy))

#### Gradient Boosting

In [None]:
# Gradient boosting over depth-1 trees (stumps) on the raw-data features; report
# hold-out accuracy.
gbc = GradientBoostingClassifier(max_depth=1, warm_start=True).fit(X_train, y_train)
accuracy = gbc.score(X_test, y_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# Gradient boosting over depth-1 trees (stumps) on the cleaned-data features;
# report hold-out accuracy.
c_gbc = GradientBoostingClassifier(max_depth=1, warm_start=True).fit(c_X_train, c_y_train)
c_accuracy = c_gbc.score(c_X_test, c_y_test)
print("Accuracy: %.2f%%" % (100.0 * c_accuracy))

#### Neural Network 

In [None]:
# Single-hidden-layer (100 logistic units) MLP on the raw-data features.
mlpc = nn.MLPClassifier(verbose=2, max_iter=2000, activation='logistic', hidden_layer_sizes=(100,))
mlpc.fit(X_train, y_train)

# Hold-out accuracy = fraction of test rows whose prediction equals the true
# label. accuracy_score computes exactly what the former hand-written
# zip/append/sum loop did.
pred = mlpc.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Single-hidden-layer (100 logistic units) MLP on the cleaned-data features.
c_mlpc = nn.MLPClassifier(verbose=2, max_iter=2000, activation='logistic', hidden_layer_sizes=(100,))
# Fit and predict with `c_mlpc` itself. The previous code called `mlpc.fit` /
# `mlpc.predict`, which left `c_mlpc` unfitted and silently re-trained the
# raw-data network on the cleaned data.
c_mlpc.fit(c_X_train, c_y_train)

# Hold-out accuracy = fraction of test rows whose prediction equals the true
# label (replaces the hand-written zip/append/sum loop).
c_pred = c_mlpc.predict(c_X_test)
c_accuracy = accuracy_score(c_y_test, c_pred)
print("Accuracy: %.2f%%" % (c_accuracy * 100.0))



In [None]:
# Inspect one held-out tweet with the MLP.
# `pred_prob` holds the per-class probability vector; the winning class is
# looked up through `classes_` rather than assuming that the column position
# equals the label.
sample_idx = 3008
pred_prob = mlpc.predict_proba(X_test[sample_idx:sample_idx + 1])[0]
predicted_label = mlpc.classes_[pred_prob.argmax()]
# Single-argument print(...) calls behave the same on Python 2 and 3 and match
# the accuracy cells elsewhere in the notebook.
print("The predicted twitter user is:")
print(twitionary[predicted_label])
print('')
print("The real twitter user is:")
print(twitionary[y_test[sample_idx]])

In [None]:
# NOTE(review): `train_df` is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh Restart & Run All unless it is created
# elsewhere — confirm, or drop the cell.
# Adds a column holding len() of each 'features' entry, then counts how often
# each length occurs.
train_df["num_features"] = train_df["features"].apply(len)
cnt_srs = train_df['num_features'].value_counts()

# Bar chart: one bar per distinct length, height = number of rows with that length.
plt.figure(figsize=(12,6))
sns.barplot(cnt_srs.index, cnt_srs.values, alpha=0.8)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Number of features', fontsize=12)
plt.show()

In [None]:
# Keyword arguments for the baseline XGBClassifier fitted in the next cell.
# The target has one class per entry in `twitionary` (12), so the objective is
# declared as multi-class. 'binary:logistic' was misleading: the scikit-learn
# wrapper replaces it with 'multi:softprob' whenever it sees more than two classes.
params = {
    'objective': 'multi:softprob',
    'max_depth': 2,
    'learning_rate': 1.0,
    'silent': 1.0,
    'n_estimators': 5
}

In [None]:
# Baseline XGBoost classifier built from `params` and fitted on the cleaned-data training split.
bst = XGBClassifier(**params).fit(c_X_train, c_y_train)

In [None]:
# Predicted class labels for the cleaned-data test split; the bare name displays them.
c_preds = bst.predict(c_X_test)
c_preds

In [None]:
# Count the correct predictions on the cleaned-data test split.
# `c_preds` was produced from `c_X_test`, so it must be compared with
# `c_y_test`; the previous loop compared against `y_test` (the raw-data split),
# which could make the count disagree with the error rate printed below.
correct = int(np.sum(np.asarray(c_y_test) == np.asarray(c_preds)))

acc = accuracy_score(c_y_test, c_preds)

print('Predicted correctly: {0}/{1}'.format(correct, len(c_preds)))
print('Error: {0:.4f}'.format(1-acc))

In [None]:
# Seed shared by the cross-validation splitter and the searches below; also seeds NumPy's global RNG.
seed = 342
np.random.seed(seed)

In [None]:
# Shuffled, stratified K-fold splitter seeded with `seed`; the fold count is left at the library default.
cv = StratifiedKFold(shuffle=True, random_state=seed)

In [None]:
# Exhaustive search space for GridSearchCV: 3 depths x 4 ensemble sizes x
# 3 evenly spaced learning rates between 1e-16 and 1.
params_grid = dict(
    max_depth=[1, 2, 3],
    n_estimators=[5, 10, 25, 50],
    learning_rate=np.linspace(1e-16, 1, 3)
)

In [None]:
# XGBoost settings held constant across both hyper-parameter searches.
# The target has one class per entry in `twitionary` (12), so the objective is
# declared as multi-class. 'binary:logistic' was misleading: the scikit-learn
# wrapper replaces it with 'multi:softprob' whenever it sees more than two classes.
params_fixed = {
    'objective': 'multi:softprob',
    'silent': 0
}

In [None]:
# Exhaustive search over `params_grid`, scored by accuracy on the `cv` folds.
# `params_fixed` must be unpacked with **: passed positionally, the whole dict
# was bound to XGBClassifier's first parameter instead of setting
# 'objective' / 'silent'.
bst_grid = GridSearchCV(
    estimator=XGBClassifier(seed=seed, **params_fixed),
    param_grid=params_grid,
    cv=cv,
    scoring='accuracy'
)

In [None]:
# Run the grid search on the raw-data training split.
bst_grid.fit(X_train, y_train)
# `grid_scores_` was removed from scikit-learn; `cv_results_` (available in
# every release that ships sklearn.model_selection) carries the per-candidate
# scores. It is wrapped in a DataFrame for a readable display.
pd.DataFrame(bst_grid.cv_results_)

In [None]:
# Report the winning cross-validated accuracy, then each tuned parameter on its
# own tab-indented line.
best_params = bst_grid.best_params_
print("Best accuracy obtained: {0}".format(bst_grid.best_score_))
print("Parameters:")
for name in best_params:
    print("\t{}: {}".format(name, best_params[name]))

In [None]:
# Sampling space for RandomizedSearchCV: lists are sampled uniformly, scipy
# distributions are drawn from via their rvs().
# NOTE(review): scipy's uniform() with no arguments covers [0, 1], so
# 'subsample' / 'colsample_bytree' / 'learning_rate' can be drawn very close
# to 0 — confirm that is intended.
params_dist_grid = {
    'max_depth': [1, 2, 3, 4],
    'gamma': [0, 0.5, 1],
    'n_estimators': randint(1, 1001), # discrete uniform over the integers 1..1000
    'learning_rate': uniform(), # continuous uniform on [0, 1] (not gaussian)
    'subsample': uniform(), # continuous uniform on [0, 1] (not gaussian)
    'colsample_bytree': uniform() # continuous uniform on [0, 1] (not gaussian)
}

In [None]:
# Randomized search: 10 draws from `params_dist_grid`, scored by accuracy on the
# `cv` folds, seeded for repeatable sampling.
# `params_fixed` must be unpacked with **: passed positionally, the whole dict
# was bound to XGBClassifier's first parameter instead of setting
# 'objective' / 'silent'.
rs_grid = RandomizedSearchCV(
    estimator=XGBClassifier(seed=seed, **params_fixed),
    param_distributions=params_dist_grid,
    n_iter=10,
    cv=cv,
    scoring='accuracy',
    random_state=seed
)

In [None]:
# Run the randomized search.
# NOTE(review): this fits on the full raw-data X / y, whereas the grid search
# above is fitted on X_train / y_train only, so the two best_score_ values are
# not computed on the same data — confirm which is intended.
rs_grid.fit(X, y)

In [None]:
# `grid_scores_` was removed from scikit-learn; `cv_results_` (available in
# every release that ships sklearn.model_selection) carries the per-candidate
# scores. It is wrapped in a DataFrame for a readable display.
pd.DataFrame(rs_grid.cv_results_)

In [None]:
# Display the estimator refitted with the best sampled parameter set.
rs_grid.best_estimator_

In [None]:
# Display the best mean cross-validated accuracy found by the randomized search.
rs_grid.best_score_