## Twitter Bot Detection Model Analysis

**Summer 2018**<br>
**Contributers:** Karan Bhandarkar and Vivek Mishra


In [2]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import json
import numpy as np # imports a fast numerical programming library
import scipy as sp #imports stats functions, amongst other things
import matplotlib as mpl # this actually imports matplotlib
import matplotlib.cm as cm #allows us easy access to colormaps
import matplotlib.pyplot as plt #sets up plotting under plt
import pandas as pd #lets us handle data as dataframes
#sets up pandas table display
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
from bs4 import BeautifulSoup
import re
#import unidecode
from pandas.plotting import scatter_matrix
from wordcloud import WordCloud, STOPWORDS

In [3]:
tweets_with_accounts = pd.read_csv("tweets_with_accounts.csv", index_col=0)

In [4]:
tweets_with_accounts.dtypes

screen_name          object
popularity_ratio    float64
listed_count          int64
verified              int64
tweets_per_day      float64
default_profile       int64
name                 object
full_text            object
lang                 object
isBot               float64
dtype: object

In [5]:
tweets_with_accounts.head()

Unnamed: 0,screen_name,popularity_ratio,listed_count,verified,tweets_per_day,default_profile,name,full_text,lang,isBot
0,DearAssistant,0.000213,146,0,2.682644,0,DearAssistant,2159.3 miles RT @Hi_5040 whats the lenght of t...,en,1.0
1,SanJacintoClan,1.988737,19,0,14.762141,0,SanJacintoClan,Sean Rima: The Rape of The Alamo. https://t.co...,en,1.0
2,SanJacintoClan,1.988737,19,0,14.762141,0,SanJacintoClan,https://t.co/P28ddc7a4C,und,1.0
3,SanJacintoClan,1.988737,19,0,14.762141,0,SanJacintoClan,Martinez is the cousin to the wife of the gove...,en,1.0
4,SanJacintoClan,1.988737,19,0,14.762141,0,SanJacintoClan,"Charlotte Rae, 'The Facts of Life' and 'Diff'r...",en,1.0


In [5]:
## Splitting the data into training and test data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Removing NaN indexes from dataframe
tweets_with_accounts = tweets_with_accounts[tweets_with_accounts.listed_count.notnull()]
tweets_with_accounts = tweets_with_accounts[tweets_with_accounts.popularity_ratio.notnull()]
tweets_with_accounts = tweets_with_accounts[tweets_with_accounts.verified.notnull()]
tweets_with_accounts = tweets_with_accounts[tweets_with_accounts.tweets_per_day.notnull()]
tweets_with_accounts = tweets_with_accounts[tweets_with_accounts.default_profile.notnull()]
tweets_with_accounts = tweets_with_accounts[tweets_with_accounts.isBot.notnull()]

training_data, test_tune_data = train_test_split(tweets_with_accounts, test_size=.4)

tune_data, test_data = train_test_split(test_tune_data, test_size=.5)

print(" Training data size is {} and test data size is {} and tune data size is {}".
      format(len(training_data), len(test_data), len(tune_data)))



 Training data size is 241836 and test data size is 80612 and tune data size is 80612


In [6]:
training_data.head()

Unnamed: 0.1,Unnamed: 0,screen_name,popularity_ratio,listed_count,verified,tweets_per_day,default_profile,name,full_text,lang,isBot
332772,332772,CitronResearch,0.001531,1850,0,0.132635,0,CitronResearch,$VRX this is not a pharma or bio problem- this...,en,0.0
197011,197011,365GettinIt,1.397004,2,0,20.574038,1,365GettinIt,Trap jumpin Blake Griffin fuck ur Michael Jordan,en,1.0
279212,279212,pfdbot,0.0,1,0,76.58631,1,pfdbot,"Alaskans own 8,073 shares of KOLON INDUSTRIES ...",en,1.0
12546,12546,netflix_bot,0.000149,0,0,5.787074,0,netflix_bot,The Horn/Season 1 (2016) TV-14 [Season] is now...,en,1.0
32282,32282,RyanDetrick,0.016529,1396,0,10.804857,0,RyanDetrick,Talking yield curve inversions with @LaMonicaB...,en,0.0


### Feature Selection and standardizing the predictors

In [7]:
num_features =  ["listed_count", "tweets_per_day"]

scaler = StandardScaler()
for feat in num_features:
    training_data[feat] = scaler.fit_transform(training_data[feat].values.reshape(-1,1))
    test_data[feat] = scaler.fit_transform(test_data[feat].values.reshape(-1,1))


### Building predictors dataset(X_train, X_test) and response dataset(y_train, y_test)

In [8]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,screen_name,popularity_ratio,listed_count,verified,tweets_per_day,default_profile,name,full_text,lang,isBot
299369,299369,jfahmy,0.030469,-0.105587,0,-0.568532,1,jfahmy,This gives a whole new meaning to getting scre...,en,0.0
230712,230712,3roOpal5aja,0.060241,-0.152325,0,-0.227612,0,3roOpal5aja,رسالة سعادة :\r\nسلم أمرك لله وإبتسم وإطمئن وا...,ar,1.0
68284,68284,reverseocr,0.000437,-0.142868,0,-0.508329,1,reverseocr,kinship https://t.co/oFTgFbly17,en,1.0
57371,57371,CATargetBot,0.000699,-0.150568,0,0.098793,1,CATargetBot,NEW FPPC F460\r\n#SD10 Bob Wieckowski (D)\r\nh...,de,1.0
36593,36593,vexmark,0.01869,-0.125629,0,-0.033539,0,vexmark,"Sure, neo-malthusian arguments of global popul...",en,0.0


In [9]:
X_train = training_data[["popularity_ratio", "listed_count", "verified","tweets_per_day","default_profile"]]
y_train = training_data[["isBot"]]

X_test = test_data[["popularity_ratio", "listed_count", "verified","tweets_per_day","default_profile"]]
y_test = test_data[["isBot"]]

X_tune = tune_data[["popularity_ratio", "listed_count", "verified","tweets_per_day","default_profile"]]
y_tune = tune_data[["isBot"]]

In [10]:
X_train.head()

Unnamed: 0,popularity_ratio,listed_count,verified,tweets_per_day,default_profile
332772,0.001531,-0.077761,0,-0.566839,0
197011,1.397004,-0.153936,0,-0.174661,1
279212,0.0,-0.153977,0,0.89996,1
12546,0.000149,-0.154018,0,-0.458356,0
32282,0.016529,-0.096475,0,-0.362087,0


In [11]:
X_test.head()

Unnamed: 0,popularity_ratio,listed_count,verified,tweets_per_day,default_profile
299369,0.030469,-0.105587,0,-0.568532,1
230712,0.060241,-0.152325,0,-0.227612,0
68284,0.000437,-0.142868,0,-0.508329,1
57371,0.000699,-0.150568,0,0.098793,1
36593,0.01869,-0.125629,0,-0.033539,0


In [12]:
print("X_train shape is {}, and y_train shape is {}".format(X_train.shape, y_train.shape))

X_train shape is (241836, 5), and y_train shape is (241836, 1)


###  Our Base Line Model is  a simple Logistic Regression Model

In [13]:
from sklearn.linear_model import LogisticRegression

model_collection = {}

logit_model = LogisticRegression().fit(X_train, y_train)

logistic_model_score = logit_model.score(X_test, y_test)

model_collection["simple_logistic"] = logit_model

print("Train set score: {0:4.4}%".format(logit_model.score(X_train, y_train)*100))
print("Test set score: {0:4.4}%".format(logit_model.score(X_test, y_test)*100))


Train set score: 89.52%
Test set score: 89.62%


**Our Test score is 91% which is not bad for a Base Model considering our worst chance is 50% **

### Next Model is LogisticRegression with Cross Validation

In [14]:
from sklearn.linear_model import LogisticRegressionCV
logic_model_cv = LogisticRegressionCV(Cs=[1,10,100,1000,10000], cv=3, penalty='l2', 
                                       solver='newton-cg').fit(X_train,y_train)

model_collection["simple_logistic_CV"] = logic_model_cv
print("Test set score with Cross Validation: {0:4.4}%".format(logic_model_cv.score(X_test, y_test)*100))


Test set score with Cross Validation: 89.62%


** Cross validation with Regularization is giving us ~ 85 % accuracy which is an improvement from the base model, and We would like to see how we can improve this further. **

### Adding Polynomial of 3 degrees and using Logistic Regression 

In [None]:

from sklearn.preprocessing import PolynomialFeatures


X_train_poly = PolynomialFeatures(degree=3, include_bias=False).fit_transform(X_train)
X_test_poly = PolynomialFeatures(degree=3, include_bias=False).fit_transform(X_test)

logic_model_poly_cv = LogisticRegressionCV(Cs=[1,10,100,1000,10000], cv=3, penalty='l2', 
                                       solver='newton-cg').fit(X_train_poly,y_train)

model_collection["poly_logistic_cv"] = logic_model_poly_cv
`


In [16]:
print("Test set score with Polynomial Features and with Cross Validation: {0:4.4}%".
      format(logic_model_cv.score(X_test, y_test)*100))

Test set score with Polynomial Features and with Cross Validation: 89.62%


** Test score is much better now with Polynomial degree of predictors, We will continue to look for improvements **
**with other different models that we learned in the class ** 

### Decision Tree Classifier

We are going to look at how Decision Tree Classifier fits into our data set, we are going to look at multiple depths and use cross validation score method of sklearn to find out which depth fits perfect and not overfit.(reference from HW6)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
fig, ax = plt.subplots(1,1, figsize=(15,6))
x = list(range(1,21))
means = []
stds = []
for cur_depth in range(1,21):
    decision_tree_model = DecisionTreeClassifier(max_depth = cur_depth)
    scores = cross_val_score(decision_tree_model, X_train, y_train, cv=5)
    means.append(np.mean(scores))
    stds.append(2*np.std(scores))
    
ax.errorbar(x, means, stds, marker='^', linestyle='dotted', label="Mean +-2(std)")
#ax.plot(x,means,'o',color="g")
ax.set_xticks(x)
ax.axvline(x=6)
ax.set_xlabel("Decision tree Depth", fontsize=15)
ax.set_ylabel("Performace", fontsize=15)
ax.set_title("Decision Tree Performance with Varying Depth", fontsize=20)
ax.legend()


**Looking at the above graph looks like depth 6 seems to be perfect for our data. So we are going to use that depth to build our model and try testing it on the test data set ** 

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

decision_tree_model = DecisionTreeClassifier(max_depth = 6).fit(X_train, y_train)

model_collection["decision_tree"] = decision_tree_model

In [18]:
score = decision_tree_model.score(X_test, y_test)

print("Decision Tree Classifier Model score for test data: {0:4.4}%".format(score*100))


Decision Tree Classifier Model score for test data: 97.08%


Our decision tree seems to be peforming very poorly compared to other models, 
There are mulitple strategies that we have learnt in the class, we can apply 
Boosting , Bagging and Prunning to improve the results of the decision tree

We are going to try **Boosting(AdaBoost) and Bagging ** and let's see if that improves the performance

## Random Forests Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=25, max_depth=6)
rf_model = rf.fit(X_train, y_train)
score = rf_model.score(X_test, y_test)

model_collection["random_forest"] = random_forest_model

print("Random Forest model score is ", score)

Random Forest model score is  1.0


### Boosting(AdaBoost Classifier)

In [None]:
print(X_train.shape)
print(y_train.shape)

In [None]:
from sklearn import ensemble
from sklearn import tree

fig, ax = plt.subplots(3,1, figsize=(15,15))
estimators_num = 400
x_labels=range(1,estimators_num+1)
#x_labels= list(range(1,5))
ada_scores_test = []
ada_scores_train = []
for depth in range(1,4):
    adaboost = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=depth), 
                                           n_estimators=estimators_num, learning_rate=.05)
    adamodel = adaboost.fit(X_train, y_train)
    
    ax[depth-1].plot(list(x_labels),list(adamodel.staged_score(X_train,y_train)),color="r",
                     label="Trainng score with depth {}".format(depth))
    ax[depth-1].plot(list(x_labels),list(adamodel.staged_score(X_test,y_test)),color="g", 
                     label="Test score with depth {}".format(depth))
    ax[depth-1].legend()
    
fig.suptitle("Ada Boost Model Score vs # of iterations", fontsize=20)
    
    


### Before We jump on to Neural Networks, 
### I would like to build an emsemble model with all the models that we have build so far and let's see how that changes our prediction

In [24]:


from sklearn.utils import resample
emsemble_train = pd.DataFrame()
emsemble_test = pd.DataFrame()
for model_name, model in model_collection.items():
    if(model_name=="poly_logistic_cv"):
        y_predict = model.predict(X_train_poly)
    elif():
         y_predict = model.predict(X_trai)
    
    emsemble_train[model_name] = y_predict
    
    if(model_name=="poly_logistic_cv"):
        y_predict_test = model.predict(X_test_poly)
    elif():
         y_predict_test = model.predict(X_test)
    
    emsemble_test[model_name] = y_predict_test
    


In [25]:
emsemble_test.head()

Unnamed: 0,simple_logistic,simple_logistic_CV,poly_logistic_cv,decision_tree,random_forest
0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,0.0,0.0,0.0


In [26]:
emsemble_train.head()

Unnamed: 0,simple_logistic,simple_logistic_CV,poly_logistic_cv,decision_tree,random_forest
0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0
3,0.0,0.0,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0


In [28]:
from sklearn.metrics import accuracy_score
meta_test_predictions = np.mean(emsemble_test,axis=1) >.5 #do we have more than 50% 1s?
accuracy_score(y_test, meta_test_predictions)
score =  accuracy_score(y_test, meta_test_predictions)
print("Decision Tree Classifier Model score for test data: {0:4.4}%".format(score*100))

Decision Tree Classifier Model score for test data: 98.45%


## Neural Networks 

In [32]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

model = Sequential([
    Dense(1000, input_shape=(5,), activation='relu'),
    Dropout(0.5),
    Dense(500, activation='relu'),
    Dropout(.5),
    Dense(100, activation='relu'),
    Dropout(.5),
    Dense(50, activation='relu'),
    Dropout(.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',optimizer='adam')



In [33]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 1000)              6000      
_________________________________________________________________
dropout_5 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 500)               500500    
_________________________________________________________________
dropout_6 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 100)               50100     
_________________________________________________________________
dropout_7 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 50)                5050      
__________

In [35]:
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split = .2)

Train on 193468 samples, validate on 48368 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1fe03606f98>

In [38]:
print(model.evaluate(X_test, y_test))

0.03686924080342081


**I was expecting much better result, this is horrible, I am sure it will improve when run for more epochs **

In [47]:
### Summary Report

models = ["Simple Logistic Regression", "Logistic Regression with CV", "Polynomial Logistic Regression",
          "Decision Tree Classifier", "Random Forest", "Ensembling", "Neural Network"]
models_score = ["89.62", "89.62","89.62","97.08","100","98.45", 0]
Models_performance_summary = pd.DataFrame({"Models" :models, "Performance Score": models_score}, columns=["Models", "Performance Score"])

Models_performance_summary

Unnamed: 0,Models,Performance Score
0,Simple Logistic Regression,89.62
1,Logistic Regression with CV,89.62
2,Polynomial Logistic Regression,89.62
3,Decision Tree Classifier,97.08
4,Random Forest,100.0
5,Ensembling,98.45
6,Neural Network,0.0
