Import the libraries we need and list the available input files.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show which files are available in the input directory.
input_entries = os.listdir("../input")
print(input_entries)


A quick look at the dataset: it is already clean!

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the training CSV into a DataFrame and preview its first three rows.
train_csv_path = '../input/train.csv'
data = pd.read_csv(train_csv_path)
data.head(n=3)

**Data visualisation**


* Check the composition of the dataset
* Overview of the different labels

In [None]:
# Feature columns passed to the models.
predictors = [
    'profile pic',
    'nums/length username',
    'fullname words',
    'nums/length fullname',
    'name==username',
    'description length',
    'external URL',
    'private',
    '#posts',
    '#followers',
    '#follows',
]

# Replace every text (object-dtype) predictor with its integer category codes;
# columns of any other dtype are left untouched. Note this overwrites the
# columns of `data` in place.
for col in predictors:
    if data[col].dtype != 'object':
        continue
    data[col] = pd.Categorical(data[col]).codes

**Splitting data**

Divide the data into one set to train the model and another to test it later.

As a baseline, we first fit a logistic regression model, to justify moving on to a Random Forest.


In [None]:
# Split the data into a training set (70%) and a testing set (30%).
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(data[predictors], data['fake'], test_size=0.3, random_state=1)

# Logistic regression baseline (a classifier, despite the name) to compare
# the Random Forest against.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=1)
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
print ('train accuracy =', train_score)
# Accuracy on the training rows alone is optimistic; also score the held-out
# split that the model has not seen.
print('test accuracy =', clf.score(X_test, y_test))

# 5-fold cross-validated accuracy over the full dataset, reported as the mean
# across folds.
from sklearn import model_selection
scores = model_selection.cross_val_score(clf, data[predictors], data['fake'], scoring='accuracy', cv=5)
print('cross validation accuracy =', scores.mean())

**Random Forest**

In [None]:
# API reference: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
# Fit a Random Forest on the training split.
from sklearn.ensemble import RandomForestClassifier
# n_estimators is left at the library default, which depends on the installed
# scikit-learn version (10 trees before 0.22, 100 from 0.22 onwards).
clf_rf = RandomForestClassifier(random_state=1)
clf_rf.fit(X_train,y_train)
print ('train accuracy =', clf_rf.score(X_train,y_train))

# 5-fold cross-validation over the full dataset. Report the mean across folds
# so the number is directly comparable with the other models' cross-validation
# accuracy; the per-fold values stay available in `scores_rf`.
from sklearn.model_selection import cross_val_score
scores_rf = cross_val_score(clf_rf, data[predictors], data['fake'], scoring='accuracy', cv=5)
print('cross validation accuracy =', scores_rf.mean())


.
.
.

**Feature Importance**

The first step in understanding the model is to identify which parameters or variables have a larger role in the model. 

In [None]:
# One row per predictor holding its importance in the first Random Forest
# (clf_rf), displayed from most to least important.
feat_imp = pd.DataFrame({'Importance': clf_rf.feature_importances_}, index=predictors)
feat_imp.sort_values(by='Importance', ascending=False)

Let's try with a GridSearch CV !

In [None]:
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid: 4 x 4 x 4 = 64 combinations, each scored by 5-fold
# cross-validated accuracy.
params = {
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [2, 4, 6, 8],
    'n_estimators': [10, 20, 30, 40],
}
clf_rf2 = RandomForestClassifier(random_state=1)
clf_gs = GridSearchCV(estimator=clf_rf2, param_grid=params, scoring='accuracy', cv=5)
# The search is run on the full dataset, not only on the training split.
clf_gs.fit(data[predictors], data['fake'])

In [None]:
# Best mean cross-validated accuracy found by the grid search, and the
# hyperparameters that achieved it.
print(clf_gs.best_score_)
print(clf_gs.best_params_)

# Build the tuned forest from the grid search's winning hyperparameters
# directly, instead of copying them by hand, so the model stays in sync with
# the search if the data or the grid changes.
clf_rf3 = RandomForestClassifier(random_state=1, **clf_gs.best_params_)
clf_rf3.fit(X_train, y_train)
print ('train accuracy =', clf_rf3.score(X_train, y_train))

# 5-fold cross-validated accuracy of the tuned forest over the full dataset.
scores_rf3 = model_selection.cross_val_score(clf_rf3, data[predictors], data['fake'], scoring='accuracy', cv=5)
print('cross validation accuracy =',scores_rf3.mean())

**Feature Importance**

The feature importances for the same dataset change when the model is trained with different hyperparameters!

In [None]:
# One row per predictor holding its importance in the tuned Random Forest
# (clf_rf3), displayed from most to least important.
feat_imp = pd.DataFrame({'Importance': clf_rf3.feature_importances_}, index=predictors)
feat_imp.sort_values(by='Importance', ascending=False)

**Partial Plots**

Understanding how the value of each feature influences the model's predictions.

In [None]:
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

# Compute the partial dependence of the tuned model (clf_rf3) on a single
# feature over the test split. The feature name is stated once so the
# computation and the plot cannot drift apart.
pdp_feature = '#follows'
pdp_follows = pdp.pdp_isolate(model=clf_rf3, dataset=X_test, model_features=predictors, feature=pdp_feature)

# plot it
pdp.pdp_plot(pdp_follows, pdp_feature)
plt.show()



In [None]:
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

# Compute the partial dependence of the tuned model (clf_rf3) on a single
# feature over the test split. The feature name is stated once so the
# computation and the plot cannot drift apart.
pdp_feature = 'fullname words'
pdp_fullname_words = pdp.pdp_isolate(model=clf_rf3, dataset=X_test, model_features=predictors, feature=pdp_feature)

# plot it
pdp.pdp_plot(pdp_fullname_words, pdp_feature)
plt.show()


**SHAP Values**

Understanding which features contribute most to the model's predictions, first across the whole test set and then for a single prediction.

In [None]:
import shap  # package used to calculate Shap values

# Build a tree-model SHAP explainer for the tuned Random Forest.
explainer = shap.TreeExplainer(clf_rf3)

# Compute SHAP values for every row of the test split rather than a single
# row, so the summary plot has more points to draw.
shap_values = explainer.shap_values(X_test)

# Summary plot of the SHAP output at index 1.
# NOTE(review): index 1 presumably selects the positive ('fake') class; confirm
# against the return shape of the installed shap version.
shap.summary_plot(shap_values[1], features=X_test)

Let's analyse a single prediction and see how each feature influences it.



In [None]:
# SHAP explanation of a single test-set prediction.
row_to_show = 5
data_for_prediction = X_test.iloc[row_to_show]  # use 1 row of data here. Could use multiple rows if desired
# Reshape the single row to (1, n_features), the 2-D layout predict_proba expects.
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)

# Print the predicted class probabilities for this row. The call is not the
# last expression of the cell, so without print() its result would be
# silently discarded.
print(clf_rf3.predict_proba(data_for_prediction_array))

import shap  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.TreeExplainer(clf_rf3)

# Calculate Shap values for the selected row only
shap_values = explainer.shap_values(data_for_prediction)

# Load the JavaScript needed to render the interactive force plot, then draw
# it using the base value and SHAP values at index 1.
# NOTE(review): index 1 presumably selects the positive ('fake') class; confirm
# against the return shape of the installed shap version.
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)