In [None]:
import numpy as np
import pandas as pd

# Allow pretty printing of pandas dataframes etc
from IPython.display import display

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report,\
precision_score, recall_score, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# Show more rows and columns by default.
# The fully qualified "display.*" keys are used on purpose: set_option treats its
# argument as a pattern, and the short forms 'max_rows' / 'max_columns' also match
# the "styler.render.*" options in recent pandas releases, which makes set_option
# raise OptionError ("Pattern matched multiple keys").
pd.options.display.max_seq_items = 500
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
news_df = pd.read_csv(filepath_or_buffer='OnlineNewsPopularity/OnlineNewsPopularity.csv')

In [None]:
# Preview the first five rows as a quick sanity check of the load.
news_df.head(n=5)

In [None]:
# Regressing on the raw share count is too hard, so turn the problem into binary
# classification: qcut with q=2 splits ' shares' into two quantile bins at the median,
# labelled 0 (lower half) and 1 (upper half).
# The labels must be exactly 0 and 1 so that sklearn's precision and recall metrics
# work on the binary output.
# (The column name ' shares' really does start with a space.)
news_df['share_bins'] = pd.qcut(x=news_df[' shares'], q=2, labels=[0, 1])

news_df.head()

In [None]:
# Build the feature frame X_df by removing the article URL and both forms of the
# target (the raw ' shares' count and the derived 'share_bins' label).
# NOTE(review): the dataset's published description reportedly also lists a
# timedelta column as non-predictive -- confirm whether it should be dropped as well.
X_df = news_df.drop(columns=['url', ' shares', 'share_bins'])
display(X_df.head())

# sklearn is fed a plain numpy array rather than the DataFrame
X = np.array(X_df)
display(X)

In [None]:
# The target vector y is the binned share label, converted to a numpy array for sklearn
y = np.array(news_df['share_bins'])
print(y)

In [None]:
# Print the first 10 feature rows next to their labels to check that X and y line up
for feature_row, label in zip(X[:10], y[:10]):
    print(feature_row, label)

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
# y_actual is just a second name for the held-out labels, used when scoring predictions
y_actual = y_test

In [None]:
# Train a decision tree on the training set.
# min_samples_leaf=20 requires every leaf to hold at least 20 training samples.
# random_state is fixed (matching the seeded train/test split) because the tree
# permutes candidate features at each split, so an unseeded fit can differ from
# run to run even on identical data.
dt_model = DecisionTreeClassifier(min_samples_leaf=20, random_state=0)
dt_model.fit(X_train, y_train)

In [None]:
# score() on a decision tree classifier reports accuracy.
# First line: accuracy on the training split; second line: accuracy on the held-out split.
print(dt_model.score(X_train, y_train),
      dt_model.score(X_test, y_actual),
      sep='\n')

In [None]:
# Train a random forest on the training set.
# max_features='sqrt' replaces the former 'auto' alias: for classifiers 'auto' meant
# 'sqrt', and the alias was deprecated and later removed from scikit-learn, where it
# now raises a parameter validation error.
# random_state is fixed (matching the seeded train/test split) so that the bootstrap
# and feature sampling, and therefore the scores below, are repeatable.
rf_model = RandomForestClassifier(max_features='sqrt',
                                  min_samples_leaf=20,
                                  n_estimators=10,
                                  random_state=0)
rf_model.fit(X_train, y_train)

In [None]:
# score() on a random forest classifier reports accuracy.
# First line: accuracy on the training split; second line: accuracy on the held-out split.
print(rf_model.score(X_train, y_train),
      rf_model.score(X_test, y_actual),
      sep='\n')

In [None]:
# Predict labels for the held-out set with the random forest and print the
# per-class precision / recall / F1 summary.
y_pred = rf_model.predict(X_test)
print(classification_report(y_true=y_actual, y_pred=y_pred))

In [None]:
# Exhaustive grid, kept for reference only: 3 * 3 * 5 = 45 parameter combinations,
# each of which is trained once per cross-validation fold. Pass this as param_grid
# below to run the full (slow) search.
full_param_grid = {
    'max_features': ['sqrt', 'log2', None],
    'n_estimators': [10, 100, 500],
    'min_samples_leaf': [1, 5, 10, 20, 40]
}

# Reduced grid that is actually searched here: only max_features varies.
param_grid = {
    'max_features': ['sqrt', 'log2'],
    'n_estimators': [500],
    'min_samples_leaf': [20]
}

# Base estimator for the search. Note that this rebinds rf_model, so the forest
# fitted in the earlier cell is no longer reachable under that name.
# random_state is fixed so that candidates are compared on equal footing and the
# search result is repeatable.
rf_model = RandomForestClassifier(random_state=0)

# The number of folds is pinned explicitly with cv=3 because the GridSearchCV default
# differs between scikit-learn releases (3-fold in older versions, 5-fold in newer ones).
# More folds would be nice but can be rather slow, as the model is retrained for each fold.
# refit=True retrains the best parameter combination on all of the data passed to fit()
# (here the training split), so grid_search_model can be used directly for predictions.
grid_search_model = GridSearchCV(estimator=rf_model,
                                 param_grid=param_grid,
                                 cv=3,
                                 refit=True)
grid_search_model.fit(X_train, y_train)
grid_search_model.best_estimator_

In [None]:
# Score the refit best estimator: training-split score on the first line,
# held-out score on the second.
print(grid_search_model.score(X_train, y_train),
      grid_search_model.score(X_test, y_actual),
      sep='\n')

# Per-class precision / recall / F1 on the held-out set
y_pred = grid_search_model.predict(X_test)
print(classification_report(y_true=y_actual, y_pred=y_pred))

In [None]:
# Look at how precision trades off against recall as the decision threshold on the
# predicted probability of class 1 is varied.
y_pred_probs = grid_search_model.predict_proba(X_test)
print("Some example probabilities:", y_pred_probs[:10])

# y_pred_probs has one column per class, in the order given by classes_. Only the
# probability of class 1 is needed, so find its column and slice it out directly
# (a single array slice instead of rebuilding the column element by element).
high_index = grid_search_model.best_estimator_.classes_.tolist().index(1)
y_high_probs = y_pred_probs[:, high_index]

print("Just the '1' probabilities:", y_high_probs[:10])

average_precision = average_precision_score(y_actual, y_high_probs)
# The thresholds are not used below, but are bound to a real name rather than `_`
# so that IPython's last-output variable `_` is not overwritten.
precisions, recalls, thresholds = precision_recall_curve(y_actual,
                                                         y_high_probs)

print("Average precision is", average_precision)
plt.plot(recalls, precisions)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision vs recall for class 1 (test set)")
plt.show()

In [None]:
# Rank the features by the importance the best forest assigned to them and show the
# ten most important. Each entry is a (column name, importance) pair; sorted()
# consumes the zip iterator and returns a list, ordered from most to least important.
features_importances = sorted(
    zip(X_df.columns, grid_search_model.best_estimator_.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
features_importances[:10]