# Random Forest Classifier for Recently Completed Content


## Dependencies and Defaults

In [None]:
import config as cn
import pandas as pd
import numpy as np
import os
from IPython.display import HTML

In [None]:
# Project directories are taken from the local config module.
dir_home, dir_clean = cn.home_dir, cn.clean_dir
dir_processed, dir_raw = cn.processed_dir, cn.raw_dir

# Input file read by the loading cells below; it lives in the clean directory.
f_in = os.path.join(dir_clean, 'final_player_stats_test.csv')

In [None]:
# Read every column as text, keep one untouched frame as a backup, and
# work on a separate copy of it.
df_just_in_case = pd.read_csv(f_in, dtype='unicode')
df = df_just_in_case.copy()
df.info()

## Clean Player Data

In [None]:
# Load player data (every column is read as text) and drop exact duplicate rows.
df = pd.read_csv(f_in, dtype='unicode')
df = df.drop_duplicates()

# Index each row by a "<player>_<realm>" id.
df['id'] = df.player + '_' + df.realm
df = df.set_index('id')

# Drop any column whose name contains 'unnamed' (case-insensitive).
del_cols = [c for c in df.columns.values if 'unnamed' in c.lower()]
df = df.drop(del_cols, axis=1)

# Derive the engagement class when the file does not already provide one.
# The class is based on the leading integer of `time_since_login`
# (presumably a day count -- TODO confirm the unit):
#   <= 30 -> 1, <= 120 -> 2, <= 365 -> 3, otherwise -> 4.
# The value is parsed once for the whole column instead of up to four times
# per row inside an iterrows() loop, and the assignment is positional, so it
# does not depend on the ids being unique.
if 'engagement' not in df.columns.values:
    elapsed = df.time_since_login.str.split(' ').str[0].astype(int)
    df['engagement'] = np.select(
        [elapsed <= 30, elapsed <= 120, elapsed <= 365],
        [1.0, 2.0, 3.0],  # floats, matching the original NaN-initialised column
        default=4.0,
    )

display(HTML(df.head().to_html()))

In [None]:
# Show summary statistics for the cleaned player table.
df.describe()

## Random Forest Classifier with 5 years of data

In [None]:
# Pick only the months we're interested in: January 2015 through May 2020,
# as 'YYYY-MM' labels.
months = np.arange(1, 13)
years = [2015, 2016, 2017, 2018, 2019, 2020]
last_month_of_2020 = 5  # 2020 is only included up to May

# The labels are generated month-major (every year for January, then every
# year for February, ...); only the printed copy is sorted.  The previous
# loop also carried an unreachable `year < 2011` branch and relied on `month`
# changing from int to str part-way through the inner loop.
timepoints = [
    '{}-{:02d}'.format(year, int(month))
    for month in months
    for year in years
    if not (year == 2020 and month > last_month_of_2020)
]
print(sorted(timepoints))

In [None]:
# Keep only the selected monthly columns plus the engagement label.
# A new list is built instead of calling timepoints.append(): append()
# returns None (so keep_cols was never a usable list) and mutated timepoints
# in place, adding another 'engagement' entry every time the cell was re-run.
keep_cols = timepoints + ['engagement']
df_tree = df[keep_cols].copy()

# Drop rows with a missing value in any kept column, then turn the 'id'
# index back into a regular column.
df_tree = df_tree.dropna()
df_tree = df_tree.reset_index()
df_tree.describe()
display(HTML(df_tree.head().to_html()))

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

In [None]:
from sklearn.model_selection import ShuffleSplit

# Split the rows into a training set and a 25% test set.
# ShuffleSplit yields 10 splits, but only the final one was ever used: each
# loop iteration overwrote the previous train/test frames.  Take that last
# split directly so the resulting sets are unchanged while the nine discarded
# pairs of frames are never built.
rs = ShuffleSplit(n_splits=10, test_size=.25, random_state=17)
*_, (train_index, test_index) = rs.split(df_tree)

train_set = df_tree.iloc[train_index].set_index('id')
test_set = df_tree.iloc[test_index].set_index('id')

# Separate the engagement label from the feature columns.
y_train = train_set.engagement
X_train = train_set.drop('engagement', axis=1)
y_test = test_set.engagement
X_test = test_set.drop('engagement', axis=1)

print(y_train.head())

In [None]:
# Forest settings: 100 bootstrapped trees, all cores, out-of-bag scoring
# enabled, fixed seed.
forest_params = dict(
    n_estimators=100,
    n_jobs=-1,
    oob_score=True,
    bootstrap=True,
    random_state=17,
)
selected = RandomForestClassifier(**forest_params)
selected.fit(X_train, y_train)

In [None]:
# Rank the feature columns by importance, most important first.
# np.argsort sorts ascending, so the order is reversed here; without the
# reversal the first 25 entries printed below were the 25 LEAST important
# features.
importances = selected.feature_importances_
indices = np.argsort(importances)[::-1]
important_features = X_train.columns.values[indices]

# Print the rank (0 = most important) and column name of the top 25.
for i, v in enumerate(important_features[:25]):
    print(i, v)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heat map of the training columns (features and engagement).
# The columns loaded from the CSV hold text (the file is read with
# dtype='unicode'), so convert them to numbers explicitly; otherwise whether
# corr() includes, drops, or rejects them depends on the pandas version.
corr_matrix = train_set.apply(pd.to_numeric).corr()
sns.set()
fig, ax = plt.subplots(figsize=(9, 6), dpi=300)
corr_map = sns.heatmap(corr_matrix, annot=True, ax=ax, cmap="viridis")

In [None]:
# Predict the engagement class for every row of the held-out test features.
predictions = selected.predict(X_test)

In [None]:
# Put each prediction next to the true label for inspection.
# Work on an explicit copy: pd.DataFrame(X_test) does not copy DataFrame
# input by default, so adding columns through it risks leaking 'prediction'
# and 'actual' into X_test itself.
df_pred = X_test.copy()
df_pred['prediction'] = predictions
df_pred['actual'] = y_test  # aligned to df_pred by index label

# Show the first 100 rows.
display(HTML(df_pred.head(100).to_html()))

In [None]:
# Score of the fitted forest on its own training data.
print(selected.score(X_train,y_train))

In [None]:
# Out-of-bag score; available because the forest was built with oob_score = True.
print(selected.oob_score_)

In [None]:
# Sanity check: the number of test labels and the number of predictions.
print(len(y_test), len(predictions))

In [None]:
from sklearn import metrics

# Print the confusion matrix
cnf_matrix = metrics.confusion_matrix(y_test, predictions)
print(cnf_matrix)

# Print the precision and recall, among other metrics.
# The report gets its own name: assigning it back to `metrics` replaced the
# imported sklearn module with a string, so re-running this cell without
# re-importing failed.
report = metrics.classification_report(y_test, predictions, digits=3)
print(report)

In [None]:
# Draw the confusion matrix as an annotated heat map.
fig3, ax = plt.subplots(figsize=(8, 8))
# Pass the axes explicitly rather than relying on it being the current axes.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap='viridis', fmt='g',
            annot_kws={"size": 16}, ax=ax)
ax.set_xlabel("Predicted Value", fontsize=18)
ax.set_ylabel("Actual Value", fontsize=18)  # label typo ("Acutal") fixed
ax.tick_params(labelsize=14)
plt.tight_layout()
cnf_matrix

This performs pretty badly for the target audience I'm looking for (engagement class 2).

## Random Forest with 2 years of data

In [None]:
# Restrict the features to the 24 months from June 2018 through May 2020.
two_year_cols = [
    '2018-06', '2018-07', '2018-08', '2018-09',
    '2018-10', '2018-11', '2018-12', '2019-01',
    '2019-02', '2019-03', '2019-04', '2019-05',
    '2019-06', '2019-07', '2019-08', '2019-09',
    '2019-10', '2019-11', '2019-12', '2020-01',
    '2020-02', '2020-03', '2020-04', '2020-05',
]
df_tree = df[two_year_cols + ['engagement']].copy()

# Drop rows with a missing value in any kept column, then turn the 'id'
# index back into a regular column.
df_tree = df_tree.dropna()
df_tree = df_tree.reset_index()
df_tree.describe()

# Preview only the first rows, as the 5-year section does, instead of
# rendering the entire frame as HTML.
display(HTML(df_tree.head().to_html()))

In [None]:
from sklearn.model_selection import ShuffleSplit

# Split the rows into a training set and a 25% test set.
# ShuffleSplit yields 10 splits, but only the final one was ever used: each
# loop iteration overwrote the previous train/test frames.  Take that last
# split directly so the resulting sets are unchanged while the nine discarded
# pairs of frames are never built.
rs = ShuffleSplit(n_splits=10, test_size=.25, random_state=17)
*_, (train_index, test_index) = rs.split(df_tree)

train_set = df_tree.iloc[train_index].set_index('id')
test_set = df_tree.iloc[test_index].set_index('id')

# Separate the engagement label from the feature columns.
y_train = train_set.engagement
X_train = train_set.drop('engagement', axis=1)
y_test = test_set.engagement
X_test = test_set.drop('engagement', axis=1)

print(y_train.head())

In [None]:
# Forest settings: 200 bootstrapped trees, all cores, out-of-bag scoring
# enabled, fixed seed.
forest_params = dict(
    n_estimators=200,
    n_jobs=-1,
    oob_score=True,
    bootstrap=True,
    random_state=17,
)
selected = RandomForestClassifier(**forest_params)
selected.fit(X_train, y_train)

In [None]:
# Rank the feature columns by importance, most important first.
# np.argsort sorts ascending, so the order is reversed here; without the
# reversal the listing ran from least to most important.
importances = selected.feature_importances_
indices = np.argsort(importances)[::-1]
important_features = X_train.columns.values[indices]

# Print the rank (0 = most important) and column name of up to 25 features.
for i, v in enumerate(important_features[:25]):
    print(i, v)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heat map of the training columns (features and engagement).
# The columns loaded from the CSV hold text (the file is read with
# dtype='unicode'), so convert them to numbers explicitly; otherwise whether
# corr() includes, drops, or rejects them depends on the pandas version.
corr_matrix = train_set.apply(pd.to_numeric).corr()
sns.set()
fig, ax = plt.subplots(figsize=(9, 6), dpi=300)
corr_map = sns.heatmap(corr_matrix, annot=False, ax=ax, cmap="Purples")

In [None]:
# Predict the engagement class for every row of the held-out test features.
predictions = selected.predict(X_test)

In [None]:
# Score of the fitted forest on its own training data.
print(selected.score(X_train,y_train))

In [None]:
# Out-of-bag score; available because the forest was built with oob_score = True.
print(selected.oob_score_)

In [None]:
from sklearn import metrics

# Print the confusion matrix
cnf_matrix = metrics.confusion_matrix(y_test, predictions)
print(cnf_matrix)

# Print the precision and recall, among other metrics.
# The report gets its own name: assigning it back to `metrics` replaced the
# imported sklearn module with a string, so re-running this cell without
# re-importing failed.
report = metrics.classification_report(y_test, predictions, digits=3)
print(report)

In [None]:
# Draw the confusion matrix as an annotated heat map.
fig3, ax = plt.subplots(figsize=(8, 8))
# Pass the axes explicitly rather than relying on it being the current axes.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap='viridis', fmt='g',
            annot_kws={"size": 16}, ax=ax)
ax.set_xlabel("Predicted Value", fontsize=18)
ax.set_ylabel("Actual Value", fontsize=18)  # label typo ("Acutal") fixed
ax.tick_params(labelsize=14)
plt.tight_layout()
cnf_matrix