## Timeseries Explorer

Achievements now have completion dates for each user, along with a tally of how many achievements each player completed in every month from Jan 2010 to May 2020.

## Set up timeseries dataframe
Arrange the data so that rows = date, columns = achievements per month, index = player_realm

In [4]:
import os
import pandas as pd
import datetime as dt
import config as cn
from IPython.display import HTML
import numpy as np
import seaborn as sns
import custom_funcs as cf

# Load the per-player time-series CSV and keep only the columns used below.
file_in = os.path.join(cn.clean_dir, 'final_time_stats.csv')

# Columns that stay as text; every other kept column is converted to numbers.
text_cols = ['player', 'realm', 'last_login', 'time_since_login', 'status']
player_cols = ['player', 'realm', 'last_login', 'time_since_login', 'status','gear_score', '2020-05']
dfa_dates = cf.get_dates()
keep_cols = player_cols + dfa_dates[-74:]
keep_cols.append('engagement')

# Read every column as text so pandas does not guess dtypes chunk by chunk.
df = pd.read_csv(file_in, dtype = str)
extra_cols = [col for col in df.columns.values if 'unnamed' in col.lower() or  col not in keep_cols]
df = df.drop(extra_cols, axis = 1)

# The later cells call describe(), groupby().mean(), corr() and fit a model,
# all of which need real numbers rather than strings. Values that cannot be
# parsed become NaN instead of aborting the load.
numeric_cols = [col for col in df.columns if col not in text_cols]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors = 'coerce')
df = df.drop_duplicates()

# Untouched copy (still has the text columns) for later cells that need them.
df_original = df.copy()
display(HTML(df.head().to_html()))



TypeError: join() got an unexpected keyword argument 'dtype'

In [2]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242957 entries, 0 to 363669
Data columns (total 81 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   player            242956 non-null  object 
 1   realm             242957 non-null  object 
 2   gear_score        242940 non-null  float64
 3   last_login        242940 non-null  object 
 4   time_since_login  242957 non-null  object 
 5   2015-01           144838 non-null  float64
 6   2016-01           144838 non-null  float64
 7   2017-01           144838 non-null  float64
 8   2018-01           144838 non-null  float64
 9   2019-01           144838 non-null  float64
 10  2020-01           144838 non-null  float64
 11  2015-02           144838 non-null  float64
 12  2016-02           144838 non-null  float64
 13  2017-02           144838 non-null  float64
 14  2018-02           144838 non-null  float64
 15  2019-02           144838 non-null  float64
 16  2020-02           14

In [3]:
# Drop the text columns so only the numeric features remain, and save that frame.
df = df.drop(['player', 'realm', 'last_login', 'time_since_login', 'status'], axis = 1)
df.to_csv(os.path.join(cn.clean_dir,'stripped_final_time_stats.csv'))

# describe() scans every column of the frame, so compute it once and reuse the
# result for both the CSV export and the cell output.
descriptive_stats = df.describe()
descriptive_stats.to_csv(os.path.join(cn.clean_dir, 'eda', 'time_series_descriptive_stats.csv'))
descriptive_stats

Unnamed: 0,gear_score,2015-01,2016-01,2017-01,2018-01,2019-01,2020-01,2015-02,2016-02,2017-02,...,2017-11,2018-11,2019-11,2014-12,2015-12,2016-12,2017-12,2018-12,2019-12,engagement
count,242940.0,144838.0,144838.0,144838.0,144838.0,144838.0,144838.0,144838.0,144838.0,144838.0,...,144838.0,144838.0,144838.0,144838.0,144838.0,144838.0,144838.0,144838.0,144838.0,242957.0
mean,422.762538,6.974475,5.342969,6.785947,6.525649,8.827145,9.146909,5.315746,4.392204,5.343812,...,5.628488,7.517585,7.48789,9.990804,6.899764,6.001954,6.5064,8.552037,6.980226,0.380545
std,52.731695,13.800024,13.759982,13.816179,13.5214,14.406265,16.154893,11.170896,11.634628,12.63319,...,12.913664,13.403987,15.692877,17.119356,14.228129,13.017865,14.19111,13.318722,14.499861,0.629304
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,400.0,1.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0
50%,437.0,2.0,2.0,2.0,3.0,4.0,4.0,2.0,1.0,1.0,...,2.0,4.0,3.0,4.0,3.0,2.0,2.0,4.0,3.0,0.0
75%,462.0,7.0,5.0,6.0,6.0,10.0,11.0,6.0,4.0,5.0,...,5.0,8.0,7.0,11.0,6.0,6.0,7.0,10.0,6.0,1.0
max,486.0,413.0,402.0,426.0,414.0,368.0,326.0,336.0,393.0,315.0,...,374.0,466.0,367.0,399.0,490.0,496.0,366.0,328.0,486.0,2.0


## Transpose the data so that x = time and y = player achievements/month

In [None]:
# Re-label every row: take the label's string form, strip any '-' and store it as a float.
df.index = list(map(lambda label: float(str(label).replace('-','')), df.index.values))

# Column means for each engagement class.
engagement_groups = df.groupby('engagement')
df_m = engagement_groups.mean()
display(HTML(df_m.head().to_html()))

## Get mean achievements per month

In [None]:
# Transpose the per-class means so each row is one column of df_m, then drop
# the first row. NOTE(review): the first row is presumably gear_score (the only
# non-date feature) -- confirm the column order of df_m.
# .copy() lets the column assignments below work on an independent frame
# rather than on a slice.
dft = df_m.T.reset_index().iloc[1:].copy()

# Rename the first column explicitly. Writing into `columns.values` mutates the
# Index's backing array in place, which pandas does not support reliably.
dft.columns = ['dates'] + list(dft.columns[1:])
dft['dates'] = pd.to_datetime(dft['dates'])
dft = dft.sort_values('dates').set_index('dates')

# One column per engagement class, in the order the groupby produced them.
dft.columns = ['active','risk','lapsed']
dft.to_csv(os.path.join(cn.clean_dir, 'eda', 'time_series_plot_data.csv'))
display(HTML(dft.to_html()))

## Plot mean achievements per month

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates
import datetime as dt
import matplotlib.ticker as ticker

# One line per engagement class: (column in dft, legend label, colour).
fig, ax = plt.subplots(figsize=(20, 10))
series_styles = [
    ('active', 'active', 'darkcyan'),
    ('risk', 'at-risk', 'dodgerblue'),
    ('lapsed', 'lapsed', 'mediumpurple'),
]

# Axes.plot handles a datetime index directly, so the deprecated plot_date is
# not needed. x is taken straight from dft.index so each date stays paired
# with its own row; sorting x separately from y could mismatch them.
for column, label, color in series_styles:
    ax.plot(dft.index, dft[column], label = label, color = color, alpha = 0.5,
            linestyle = '-', linewidth = 4, marker = 'o', markersize = 10)

ax.set_xlabel('\nTime (years)', fontsize = 28, color = 'wheat')
ax.set_ylabel('Achievements/Month \n', fontsize = 28, color = 'wheat')
ax.tick_params(labelsize = 24, labelcolor = 'wheat')
ax.legend(fontsize = 22)

In [None]:
# Pairwise correlation matrix of all columns in df (displayed as the cell output).
df.corr()

In [None]:
# Draw the pairwise correlation matrix of df as a heatmap.
correlations = df.corr()
sns.heatmap(correlations)

In [None]:
# Histogram of the engagement column across the whole data set.
h = df['engagement'].hist(figsize = (5,5), bins = 20)
plt.tight_layout()

## Stratified Shuffle Split

In [None]:
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

y = df.engagement
X = df.drop('engagement', axis = 1)

# The splitter yields 10 (train, test) index pairs and the notebook keeps only
# the last one. Unpacking the final pair gives exactly the split the original
# loop ended on, without rebuilding frames and rewriting the CSVs ten times.
split = StratifiedShuffleSplit(n_splits = 10, test_size = 0.25, random_state = 17)
*_, (train_index, test_index) = split.split(df, df.engagement)

# Feature-only frames used for modelling.
strat_train = df.iloc[train_index]
strat_test = df.iloc[test_index]

# The same rows taken positionally from the unstripped copy (df_original was
# copied from df before the text columns were dropped), saved to disk.
whole_training_set = df_original.iloc[train_index]
whole_test_set = df_original.iloc[test_index]
whole_training_set.to_csv(os.path.join(cn.clean_dir, 'whole_training_set.csv'), index = False)
whole_test_set.to_csv(os.path.join(cn.clean_dir, 'whole_test_set.csv'), index = False)

In [None]:
# Histogram of engagement within the stratified training split.
h = strat_train['engagement'].hist(figsize = (5,5), bins = 20)
plt.tight_layout()

## Shuffle Split

In [None]:
from sklearn.model_selection import ShuffleSplit

# Plain (non-stratified) shuffle split. All 10 splits are generated, but only
# the final pair of index arrays is kept.
split = ShuffleSplit(n_splits = 10, test_size = 0.25, random_state = 17)
*_, (train_index, test_index) = split.split(df, df.engagement)
shuff_train = df.iloc[train_index][:]
shuff_test = df.iloc[test_index][:]

In [None]:
# Histogram of engagement within the shuffled training split.
h2 = shuff_train['engagement'].hist(figsize = (5,5), bins = 24)
plt.tight_layout()

## Compare ShuffleSplit vs Stratified Shuffle Split

In [None]:
# Fraction of rows in each engagement class: overall, and in each test split.
t = pd.DataFrame()
for share_column, frame in (('overall', df), ('stratified', strat_test), ('shuffled', shuff_test)):
    t[share_column] = frame.engagement.value_counts()/len(frame.index.values)

# Percentage deviation of each split's class share from the overall share.
for err_column, share_column in (('shuff_p_err', 'shuffled'), ('strat_p_err', 'stratified')):
    t[err_column] = 100 * (t.overall - t[share_column])/ t.overall
t = t.sort_index()
display(HTML(t.to_html()))

fig, ax = plt.subplots(1,2, figsize = (12,8), dpi = 100)
share_ax, err_ax = ax
classes = t.index.values
split_styles = (('stratified', 'strat_p_err', 'magenta'),
                ('shuffled', 'shuff_p_err', 'darkorange'))

# Left panel: class shares.
share_ax.plot(classes, t['overall'], alpha = 0.5, label = 'overall', color = 'black', lw = 2)
for split_name, _, split_color in split_styles:
    share_ax.plot(classes, t[split_name], alpha = 0.5, label = split_name, color = split_color)
handles, labels = share_ax.get_legend_handles_labels()
sampling_plot0 = share_ax.legend(handles, labels)
share_ax.set_title ('% of testing instances')

# Right panel: deviation from the overall share, with a zero reference line.
for split_name, err_column, split_color in split_styles:
    err_ax.plot(classes, t[err_column], alpha = 0.5, label = split_name, color = split_color)
handles, labels = err_ax.get_legend_handles_labels()
sampling_plot1 = err_ax.legend(handles, labels)
err_ax.axhline(0, ls = ':', color = 'silver')
err_ax.set_title ('% err from overall')
    
    ## Remove Category for Stratification from the dataset
#for set_ in (strat_train_set, strat_test_set):
 #   set_.drop(label, axis = 1, inplace = True)

## Stratified is the clear winner with the lowest error

In [None]:
# Write the stratified train/test frames to disk, then summarise the training frame.
for split_frame, csv_name in ((strat_train, 'time_stratified_train.csv'),
                              (strat_test, 'time_stratified_test.csv')):
    split_frame.to_csv(os.path.join(cn.clean_dir, csv_name), index = False)
strat_train.info()


## Tuning the Random Forest Classifier

In [None]:
No weights, no gear score
                precision    recall  f1-score   support

           0      0.827     0.984     0.899     26915
           1      0.877     0.522     0.654     10146
           2      0.926     0.317     0.472      1508

    accuracy                          0.836     38569
   macro avg      0.877     0.608     0.675     38569
weighted avg      0.844     0.836     0.818     38569

In [None]:
No weights, with gear score
              precision    recall  f1-score   support

           0      0.862     0.973     0.914     27046
           1      0.839     0.627     0.717     10146
           2      0.961     0.362     0.526      1508

    accuracy                          0.859     38700
   macro avg      0.887     0.654     0.719     38700
weighted avg      0.859     0.859     0.847     38700

In [None]:
6/240 weighted 1,7,5,15
            ind precision recall f1-score support
0             0     0.742  0.989    0.848   23642
1             1     0.707  0.113    0.195    3421
2             2     0.822  0.182    0.298    5114
3             3     0.823  0.144    0.246    1254
4      accuracy                     0.744   33431
5     macro_avg     0.773  0.357    0.397   33431
6  weighted_avg     0.753  0.744    0.674   33431
Plotting the confusion matrix...

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pickle

# Train/test matrices from the stratified split.
y_train = strat_train.engagement
X_train = strat_train.drop('engagement',axis = 1)
y_test = strat_test.engagement
X_test = strat_test.drop('engagement',axis = 1)

# Every artefact of this cell (figures, model) goes into one folder. Create it
# up front so savefig/open cannot fail on a missing directory.
folder = os.path.join(cn.clean_dir, 'pickles')
os.makedirs(folder, exist_ok = True)


# Distribution of the engagement label over the full data set.
fig1, axes = plt.subplots( figsize=(10,10), dpi=100)
a = sns.distplot(df_original.engagement, color="darkcyan",  axlabel='status', ax = axes)
# Put one tick on each engagement value and label it with that value's status.
# Passing the whole status column would label the ticks with whatever statuses
# happen to come first in row order.
# NOTE(review): assumes each engagement value maps to a single status -- confirm.
status_by_level = (df_original[['engagement', 'status']]
                   .dropna()
                   .drop_duplicates('engagement')
                   .sort_values('engagement'))
a.set_xticks(status_by_level.engagement.astype(float))
a.set_xticklabels(status_by_level.status, rotation = 45)
fig1.savefig(os.path.join(folder, 'histplot_time_balanced.png'), dpi=180)


print("Start random forest...")

# Unweighted forest. An earlier experiment (left disabled in the notebook) used
# class_weight = {0:1, 1:7, 2:5, 3:15} with otherwise the same settings.
selected = RandomForestClassifier(n_estimators = 300,n_jobs = -1,
                           oob_score = True,bootstrap = True,random_state = 17)
selected.fit(X_train, y_train)


print("Important Features...")
importances = selected.feature_importances_
# argsort is ascending; reverse it so the most important features come first.
indices = np.argsort(importances)[::-1]
important_features = X_train.columns.values[indices]
for i, v in enumerate(important_features[:25]):
    print(i,v)


print("Making predictions...")
predictions = selected.predict(X_test)
# assign() returns a new frame, so X_test itself never gains the extra columns.
df_pred = X_test.assign(prediction = predictions, actual = y_test)


print('Getting accuracy score...')
# Training accuracy alone is optimistic for a forest, so report the held-out
# accuracy next to it.
print('train:', selected.score(X_train,y_train))
print('test:', selected.score(X_test,y_test))


print('Oob score...')
print(selected.oob_score_)

print ("Making confusion matrix...")
cnf_matrix = metrics.confusion_matrix(y_test,predictions)
print(cnf_matrix)

# Precision, recall and f1 per class.
met = metrics.classification_report(y_test, predictions, digits=3)
# NOTE(review): f_name is defined but the report is never written to disk in
# this cell -- confirm whether it should be saved under `folder`.
f_name = 'metrics_time_balanced_metrics.csv'
print(met)


print("Plotting the confusion matrix...")
fig2, ax = plt.subplots(figsize = (8,8))
sns.heatmap(pd.DataFrame(cnf_matrix), annot = True, cmap = 'viridis', fmt = 'g',
            annot_kws={"size":16}, ax = ax)
ax.set_xlabel ("Predicted Value", fontsize = 18)
ax.set_ylabel ("Actual Value", fontsize = 18)
ax.tick_params (labelsize = 14)
plt.tight_layout()
fig2.savefig(os.path.join(folder, 'cnfmatrix_time_balanced.png'), dpi=180)

# Save the model to disk. Opening the full path avoids os.chdir, which would
# change the working directory for every cell run afterwards.
pickle_name = 'rf_time_balanced_model.sav'
with open(os.path.join(folder, pickle_name), 'wb') as file:
    pickle.dump(selected, file)
plt.show()