
<center><b>Data Dictionary:</b> https://docs.google.com/document/d/1aYqMaddLpN0Skgwa17HRg-jwqOZ0NVLyWd1-zyTXGro/edit?usp=sharing</center><br><br>

## 1. GET/IMPORT THE DATA

In [1]:
#load libraries and data
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
import seaborn as sns
from pandas.plotting import scatter_matrix


ow = pd.read_csv('assets/overwatch-diary.csv', index_col = 0)

In [2]:
ow.head()

Unnamed: 0_level_0,result,game_id,sr_start,sr_finish,sr_change,streak_number,my_team_sr,enemy_team_sr,map,round,...,character_2,character_3,psychological_condition,eliminations,objective_kills,healing,deaths,weapon_accuracy,offensive_assists,defensive_assists
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,W,203.0,,,0,,2929.0,2922.0,Dorado,4.0,...,2_na,3_na,na,,,,,,,
5,W,11.0,,,0,2.0,,,Dorado,2.0,...,2_na,3_na,Good,12.0,7.0,3791.0,4517.0,2.0,20.0,7.0
4,W,,,,0,1.0,2649.0,2638.0,Dorado,4.0,...,2_na,3_na,great,17.0,14.0,10022.0,6940.0,7.0,31.0,9.0
4,W,,,,0,5.0,2895.0,2892.0,Dorado,2.0,...,2_na,3_na,Happy,,,,,,,
3,W,150.0,,,0,,2992.0,2921.0,Dorado,3.0,...,2_na,3_na,na,,,,,,,


In [3]:
#create dummy variable for each object features
#based on rexexp analysis in project 4, we will not include 'psychological_conditon'
result_dummy = pd.get_dummies(ow['result'])
map_dummy = pd.get_dummies(ow['map'])
role_dummy = pd.get_dummies(ow['team_role'])
hero1_dummy = pd.get_dummies(ow['character_1'])
hero2_dummy = pd.get_dummies(ow['character_2'])
hero3_dummy = pd.get_dummies(ow['character_3'])

#concetenate dummies into ow
ow_dummy = pd.concat([ow, result_dummy, map_dummy, role_dummy, hero1_dummy, hero2_dummy, hero3_dummy], axis = 1)

In [4]:
#drop object dtype columns, including psychological_condition
ow_clean = ow_dummy.drop(columns=['result','map','team_role','character_1','character_2','character_3','psychological_condition'])

In [5]:
ow_clean.describe()

Unnamed: 0,game_id,sr_start,sr_finish,sr_change,streak_number,my_team_sr,enemy_team_sr,round,capscore,score_distance,...,2_na,3_Ana,3_Lucio,3_Mercy,3_Soldier,3_Symmetra,3_Zarya,3_Zenyatta,3_lucio,3_na
count,2292.0,1472.0,1472.0,2939.0,2023.0,2700.0,2688.0,2810.0,2602.0,1288.0,...,2939.0,2939.0,2939.0,2939.0,2939.0,2939.0,2939.0,2939.0,2939.0,2939.0
mean,173.565445,2547.171875,2547.96875,0.399115,2.160652,2652.716667,2652.673735,1.940214,1.568025,72.831188,...,0.842463,0.004083,0.003403,0.004423,0.00034,0.00034,0.00034,0.003062,0.000681,0.983328
std,135.417174,206.293353,205.935244,96.387388,1.715523,228.574728,231.596376,1.004432,1.071181,122.092413,...,0.364368,0.063779,0.058242,0.066372,0.018446,0.018446,0.018446,0.055262,0.026082,0.128062
min,1.0,1966.0,1966.0,-2654.0,1.0,1964.0,1937.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,69.0,2427.0,2427.0,-14.0,1.0,2524.0,2521.0,1.0,1.0,60.5675,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,144.5,2573.0,2573.5,0.0,2.0,2655.0,2656.0,2.0,2.0,71.75,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,220.0,2681.0,2682.0,0.0,3.0,2821.0,2822.0,2.0,2.0,82.7675,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,538.0,3017.0,3008.0,2698.0,13.0,3178.0,3172.0,6.0,4.0,4375.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 2. TEST/TRAIN/SPLIT

In [11]:
#Now that we have all of dummy variables and have removed objects, let's instatitate
#The new 'W' Column will be what we're trying to predict because a '1' will mean a win and '0' will be a loss.
X = ow_clean.drop(['W'], axis=1)
y = ow_clean['W']

In [7]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


## 3. EDA OF TRAIN DATA
Note: Here are some things to consider in your notebook: sample size, correlations, feature importance, unexplained variance or outliers, variable selection, train/test comparison, and any relationships between your target and independent variables.

In [14]:
# let's fill nulls with the feature median
ow.fillna(ow.median(),inplace=True)

columns = ['game_id',        
'sr_start',       
'sr_finish',      
'streak_number',  
'my_team_sr',     
'enemy_team_sr',  
'round',          
'capscore',       
'score_distance', 
'match_length',   
'eliminations',   
'objective_kills',
'healing',        
'deaths',         
'weapon_accuracy',
'offensive_assists',
'defensive_assists']
for column in columns:
     ow[column].fillna(ow[column].mean(), inplace=True)

ow.head()

Unnamed: 0_level_0,result,game_id,sr_start,sr_finish,sr_change,streak_number,my_team_sr,enemy_team_sr,map,round,...,character_2,character_3,psychological_condition,eliminations,objective_kills,healing,deaths,weapon_accuracy,offensive_assists,defensive_assists
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,W,203.0,2573.0,2573.5,0,2.0,2929.0,2922.0,Dorado,4.0,...,2_na,3_na,na,14.0,6.0,5411.0,9456.0,9.0,28.0,6.0
5,W,11.0,2573.0,2573.5,0,2.0,2655.0,2656.0,Dorado,2.0,...,2_na,3_na,Good,12.0,7.0,3791.0,4517.0,2.0,20.0,7.0
4,W,144.5,2573.0,2573.5,0,1.0,2649.0,2638.0,Dorado,4.0,...,2_na,3_na,great,17.0,14.0,10022.0,6940.0,7.0,31.0,9.0
4,W,144.5,2573.0,2573.5,0,5.0,2895.0,2892.0,Dorado,2.0,...,2_na,3_na,Happy,14.0,6.0,5411.0,9456.0,9.0,28.0,6.0
3,W,150.0,2573.0,2573.5,0,2.0,2992.0,2921.0,Dorado,3.0,...,2_na,3_na,na,14.0,6.0,5411.0,9456.0,9.0,28.0,6.0


In [28]:
#check distribution (histogram)
ow.groupby('result').hist(figsize=(20,20))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

result
L    [[AxesSubplot(0.125,0.755806;0.158163x0.124194...
W    [[AxesSubplot(0.125,0.755806;0.158163x0.124194...
dtype: object

In [19]:
#outliers boxplot
ow.plot(kind='box', figsize=[50,10])

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x113e6a510>

In [30]:
ow.boxplot(figsize=(50,50))

<matplotlib.axes._subplots.AxesSubplot at 0x1a2a8fbb50>

In [23]:
#correlation matrix
np.corrcoef(X_train)

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [24]:
#collinearity matrix

## 4. FEATURE ENGINEERING

In [None]:
#map win ratio

In [None]:
#hero win ratio

In [None]:
#sr ratio


## 5. MODELING

In [None]:

## Classification 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
# Some other ones you could look into:
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [None]:
y = admissions_data['admit']
X = admissions_data.drop('admit', axis=1)

In [None]:
# You may also want to define certain columns in your feature set already, so maybe you have something like:
features = ['num_rooms','num_bathrooms', ..., 'num_rooms_div_bathrooms']
X = admissions_data[features]

In [None]:
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
#transform training
X_train_selected = select.transform(X_train)

In [None]:
mask = select.get_support()
print(mask)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
#black is true 
plt.matshow(mask.reshape(1, -1), cmap = 'gray_r')
plt.xlabel('Sample index')

## 6. MODEL FIT

In [None]:
#regularization:

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

x = 2
y_predicted = 20
b0 = 4
alpha = .5

error_lst = []
for b1 in range(0, 20):
    error = (y_predicted - b1*x - b0)**2 + (alpha*b1)**2
    error_lst.append(error)
print(error_lst)

plt.plot(error_lst)

In [None]:
#pipeline
from sklearn.pipeline import Pipeline

# it takes a list of tuples as parameter
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('clf', LogisticRegression())
])

# use the pipeline object as you would
# a regular classifier
pipeline.fit(X_train,y_train)

In [None]:
#gridsearch
from sklearn.model_selection import GridSearchCV

param_grid = param_grid = {
    'C':[0.01, 0.1, 1.0]
}

grid = GridSearchCV(LogisticRegression(), cv=3, param_grid=param_grid)
grid.fit(X_train,y_train)

preds = grid.predict_proba(X_test)

In [None]:
#cross-validate


## 7. SCORING AND INTEPRETING

In [20]:
## Classification 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Classification Loss Functions (for error)
from sklearn.metrics import log_loss, hinge_loss, hamming_loss

# We also have our confusion matrix, plots and reports
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve

## 8. RUN MODEL ON ENTIRE DATASET

**When should you bring in the test set?**
- Only when you're confident of your model i.e. your scores are consistent
- Score your test set, get the predictions, plot residuals or create confusion matrix
- **Remember that ANYTHING YOU DO TO YOUR TRAIN DATA, YOU WILL HAVE TO DO TO YOUR TEST DATA!!!**