In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score



## Load data

In [2]:
# Importing the dataset
missing_values = ["n/a", "na", "--"]
dataset = pd.read_csv('final_df2.csv', na_values = missing_values)


In [12]:
dataset  = dataset.fillna(0)

In [13]:
print(dataset['WS Winner'].value_counts())

0    1302
1      48
Name: WS Winner, dtype: int64


In [14]:
# print percentage of questions where target == 1
(len(dataset.loc[dataset['WS Winner']==1])) / (len(dataset.loc[dataset['WS Winner'] == 0])) * 100

3.686635944700461

Classes are imbalanced.  

This is a problem because many machine learning models are designed to maximize overall accuracy, which especially with imbalanced classes may not be the best metric to use. Classification accuracy is defined as the number of correct predictions divided by total predictions times 100. For example, if we simply predicted all teams are not champions, we would get a classification acuracy score of over 99%!

## Create Train and Test Sets¶

In [31]:
X = dataset.iloc[:, 1:81]
y = dataset.iloc[:, 81]

In [16]:
X.isnull().sum()

#Bat       0
#Fld       0
#P         0
2B         0
3B         0
A          0
AB         0
BA         0
BB9        0
BB_x       0
BB_y       0
BF         0
BK         0
BatAge     0
CG_x       0
CG_y       0
CS         0
Ch         0
DP         0
DefEff     0
E          0
ER         0
ERA        0
ERA+       0
FIP        0
Fld%       0
G          0
GDP        0
GF         0
GS_x       0
          ..
OPS+       0
PA         0
PAge       0
PO         0
R/G        0
RA/G_x     0
RA/G_y     0
RBI        0
R_x        0
R_y        0
Rdrs       0
Rdrs/yr    0
Rtot       0
Rtot/yr    0
SB         0
SF         0
SH         0
SLG        0
SO/W       0
SO9        0
SO_x       0
SO_y       0
SV         0
TB         0
W          0
W-L%       0
WHIP       0
WP         0
cSho       0
tSho       0
Length: 80, dtype: int64

In [17]:

# Prepare data for modeling
# Separate input features and target
#y = df['WS Winner']
#X = df.drop(['WS Winner', 'Tm'], axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# Baseline Models

In [18]:
# DummyClassifier to predict only target 0
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

# checking unique labels
print('Unique predicted labels: ', (np.unique(dummy_pred)))

# checking accuracy
print('Test score: ', accuracy_score(y_test, dummy_pred))

Unique predicted labels:  [42]
Test score:  0.011834319526627219


As predicted our accuracy score for classifying all transactions as not fraud is about 96%

As the Dummy Classifier predicts only Class 0, it is clearly not a good option for our objective of correctly classifying the winning teams.

Let's see how logistic regression performs on this dataset.

In [19]:
X_train.head()

Unnamed: 0,#Bat,#Fld,#P,2B,3B,A,AB,BA,BB9,BB_x,...,SO_x,SO_y,SV,TB,W,W-L%,WHIP,WP,cSho,tSho
1335,53,53,30,252,24,1453,5542,0.252,3.4,537,...,1458,1428,49,2352,96,0.589,1.24,50,0,14
849,43,43,22,283,41,1701,5512,0.274,3.7,497,...,1043,920,43,2329,73,0.451,1.497,42,0,8
951,44,44,19,292,38,1803,5573,0.259,3.8,471,...,1092,958,35,2230,67,0.414,1.44,53,3,14
663,39,38,20,275,27,1399,5036,0.26,4.6,492,...,906,894,22,2058,56,0.389,1.539,73,3,8
119,42,41,20,195,29,1704,5488,0.255,4.3,503,...,791,831,27,1980,57,0.352,1.534,62,8,10


## Tune model with hyperparameters

### Random Hyperparameter grid

In [26]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import pprint


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(random_grid)


{   'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [27]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 11.6min finished


{'n_estimators': 1600,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': True}

## Evaluating Random Search

In [28]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_labels)
    errors = abs(predictions - test_lables)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performace')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy, predictions

### Change features into numpy arrays 

In [29]:
X_train.shape
y_test

988     53
1215    44
869     57
1178    62
558     42
837     75
623     43
1112    32
273     32
2       61
1173    47
1229    37
149     26
601     52
1243    52
243     62
1252    54
456     58
500     44
1096    41
193     30
1169    62
825     58
1122    69
323     35
421     61
473     47
487     31
1256    81
436     43
        ..
168     60
223     43
598     52
248     40
109     61
797     55
756     60
766     57
164     44
1150    75
207     25
65      37
1341    95
72      69
339     58
148     35
195     56
115     42
1241    62
1008    59
1078    43
399     42
758     54
916     45
552     37
984     50
898     51
6       76
75      49
369     48
Name: WP, Length: 338, dtype: int64

In [None]:
X_train_np = X_train.values
y_train_np = y_train.values
X_test_np = X_test.values
y_test_np = y_test.values

In [None]:
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(X_train_np, y_train_np)
base_accuracy = evaluate(base_model, X_test_np, y_test)

In [None]:
prediction = pd.Series(predictions)

In [None]:
best_random = rf_random.best_estimator_
best_random = rf_random.best_estimator_

In [None]:
prediction = pd.Series(predictions)

In [None]:
f1_score(y_test, lr_pred)

In [None]:
# recall score
recall_score(y_test, lr_pred)

### Alternatively SVC

In [None]:
#from sklearn.svm import SVC
#clf = SVC(kernel=’linear’, class_weight=’balanced’, probability=True)

In [None]:
pred = clf.predict(X_test)

## Performance metric 

In [None]:
from sklearn.metrics import roc_auc_score
prob_y_2 = clf_2.predict_proba(X)
prob_y_2 = [p[1] for p in prob_y_2]
print( roc_auc_score(y, prob_y_2) )

## Save model for future use


In [None]:
joblib.dump(clf, ‘rf_regressor.pkl’)
# To load: clf2 = joblib.load(‘rf_regressor.pkl’)

# Conclusions

This project could be modified into a regression model by including all teams in the playoffs, ranking them ordinally on how far they got, and them making this a new column