Imports

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression


from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import pandas as pd 
import os


Various ways to ignore warnings that clog up the output

In [2]:
import warnings
warnings.filterwarnings('ignore') 
os.environ['PYTHONWARNINGS']='ignore'

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

Read preprocessed data, setup numerical and categorical features and transformers, then build pipeline

In [20]:
dataframe = pd.read_csv("processed.csv")
features = dataframe.columns.to_list()
features.remove('Unnamed: 0')
features.remove('HOME_TEAM_WINS')

data = dataframe.values
X = dataframe[features]
y = dataframe.HOME_TEAM_WINS

numeric_features = ['WEEK'] #week of year
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_features = ['GAME_ID',
            'HOME_TEAM_ID',
            # 'HOME_P1',
            # 'HOME_P2',
            # 'HOME_P3',
            # 'HOME_P4',
            # 'HOME_P5',
            # 'HOME_P6',
            # 'HOME_P7',
            # 'HOME_P8',
            # 'HOME_P9',
            'VISITOR_TEAM_ID',
            # 'VISITOR_P1',
            # 'VISITOR_P2',
            # 'VISITOR_P3',
            # 'VISITOR_P4',
            # 'VISITOR_P5',
            # 'VISITOR_P6',
            # 'VISITOR_P7',
            # 'VISITOR_P8',
            # 'VISITOR_P9',
            ]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(transformers=[
    ("num_transform", numeric_transformer, numeric_features),
    ("cat_transform", categorical_transformer, categorical_features)
])
pipeline = Pipeline(steps=[("preprocesser", preprocessor), ("classifier", LogisticRegression())])
pipeline



Split test data and perform base run

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(score)

0.6124031007751938


Define parameter grid for grid serach

In [5]:
param_grid = [
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "classifier__C": [0.1, 1.0, 10.0, 100.0],
        "classifier": [LogisticRegression()]
    },
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "classifier__n_estimators": [10, 100, 1000],
        "classifier": [RandomForestClassifier()]
    },
]

Search grid and see best scores!

In [19]:
for classifier in param_grid:
    
    grid_search = GridSearchCV(pipeline, classifier, cv=10, verbose=1,n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f"Best params:")
    print(grid_search.best_params_)
    print("Best score in grid search:")
    print(grid_search.best_score_)
    print("Best logistic regression from grid search:")
    print(grid_search.score(X_test, y_test))
    print()

#0.6459948320413437
#0.627906976744186

Fitting 10 folds for each of 8 candidates, totalling 80 fits
Best params:
{'classifier': LogisticRegression(C=100.0), 'classifier__C': 100.0, 'preprocesser__num_transform__imputer__strategy': 'mean'}
Best score in grid search:
0.6334059488898199
Best logistic regression from grid search:
0.6149870801033591

Fitting 10 folds for each of 6 candidates, totalling 60 fits
Best params:
{'classifier': RandomForestClassifier(), 'classifier__n_estimators': 100, 'preprocesser__num_transform__imputer__strategy': 'mean'}
Best score in grid search:
0.6275911185588605
Best logistic regression from grid search:
0.6227390180878553



 Best score we saw was 0.6459948320413437, but usually around 0.627 depending on how the test data was split