In [1]:
# run(nba, model=SVC(random_state=0))
import datetime
import string
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss, recall_score, precision_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
from utils import *

# run(nba, model=SVC(random_state=0))
# Tuned SVC - rbf kernel
# model = SVC(kernel=kernels[0], random_state=0)
# run(nba, model=model)

In [2]:
conn = sql_connection('ml_models')
past_games = pd.read_sql_query('select * from ml_past_games', conn)
tonights_games = pd.read_sql_query('select * from ml_tonights_games', conn)
completed_ml = pd.read_sql_query('select * from tonights_games_ml', conn)

past_games_full = pd.read_sql_query('select * from ml_past_games', conn)
tonights_games_full = pd.read_sql_query('select * from ml_tonights_games', conn).sort_values('home_team_avg_pts_scored')
tonights_games = tonights_games_full.drop(['home_team', 'away_team', 'proper_date', 'outcome'], axis = 1)
# completed_games = completed_games.to_numpy()
# tonights_games = tonights_games.to_numpy()

past_games = pd.read_sql_query('select * from ml_past_games', conn)
past_games_outcome = past_games['outcome']
past_games = past_games.drop(['home_team', 'away_team', 'proper_date', 'outcome'], axis = 1)

past_games_outcome = past_games_outcome.to_numpy()
past_games = past_games.to_numpy()

In [22]:
clf_linear_svc = LinearSVC(random_state=0).fit(past_games, past_games_outcome)
clf_svc = SVC(random_state=0).fit(past_games, past_games_outcome)
clf = LogisticRegression(random_state=0).fit(past_games, past_games_outcome)

print(f"Linear SVC score was {clf_linear_svc.score(past_games, past_games_outcome)}")
print(f"SVC score was {clf_svc.score(past_games, past_games_outcome)}")
print(f"Logistic Regression score was {clf.score(past_games, past_games_outcome)}")

Linear SVC score was 0.5912951167728238
SVC score was 0.6613588110403397
Logistic Regression score was 0.667728237791932


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
import pickle
clf_linear_svc_pickle = pickle.dumps(clf_linear_svc)
clf_linear_svc_pickle = pickle.loads(clf_linear_svc_pickle)

from joblib import dump, load # more efficient for large numpy arrays
dump(clf, 'clf_model.joblib') 
dump(clf_linear_svc, 'clf_linear_svc_model.joblib') 
dump(clf_svc, 'clf_svc_model.joblib')

['clf_svc_model.joblib']

In [None]:
clf_svc = LogisticRegression(random_state=0).fit(past_games, past_games_outcome)
clf_svc.score(past_games, past_games_outcome)

In [44]:
# feature importance
importance = clf.coef_[0]
print(importance)

[-0.04978847  0.04070776 -0.0367953   0.00377045  0.00076865  0.30607929
  0.06662241 -0.01126052  0.00899668 -0.0153359  -0.00437527 -0.46615876]


In [35]:
clf.predict(tonights_games)



array([1, 1, 1, 0, 0, 0, 1])

In [51]:
clf.predict_proba(tonights_games)



array([[0.48876101, 0.51123899],
       [0.2590111 , 0.7409889 ],
       [0.27315915, 0.72684085],
       [0.56128013, 0.43871987],
       [0.57157408, 0.42842592],
       [0.77698252, 0.22301748],
       [0.40668747, 0.59331253]])

In [68]:
tonights_ml = pd.DataFrame(clf.predict_proba(tonights_games)).rename(columns = {0: "away_team_predicted_win_pct", 1: "home_team_predicted_win_pct"})

tonights_games_final = tonights_games_full.reset_index().drop('outcome', axis = 1)

tonights_games_final['home_team_predicted_win_pct'] = tonights_ml['home_team_predicted_win_pct'].round(3)
tonights_games_final['away_team_predicted_win_pct'] = tonights_ml['away_team_predicted_win_pct'].round(3)



In [26]:
################### mlflow
model_pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('model', LogisticRegression())])
model_pipeline.fit(past_games, past_games_outcome)

Pipeline(steps=[('scaler', MinMaxScaler()), ('model', LogisticRegression())])

In [21]:
# run mlflow ui in nba_elt_mlflow/practice
# https://medium.com/analytics-vidhya/machine-learning-model-workflow-and-tracking-using-mlflow-777c1df0a4cc
# ml flow tracks the model, the training/test data acurracy, the parameters, and the tags
past_games_df = pd.DataFrame(past_games)
past_games_df.to_csv('past_games.csv', index = False)

X_train, X_test, y_train, y_test = train_test_split(past_games, past_games_outcome, test_size=0.5)
mlflow.set_experiment("Test mlflow feature")
with mlflow.start_run():
    run = mlflow.active_run()
    run_id = run.info.run_id
    print(f"Active run_id: {run.info.run_id}")

    # dont do this
    # mlflow.set_tag("tag","nba_elt_pipeline")
    # mlflow.set_tag("tag","prod")
    mlflow.set_tags({"version": "1.0.0",
                    "project": "nba_elt_pipeline",
                    "run_type": "prod",
    })

    mlflow.log_artifact("past_games.csv", artifact_path="features")

    clf = RandomForestClassifier(n_estimators=12, random_state=0)

    mlflow.log_param("number of estimators", 12)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_train)
    acc = accuracy_score(y_test, y_pred)
    mlflow.log_metric("testing data accuracy", acc)

    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)

    mlflow.log_metric("training data accuracy",acc)

    mlflow.sklearn.log_model(clf, artifact_path="sklearn-model")

Active run_id: bf2de72fb09a4e8c9b61fe79bf054944


In [12]:
acc

0.5732484076433121

In [29]:
# logged_model = f"mlruns/1/{run_id}/artifacts/sklearn-model/MLmodel"
sample1 = past_games[0].reshape(1, -1)
logged_model = f"mlruns/1/ff269f383b484c92b9e0b69aa1f8a826/artifacts/sklearn-model"
loaded_model = mlflow.pyfunc.load_model(logged_model)

sample=sample1
y_pred=loaded_model.predict(sample)
print(f"The predicted value is {y_pred}")

The predicted value is [1]
