In [1]:
# run(nba, model=SVC(random_state=0))
import datetime
import string
from joblib import load
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss, recall_score, precision_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
from utils import *

# run(nba, model=SVC(random_state=0))
# Tuned SVC - rbf kernel
# model = SVC(kernel=kernels[0], random_state=0)
# run(nba, model=model)

In [2]:
TRACKING_URI = f"http://{os.environ.get('mlflow_user')}:{os.environ.get('mlflow_pw')}@ec2-54-86-81-54.compute-1.amazonaws.com"

mlflow.set_tracking_uri(TRACKING_URI)
client = mlflow.tracking.MlflowClient(TRACKING_URI)

expr_name = 'NBA_ELT_PIPELINE_ML'
# s3_bucket = 's3://jacobsbucket97/practice/'
experiment = mlflow.create_experiment(expr_name)
mlflow.set_experiment(expr_name)

<Experiment: artifact_location='./artifacts/4', experiment_id='4', lifecycle_stage='active', name='NBA_ELT_PIPELINE_ML', tags={}>

In [3]:
conn = sql_connection('ml_models')
past_games = pd.read_sql_query('select * from ml_past_games', conn)
tonights_games = pd.read_sql_query('select * from ml_tonights_games', conn)
completed_ml = pd.read_sql_query('select * from tonights_games_ml', conn)

past_games_full = pd.read_sql_query('select * from ml_past_games', conn)
tonights_games_full = pd.read_sql_query('select * from ml_tonights_games', conn).sort_values('home_team_avg_pts_scored')
tonights_games = tonights_games_full.drop(['home_team', 'away_team', 'proper_date', 'outcome'], axis = 1)
# completed_games = completed_games.to_numpy()
# tonights_games = tonights_games.to_numpy()

past_games = pd.read_sql_query('select * from ml_past_games', conn)
past_games_outcome = past_games['outcome']
past_games = past_games.drop(['home_team', 'away_team', 'proper_date', 'outcome'], axis = 1)

past_games_outcome = past_games_outcome.to_numpy()
past_games = past_games.to_numpy()

In [4]:
clf_linear_svc = LinearSVC(random_state=0).fit(past_games, past_games_outcome)
clf_svc = SVC(random_state=0).fit(past_games, past_games_outcome)
clf = LogisticRegression(random_state=0).fit(past_games, past_games_outcome)

print(f"Linear SVC score was {clf_linear_svc.score(past_games, past_games_outcome)}")
print(f"SVC score was {clf_svc.score(past_games, past_games_outcome)}")
print(f"Logistic Regression score was {clf.score(past_games, past_games_outcome)}")

Linear SVC score was 0.5697445972495089
SVC score was 0.6640471512770137
Logistic Regression score was 0.6719056974459725




In [10]:
log_accuracy = clf.score(past_games, past_games_outcome)
mlflow.log_metric("accuracy", log_accuracy)

mlflow.set_tags({"version": "1.0.0",
                "project": "nba_elt_pipeline",
                "run_type": "prod",
})


In [12]:
clf = load("log_model.joblib")

In [13]:
mlflow.sklearn.log_model(clf, "log_model.joblib")

ModelInfo(artifact_path='log_model.joblib', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.8.10', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.0.2', 'serialization_format': 'cloudpickle'}}, model_uri='runs:/02ee85c1ec6743baa612b35e8de261c4/log_model.joblib', model_uuid='983ae2d57c0b4009994581e92b5c6997', run_id='02ee85c1ec6743baa612b35e8de261c4', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-03-15 01:47:49.024272')

In [14]:
from mlflow.models.signature import infer_signature
signature = infer_signature(past_games, clf.predict(tonights_games))
mlflow.sklearn.log_model(clf, "iris_rf", signature=signature)



ModelInfo(artifact_path='iris_rf', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.8.10', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.0.2', 'serialization_format': 'cloudpickle'}}, model_uri='runs:/02ee85c1ec6743baa612b35e8de261c4/iris_rf', model_uuid='073a49d89d8344b8aa331ca3830216e2', run_id='02ee85c1ec6743baa612b35e8de261c4', saved_input_example_info=None, signature_dict={'inputs': '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 14]}}]', 'outputs': '[{"type": "tensor", "tensor-spec": {"dtype": "int64", "shape": [-1]}}]'}, utc_time_created='2022-03-15 01:51:18.181725')

In [5]:
# import pickle
# clf_linear_svc_pickle = pickle.dumps(clf_linear_svc)
# clf_linear_svc_pickle = pickle.loads(clf_linear_svc_pickle)

from joblib import dump, load # more efficient for large numpy arrays
dump(clf, 'log_model.joblib') 
# dump(clf_linear_svc, 'clf_linear_svc_model.joblib') 
# dump(clf_svc, 'clf_svc_model.joblib')

['log_model.joblib']

In [7]:
clf = load('log_model.joblib')
clf.score(past_games, past_games_outcome)

0.6772216547497446

In [None]:
clf_svc = LogisticRegression(random_state=0).fit(past_games, past_games_outcome)
clf_svc.score(past_games, past_games_outcome)

In [44]:
# feature importance
importance = clf.coef_[0]
print(importance)

[-0.04978847  0.04070776 -0.0367953   0.00377045  0.00076865  0.30607929
  0.06662241 -0.01126052  0.00899668 -0.0153359  -0.00437527 -0.46615876]


In [5]:
clf.predict(tonights_games)



array([1, 0, 0, 1, 1, 1])

In [6]:
clf.predict_proba(tonights_games)



array([[0.22881564, 0.77118436],
       [0.59036812, 0.40963188],
       [0.72943658, 0.27056342],
       [0.17054731, 0.82945269],
       [0.29047694, 0.70952306],
       [0.36025145, 0.63974855]])

In [68]:
tonights_ml = pd.DataFrame(clf.predict_proba(tonights_games)).rename(columns = {0: "away_team_predicted_win_pct", 1: "home_team_predicted_win_pct"})

tonights_games_final = tonights_games_full.reset_index().drop('outcome', axis = 1)

tonights_games_final['home_team_predicted_win_pct'] = tonights_ml['home_team_predicted_win_pct'].round(3)
tonights_games_final['away_team_predicted_win_pct'] = tonights_ml['away_team_predicted_win_pct'].round(3)



In [26]:
################### mlflow
model_pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('model', LogisticRegression())])
model_pipeline.fit(past_games, past_games_outcome)

Pipeline(steps=[('scaler', MinMaxScaler()), ('model', LogisticRegression())])

In [22]:
df = (pd.DataFrame(past_games_outcome).iloc[:-1, :]).to_numpy()

In [19]:
# run mlflow ui in nba_elt_mlflow/practice
# https://medium.com/analytics-vidhya/machine-learning-model-workflow-and-tracking-using-mlflow-777c1df0a4cc
# ml flow tracks the model, the training/test data acurracy, the parameters, and the tags
TRACKING_URI = f"http://{os.environ.get('mlflow_user')}:{os.environ.get('mlflow_pw')}@ec2-54-86-81-54.compute-1.amazonaws.com"

mlflow.set_tracking_uri(TRACKING_URI)
client = mlflow.tracking.MlflowClient(TRACKING_URI)

expr_name = 'NBA_ELT_PIPELINE_ML'
mlflow.set_experiment(expr_name)

past_games_df = pd.DataFrame(past_games)
past_games_df.to_csv('past_games.csv', index = False)

X_train, X_test, y_train, y_test = train_test_split(past_games, past_games_outcome, test_size=0.5)
with mlflow.start_run():
    run = mlflow.active_run()
    run_id = run.info.run_id
    print(f"Active run_id: {run.info.run_id}")

    # dont do this
    # mlflow.set_tag("tag","nba_elt_pipeline")
    # mlflow.set_tag("tag","prod")
    mlflow.set_tags({"version": "1.0.0",
                    "project": "nba_elt_pipeline",
                    "run_type": "prod",
    })

    mlflow.log_artifact("past_games.csv", artifact_path="features")

    clf = LogisticRegression(random_state=0)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_train)
    acc = accuracy_score(y_test, y_pred)
    mlflow.log_metric("testing data accuracy", acc)

    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)

    mlflow.log_metric("training data accuracy",acc)

    mlflow.sklearn.log_model(clf, artifact_path="sklearn-model")

Active run_id: 54b58733df8146c3aee515b5c6452a4b


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
mlflow.end_run()

In [12]:
TRACKING_URI = f"http://{os.environ.get('mlflow_user')}:{os.environ.get('mlflow_pw')}@ec2-54-86-81-54.compute-1.amazonaws.com"

mlflow.set_tracking_uri(TRACKING_URI)
client = mlflow.tracking.MlflowClient(TRACKING_URI)

expr_name = 'NBA_ELT_PIPELINE_ML'
mlflow.set_experiment(expr_name)
past_games.to_csv('past_games.csv')

with mlflow.start_run():
    run = mlflow.active_run()
    run_id = run.info.run_id
    print(f"Active run_id: {run.info.run_id}")

    # dont do this
    # mlflow.set_tag("tag","nba_elt_pipeline")
    # mlflow.set_tag("tag","prod")
    mlflow.set_tags({"version": "1.0.0",
                    "project": "nba_elt_pipeline",
                    "run_type": "prod",
    })

    mlflow.log_artifact("past_games.csv", artifact_path="features")

    clf = LogisticRegression(random_state=0).fit(past_games, past_games_outcome)

    y_pred = clf.predict(past_games)
    acc = accuracy_score(past_games, tonights_games)
    mlflow.log_metric("testing data accuracy", acc)

    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)

    mlflow.log_metric("training data accuracy",acc)

    mlflow.sklearn.log_model(clf, artifact_path="sklearn-model")

0.5732484076433121

In [29]:
# logged_model = f"mlruns/1/{run_id}/artifacts/sklearn-model/MLmodel"
sample1 = past_games[0].reshape(1, -1)
logged_model = f"mlruns/1/ff269f383b484c92b9e0b69aa1f8a826/artifacts/sklearn-model"
loaded_model = mlflow.pyfunc.load_model(logged_model)

sample=sample1
y_pred=loaded_model.predict(sample)
print(f"The predicted value is {y_pred}")

The predicted value is [1]
