In [3]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, precision_score, recall_score, accuracy_score, brier_score_loss, log_loss
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
import random
import os
import sys
import json
import pickle
from sklearn.preprocessing import StandardScaler

In [4]:
# https://meettank29067.medium.com/performance-measurement-in-logistic-regression-8c9109b25278

## Pulling in the data
### starting with predictALL for the large model

In [5]:
filepath = os.path.join(os.path.abspath(".."), "ETL", "modelData", "modelLarge.json")
data = pd.read_json(filepath)
print(len(data.columns))
print(data.columns)


75
Index(['name', 'wins', 'tko', 'sub', 'udec', 'sdec', 'height', 'weight',
       'reach', 'dob', 'slpm', 'sapm', 'strAcc', 'strDef', 'hslpm', 'hsapm',
       'hstrAcc', 'bslpm', 'bsapm', 'bstrAcc', 'lslpm', 'lsapm', 'lstrAcc',
       'dslpm', 'dsapm', 'dstrAcc', 'cslpm', 'csapm', 'cstrAcc', 'gslpm',
       'gsapm', 'gstrAcc', 'tdAvg', 'tdAcc', 'tdDef', 'subAvg', 'ctrlAvg',
       'name1', 'wins1', 'tko1', 'sub1', 'udec1', 'sdec1', 'height1',
       'weight1', 'reach1', 'dob1', 'slpm1', 'sapm1', 'strAcc1', 'strDef1',
       'hslpm1', 'hsapm1', 'hstrAcc1', 'bslpm1', 'bsapm1', 'bstrAcc1',
       'lslpm1', 'lsapm1', 'lstrAcc1', 'dslpm1', 'dsapm1', 'dstrAcc1',
       'cslpm1', 'csapm1', 'cstrAcc1', 'gslpm1', 'gsapm1', 'gstrAcc1',
       'tdAvg1', 'tdAcc1', 'tdDef1', 'subAvg1', 'ctrlAvg1', 'winner'],
      dtype='object')


# I will run best subset and see if it makes sense
## then ill run best subset on my selected model
## I think a good model will include 
- tko
- sub
- udec
- sdec
- height
- reach
- dob
- strapm
- strAcc   
- strDef
- hstrAcc
- bstrAcc
- lstrAcc
- dstrAcc
- cstrAcc
- gstrAcc
- tdAvg
- TdAcc
- tdDef
- subAvg
- ctrlAvg

  - try best subset where we just change strAcc preditctors. 
    - this way we can use all other predictors and find out exactly which of these matters
    - first model will have strAcc 
    - second will have all other acc 
    - best subset.

In [6]:
# Initialize the scaler
scaler = StandardScaler()
# iterate over the numerical columns and use this z-score to normalize
# probably not needed, but will want to try on things like wins, tko, age, height
# each normilizatoin step increases data complexity alot once we get to the 
# weightclass models 
# Now use the scaled features to train your logistic regression model


Checking the Variance inflation factor of the data

In [7]:
numerical_cols = pd.DataFrame()
for col in data.columns:
    if data[col].dtype != "object":
        numerical_cols[col] = data[col]
    else:
        print(col)
# I need to make the result numeric for 0-1
print(numerical_cols.columns)



name
name1
Index(['wins', 'tko', 'sub', 'udec', 'sdec', 'height', 'weight', 'reach',
       'dob', 'slpm', 'sapm', 'strAcc', 'strDef', 'hslpm', 'hsapm', 'hstrAcc',
       'bslpm', 'bsapm', 'bstrAcc', 'lslpm', 'lsapm', 'lstrAcc', 'dslpm',
       'dsapm', 'dstrAcc', 'cslpm', 'csapm', 'cstrAcc', 'gslpm', 'gsapm',
       'gstrAcc', 'tdAvg', 'tdAcc', 'tdDef', 'subAvg', 'ctrlAvg', 'wins1',
       'tko1', 'sub1', 'udec1', 'sdec1', 'height1', 'weight1', 'reach1',
       'dob1', 'slpm1', 'sapm1', 'strAcc1', 'strDef1', 'hslpm1', 'hsapm1',
       'hstrAcc1', 'bslpm1', 'bsapm1', 'bstrAcc1', 'lslpm1', 'lsapm1',
       'lstrAcc1', 'dslpm1', 'dsapm1', 'dstrAcc1', 'cslpm1', 'csapm1',
       'cstrAcc1', 'gslpm1', 'gsapm1', 'gstrAcc1', 'tdAvg1', 'tdAcc1',
       'tdDef1', 'subAvg1', 'ctrlAvg1', 'winner'],
      dtype='object')


In [8]:
# vif = pd.DataFrame()
# vif["Variable"] = numerical_cols.columns
# vif["VIF"] = [sm.OLS(numerical_cols[col], numerical_cols.drop(col, axis=1)).fit().rsquared for col in numerical_cols.columns]
# for i in range(len(vif)):
#     factor = 1/(1-vif.loc[i][1])
#     print(vif.loc[i]["Variable"],factor)

In [17]:
model = LogisticRegression(max_iter=5000)
# MAIN metric is BIC
# precision -> tp / (tp + fp) :: how precisce are true predictions
# recall -> tp / (tp + fn) :: all true predictions 
# accuracy -> (tp + tn) / total :: total correct
modelData = data.drop(["name","name1"], axis=1)
x_train, x_test, y_train, y_test = train_test_split(modelData.drop(["winner"],axis=1), 
                                                    modelData["winner"], 
                                                    test_size=0.2, 
                                                    random_state=43)

model = model.fit(x_train, y_train)
y_predicted = model.predict(x_test)
results = pd.DataFrame({"Index":y_test.index.values,
                        "y_test": y_test.values,
                        "y_pred": y_predicted} )
print(results)

     Index  y_test  y_pred
0     2353       0       0
1     2066       0       0
2     3175       1       1
3     3452       0       0
4     1604       0       0
..     ...     ...     ...
960    891       0       1
961   2732       0       0
962   3426       1       0
963   2883       1       0
964    294       0       0

[965 rows x 3 columns]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## metrics for model with all 72 predictors

In [16]:
# MAIN metric is BIC
# precision -> tp / (tp + fp) :: true pos / all true
# recall -> tp / (tp + fn) :: true predictions / actual true in data
# accuracy -> (tp + tn) / total :: total correct
# Calculate precision
def printMetrics(model, results):
    precision = precision_score(results["y_test"], results["y_pred"], average="macro")

    # # Calculate recall
    recall = recall_score(results["y_test"], results["y_pred"], average="macro")

    # # Calculate accuracy
    accuracy = accuracy_score(results["y_test"], results["y_pred"])

    # print(results["y_test"].values, results["y_pred"])
    # # Calculate BIC
    # BIC is not a standard metric provided by scikit-learn, but you can calculate it using log_loss
    log_loss_value = log_loss(results["y_test"].values, results["y_pred"].values,labels=[0, 1])
    n = len(results["y_test"])  # Number of samples
    p = len(model.coef_[0]) + 1  # Number of parameters (including intercept)
    bic = log_loss_value + 0.5 * p * np.log(n)

    predictors = [list(model.feature_names_in_)]

    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Accuracy: {accuracy}")
    print(f"BIC: {bic}")
    print(f"Predictors: {predictors}")
    print(f"number of predictors: {p}")
printMetrics(model, results)

Precision: 0.4514333045231922
Recall: 0.4593913007274176
Accuracy: 0.6766839378238342
BIC: 261.8138509283035
Predictors: [['wins', 'tko', 'sub', 'udec', 'sdec', 'height', 'weight', 'reach', 'dob', 'slpm', 'sapm', 'strAcc', 'strDef', 'hslpm', 'hsapm', 'hstrAcc', 'bslpm', 'bsapm', 'bstrAcc', 'lslpm', 'lsapm', 'lstrAcc', 'dslpm', 'dsapm', 'dstrAcc', 'cslpm', 'csapm', 'cstrAcc', 'gslpm', 'gsapm', 'gstrAcc', 'tdAvg', 'tdAcc', 'tdDef', 'subAvg', 'ctrlAvg', 'wins1', 'tko1', 'sub1', 'udec1', 'sdec1', 'height1', 'weight1', 'reach1', 'dob1', 'slpm1', 'sapm1', 'strAcc1', 'strDef1', 'hslpm1', 'hsapm1', 'hstrAcc1', 'bslpm1', 'bsapm1', 'bstrAcc1', 'lslpm1', 'lsapm1', 'lstrAcc1', 'dslpm1', 'dsapm1', 'dstrAcc1', 'cslpm1', 'csapm1', 'cstrAcc1', 'gslpm1', 'gsapm1', 'gstrAcc1', 'tdAvg1', 'tdAcc1', 'tdDef1', 'subAvg1', 'ctrlAvg1']]
number of predictors: 73


  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
base_model_predictors = [
    "wins",
    "wins1",
    "tko",
    "tko1",
    "udec",
    "udec1",
    "sdec",
    "sdec1",
    "height",
    "height1",
    "reach",
    "reach1",
    "dob",
    "dob1",
    "sapm",
    "sapm1",
    "strDef",
    "strDef1",
    "strAcc",
    "strAcc1",
    "tdAvg",
    "tdAvg1",
    "tdAcc",
    "tdAcc1",
    "tdDef",
    "tdDef1",
    "subAvg",
    "subAvg1",
    "ctrlAvg",
    "ctrlAvg1",
    "winner"
]
base_data = modelData[base_model_predictors]
base_model = LogisticRegression(max_iter=5000)
# MAIN metric is BIC
# precision -> tp / (tp + fp) :: how precisce are true predictions
# recall -> tp / (tp + fn) :: all true predictions 
# accuracy -> (tp + tn) / total :: total correct
x_train, x_test, y_train, y_test = train_test_split(base_data.drop(["winner"],axis=1), 
                                                    base_data["winner"], 
                                                    test_size=0.2, 
                                                    random_state=43)

base_model = model.fit(x_train, y_train)
y_predicted = model.predict(x_test)
results = pd.DataFrame({"Index":y_test.index.values,
                        "y_test": y_test.values,
                        "y_pred": y_predicted} )
printMetrics(base_model, results)


Precision: 0.4518064516129032
Recall: 0.46026711632139605
Accuracy: 0.677720207253886
BIC: 117.46180986403027
Predictors: [['wins', 'wins1', 'tko', 'tko1', 'udec', 'udec1', 'sdec', 'sdec1', 'height', 'height1', 'reach', 'reach1', 'dob', 'dob1', 'sapm', 'sapm1', 'strDef', 'strDef1', 'strAcc', 'strAcc1', 'tdAvg', 'tdAvg1', 'tdAcc', 'tdAcc1', 'tdDef', 'tdDef1', 'subAvg', 'subAvg1', 'ctrlAvg', 'ctrlAvg1']]
number of predictors: 31


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
spread_str_predictors = [
    "wins",
    "wins1",
    "tko",
    "tko1",
    "udec",
    "udec1",
    "sdec",
    "sdec1",
    "height",
    "height1",
    "reach",
    "reach1",
    "dob",
    "dob1",
    "sapm",
    "sapm1",
    "strDef",
    "strDef1",
    # "strAcc",
    # "strAcc1",
    "hstrAcc",
    "hstrAcc1",
    "bstrAcc",
    "bstrAcc1",
    "lstrAcc",
    "lstrAcc1",
    "dstrAcc",
    "dstrAcc1",
    "cstrAcc",
    "cstrAcc1",
    "gstrAcc",
    "gstrAcc1",
    "tdAvg",
    "tdAvg1",
    "tdAcc",
    "tdAcc1",
    "tdDef",
    "tdDef1",
    "subAvg",
    "subAvg1",
    "ctrlAvg",
    "ctrlAvg1",
    "winner"
]
spread_str_data = modelData[spread_str_predictors]
spread_model = LogisticRegression(max_iter=5000)
# MAIN metric is BIC
# precision -> tp / (tp + fp) :: how precisce are true predictions
# recall -> tp / (tp + fn) :: all true predictions 
# accuracy -> (tp + tn) / total :: total correct
x_train, x_test, y_train, y_test = train_test_split(spread_str_data.drop(["winner"],axis=1), 
                                                    spread_str_data["winner"], 
                                                    test_size=0.2, 
                                                    random_state=43)

spread_model = model.fit(x_train, y_train)
y_predicted = model.predict(x_test)
results = pd.DataFrame({"Index":y_test.index.values,
                        "y_test": y_test.values,
                        "y_pred": y_predicted} )
printMetrics(spread_model, results)

Precision: 0.46077419354838706
Recall: 0.4694118801634459
Accuracy: 0.6911917098445596
BIC: 151.33688820071637
Predictors: [['wins', 'wins1', 'tko', 'tko1', 'udec', 'udec1', 'sdec', 'sdec1', 'height', 'height1', 'reach', 'reach1', 'dob', 'dob1', 'sapm', 'sapm1', 'strDef', 'strDef1', 'hstrAcc', 'hstrAcc1', 'bstrAcc', 'bstrAcc1', 'lstrAcc', 'lstrAcc1', 'dstrAcc', 'dstrAcc1', 'cstrAcc', 'cstrAcc1', 'gstrAcc', 'gstrAcc1', 'tdAvg', 'tdAvg1', 'tdAcc', 'tdAcc1', 'tdDef', 'tdDef1', 'subAvg', 'subAvg1', 'ctrlAvg', 'ctrlAvg1']]
number of predictors: 41


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
