In [26]:
# Import our libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, make_scorer
from sklearn.ensemble import VotingClassifier
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import keras
import warnings
warnings.filterwarnings("ignore")

In [12]:
# Load in our X (processed_fights.csv) and y (processed_data_labels.csv)
X = pd.read_csv('data/processed/processed_fights.csv')
y = pd.read_csv('data/processed/processed_data_labels.csv')

In [27]:
X = pd.read_csv('data/permutations/minus_all_11.csv')
y = pd.read_csv('data/processed/processed_data_labels.csv')

In [28]:
# Create our train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('Training set shape:', X_train.shape, y_train.shape)
print('Testing set shape:', X_test.shape, y_test.shape)
print('Number of 0s in test =', (y_test == 0).sum().sum())
print('Number of 1s in test =', (y_test == 1).sum().sum())

Training set shape: (4164, 24) (4164, 1)
Testing set shape: (1042, 24) (1042, 1)
Number of 0s in test = 532
Number of 1s in test = 510


In [4]:
# # Standardize non-categorical features
# features_to_scale = ['f1_reach', 'f1_sapm', 'f1_slpm',
#        'f1_stk_acc', 'f1_stk_def', 'f1_sub_avg', 'f1_td_acc', 'f1_td_avg',
#        'f1_td_def', 'f1_weight', 'f2_reach', 'f2_sapm', 'f2_slpm',
#        'f2_stk_acc', 'f2_stk_def', 'f2_sub_avg', 'f2_td_acc', 'f2_td_avg',
#        'f2_td_def', 'f2_weight', 'f1_wins', 'f1_losses', 'f1_draws', 'f2_wins',
#        'f2_losses', 'f2_draws', 'f1_age', 'f2_age', 'f1_height', 'f2_height']

# scaler = StandardScaler()
# X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
# X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

# # Save our scaler for later reference
# with open('scaler.pkl', 'wb') as f:
#     pickle.dump(scaler, f)

In [None]:
# Train a logistic regression model
lr = LogisticRegressionCV(penalty='l1', solver='liblinear', random_state=42)
lr.fit(X_train, y_train)

# Determine our logistic regression model accuracy and AUC
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, lr.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)
print('Accuracy:', accuracy)
plt.plot(fpr, tpr, label='Logistic Regression (AUC = %0.2f)' % roc_auc)
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

In [29]:
# Train on multiple models
classifiers = [
    LogisticRegressionCV(random_state=42),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
]

classifiers_names = [
    'Logistic Regression',
    'K Neighbors',
    'Decision Tree Classifier',
    'RandomForest Classifier',
    'GradientBoostingClassifier',
    'AdaBoostClassifier'
]

for index, clf in enumerate(classifiers):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
    roc_auc = auc(fpr, tpr)
    print(classifiers_names[index])
    print('Accuracy:', accuracy)
    print('AUC: %0.2f' % roc_auc)
    print()

    with open('models/{}.pkl'.format(classifiers_names[index]), 'wb') as f:
        pickle.dump(clf, f)


Logistic Regression
Accuracy: 0.6938579654510557
AUC: 0.76

K Neighbors
Accuracy: 0.5921305182341651
AUC: 0.64

Decision Tree Classifier
Accuracy: 0.6017274472168906
AUC: 0.60

RandomForest Classifier
Accuracy: 0.6880998080614203
AUC: 0.76

GradientBoostingClassifier
Accuracy: 0.6967370441458733
AUC: 0.78

AdaBoostClassifier
Accuracy: 0.6813819577735125
AUC: 0.76



In [7]:
# Define the layers of the neural network
model = keras.Sequential([
    keras.layers.Dense(64, input_dim=X_train.shape[1], activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the testing set
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

# Make predictions on new data
# predictions = model.predict(subtracted_fights_data)


Train on 4164 samples, validate on 1042 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

In [8]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# import tensorflow as tf
# from tensorflow import keras

# # Load the preprocessed data and labels
# data = pd.read_csv('new_processed_fights_data.csv')
# #originalData = pd.read_csv('processed_fights.csv')
# labels = pd.read_csv('processed_data_labels.csv')

# # Add new feature to data set
# #data['fighter1_win_pct'] = data['wins_fighter1'] / (data['wins_fighter1'] + data['losses_fighter1'])
# #data['fighter2_win_pct'] = data['wins_fighter2'] / (data['wins_fighter2'] + data['losses_fighter2'])

# # Drop the original wins, losses, and draws columns from the data set
# #data = data.drop(columns=['wins_fighter1', 'losses_fighter1', 'draws_fighter1',
# #                         'wins_fighter2', 'losses_fighter2', 'draws_fighter2'])

# # Split the data and labels into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)

# # Define the layers of the neural network
# model = keras.Sequential([
#     keras.layers.Dense(64, input_dim=X_train.shape[1], activation='relu'),
#     keras.layers.Dense(32, activation='relu'),
#     keras.layers.Dense(1, activation='sigmoid')
# ])

# # Compile the model
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Train the model
# model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# # Evaluate the model on the testing set
# test_loss, test_acc = model.evaluate(X_test, y_test)
# print('Test accuracy:', test_acc)


In [30]:
# Load the previously trained models
with open('models/Logistic Regression.pkl', 'rb') as f:
    lr = pickle.load(f)

with open('models/GradientBoostingClassifier.pkl', 'rb') as f:
    gbc = pickle.load(f)
    
with open('models/AdaBoostClassifier.pkl', 'rb') as f:
    abc = pickle.load(f)

with open('models/RandomForest Classifier.pkl', 'rb') as f:
   rfc = pickle.load(f)

with open('models/Decision Tree Classifier.pkl', 'rb') as f:
   dtc = pickle.load(f)

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=[
    ('lr', lr),
    ('gbc', gbc),
    ('abc',abc),
    # ('rfc',rfc),
    # ('dtc',dtc)
    ], voting='hard')

# Fit the ensemble model to the training data
ensemble_model.fit(X_train, y_train)

# Predict the labels of the test set using the ensemble model
y_pred_ensemble = ensemble_model.predict(X_test)

# Calculate the accuracy of the ensemble model
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print('Ensemble model accuracy:', accuracy_ensemble)

with open('models/voteClass.pkl', 'wb') as f:
    pickle.dump(ensemble_model, f)


Ensemble model accuracy: 0.6919385796545106


In [10]:
from mlxtend.classifier import StackingClassifier

# Initialize the StackingClassifier
stacked_model = StackingClassifier(
    classifier=[lr, gbc, abc, rfc, dtc],
    meta_classifier=LogisticRegression(),
    use_probas=True,
    average_probas=False
)

# Train the stacked model using 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    stacked_model.fit(X_train_fold, y_train_fold)
    val_pred_fold = stacked_model.predict(X_val_fold)
    print('Fold accuracy:', accuracy_score(y_val_fold, val_pred_fold))

# Make predictions on the test set
test_pred = stacked_model.predict(X_test)
print('Test set accuracy:', accuracy_score(y_test, test_pred))

TypeError: StackingClassifier.__init__() got an unexpected keyword argument 'classifier'

In [None]:
# import pandas as pd
# import pickle
# from sklearn.metrics import accuracy_score

# # Load the models
# with open('lr.pkl', 'rb') as f:
#     lr = pickle.load(f)
# with open('gbc_gs.pkl', 'rb') as f:
#     gbc = pickle.load(f)


# # Load the data
# X_test = pd.read_csv('processed_fights.csv')
# y_test = pd.read_csv('processed_data_labels.csv')

# # Make predictions on the test data
# lr_pred = lr.predict(X_test)
# gbc_pred = gbc.predict(X_test)

# # Compute the accuracy of each model
# lr_acc = accuracy_score(y_test, lr_pred)
# gbc_acc = accuracy_score(y_test, gbc_pred)

# # Print the accuracy of each model
# print(f"Logistic Regression Accuracy: {lr_acc}")
# print(f"Gradient Boosting Accuracy: {gbc_acc}")

# # Create a blended prediction by taking a weighted average of the individual model predictions
# blended_pred = (0.3 * lr_pred) + (0.5 * gbc_pred)

# # Compute the accuracy of the blended prediction
# blended_acc = accuracy_score(y_test, blended_pred)

# # Print the accuracy of the blended prediction
# print(f"Blended Accuracy: {blended_acc}")
