In [1]:
import os

import joblib
import pandas as pd

In [2]:
# Restore the pickled classifier and feature scaler, in that order.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code --
# only load artifact files that come from a trusted source.
clf, scaler = (
    joblib.load(artifact)
    for artifact in ('logistic_regression_model.pkl', 'scaler.pkl')
)

In [3]:
# Load the dataset.
# The CSV location can be overridden without editing the notebook by setting
# the CBB_CSV_PATH environment variable; when it is unset, the original
# hard-coded path is used as the fallback.
file_path = os.environ.get(
    'CBB_CSV_PATH',
    '/Users/nanakwasi/Downloads/archive/cbb.csv',
)
df = pd.read_csv(file_path)

In [4]:
# Current CAA member schools, stored lower-cased so they can be compared
# against the lower-cased TEAM column below.
caa_member_schools = [
    name.lower()
    for name in (
        'College of Charleston', 'Delaware', 'Drexel', 'Elon',
        'Hampton', 'Hofstra', 'Monmouth', 'North Carolina A&T',
        'UNC Wilmington', 'Northeastern', 'Stony Brook',
        'Towson', 'William & Mary',
    )
]

# Lower-case the team names in the full dataset (this overwrites df['TEAM']),
# then keep an independent copy of only the rows for CAA member schools.
df['TEAM'] = df['TEAM'].str.lower()
is_caa_member = df['TEAM'].isin(caa_member_schools)
caa_df = df.loc[is_caa_member].copy()

In [5]:
# Fill missing values: POSTSEASON gets the placeholder string 'None',
# SEED gets 0.
for column, filler in {'POSTSEASON': 'None', 'SEED': 0}.items():
    caa_df[column] = caa_df[column].fillna(filler)

In [6]:
# Make POSTSEASON numeric: values whose string form is all digits become that
# integer; everything else (including the 'None' placeholder) becomes 0.
# NOTE(review): if POSTSEASON holds text labels rather than digits, every row
# ends up as 0 -- confirm this matches how the model was trained.
def _postseason_to_int(value):
    text = str(value)
    return int(text) if text.isdigit() else 0


caa_df['POSTSEASON'] = caa_df['POSTSEASON'].map(_postseason_to_int)

In [7]:
# LOSSES = games played minus wins; WIN is 1 when a row has more wins than
# losses and 0 otherwise.
caa_df['LOSSES'] = caa_df['G'].sub(caa_df['W'])
caa_df['WIN'] = caa_df['W'].gt(caa_df['LOSSES']).astype(int)

In [8]:
# Columns fed to the scaler and classifier, in this exact order.
# NOTE(review): presumably this must match the columns the scaler/model were
# fitted on -- confirm before reordering or adding entries.
selected_features = [
    'G',
    'ADJOE', 'ADJDE',
    'EFG_O', 'EFG_D',
    'TOR', 'TORD',
    'ORB', 'DRB',
    'FTR', 'FTRD',
    '2P_O', '2P_D',
    '3P_O', '3P_D',
    'ADJ_T', 'WAB', 'YEAR', 'POSTSEASON',
]

In [9]:
def predict_matchup(team1, team2):
    """Predict which of two teams the model favours in a head-to-head matchup.

    Each team is represented by its most recent season in ``caa_df`` (the row
    with the highest ``YEAR``), restricted to ``selected_features``. Both rows
    are scaled with ``scaler`` and scored with ``clf.predict_proba``; the team
    whose row gets the higher probability in column 1 is labelled 'Win' and the
    other 'Lose'.

    Parameters
    ----------
    team1, team2 : str
        Team names. Matching is case-insensitive and ignores leading/trailing
        whitespace.

    Returns
    -------
    dict or str
        On success, a dict with keys 'team1', 'team2' (the normalized names),
        'team1_outcome' and 'team2_outcome' (each 'Win' or 'Lose').
        If either team has no row in ``caa_df``, an error message string is
        returned instead (team1 is checked first).

    Notes
    -----
    When both probabilities are equal, team1 is labelled 'Lose' because the
    comparison is a strict ``>``.
    """
    # Normalize the names the same way the TEAM column was normalized
    # (lower-case); stripping guards against stray spaces from typed input.
    team1 = team1.strip().lower()
    team2 = team2.strip().lower()

    # Check that both teams exist in the dataset
    known_teams = set(caa_df['TEAM'])
    for team in (team1, team2):
        if team not in known_teams:
            return f"Error: Team {team} not found in the dataset."

    # Use each team's latest season rather than whichever row happens to come
    # first in the CSV, so both teams are described by comparable, current data.
    # The stable sort keeps file order among rows that share the same YEAR.
    latest_stats = []
    for team in (team1, team2):
        season_rows = caa_df[caa_df['TEAM'] == team].sort_values(
            'YEAR', ascending=False, kind='stable'
        )
        latest_stats.append(season_rows[selected_features].iloc[0])

    # One row per team, in (team1, team2) order, with every column numeric
    matchup_data = pd.DataFrame(latest_stats).apply(pd.to_numeric)

    # Standardize the data, then score both rows
    matchup_scaled = scaler.transform(matchup_data)
    probabilities = clf.predict_proba(matchup_scaled)

    # Column 1 is presumably the "winning record" class -- confirm against the
    # training notebook. The team with the higher value is predicted to win.
    outcome1 = 'Win' if probabilities[0][1] > probabilities[1][1] else 'Lose'
    outcome2 = 'Win' if outcome1 == 'Lose' else 'Lose'

    return {
        'team1': team1,
        'team2': team2,
        'team1_outcome': outcome1,
        'team2_outcome': outcome2,
    }

In [10]:
# Manual check: prompt for two team names and print the predicted outcome
team1 = input("Enter the name of Team 1: ")
team2 = input("Enter the name of Team 2: ")

result = predict_matchup(team1, team2)

# predict_matchup returns a plain string when a team cannot be found,
# and a dict of outcomes otherwise.
if not isinstance(result, str):
    print(f"Prediction for {team1} vs. {team2}:")
    print("\n".join(
        f"{result[side]} will {result[side + '_outcome']}"
        for side in ('team1', 'team2')
    ))
else:
    print(result)

Enter the name of Team 1:  drexel
Enter the name of Team 2:  elon


Prediction for drexel vs. elon:
drexel will Lose
elon will Win
