In [2]:
# install dependencies
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("match_data.csv")


Columns (89,91,97,98) have mixed types. Specify dtype option on import or set low_memory=False.



In [4]:
df.head()

Unnamed: 0,Club ID,Club Name,Competition Name,Match ID,Period Number,Period Type,Assists,Assists Defensive,Assists Turnover Ratio,Bench Points,...,Basketball Matches - Match → Home Name,Basketball Matches - Match → Away Name,Basketball Matches - Match → Home ID,Basketball Matches - Match → Away ID,Basketball Matches - Match → Home Score,Basketball Matches - Match → Away Score,Basketball Matches - Match → Include In Conf Stats,Basketball Matches - Match → Include In Standings,Basketball Matches - Match → Place If Won,Basketball Matches - Match → Place If Lost
0,17257.0,Drake,2018-19 Men's Basketball,1007583,0,REGULAR,16,0,1.45,48,...,Drake,Buena Vista,103544.0,103383.0,98.0,52.0,0.0,0.0,0.0,0.0
1,17096.0,Buena Vista,2018-19 Men's Basketball,1007583,0,REGULAR,10,0,0.67,31,...,Drake,Buena Vista,103544.0,103383.0,98.0,52.0,0.0,0.0,0.0,0.0
2,18243.0,Col. of Idaho,2018-19 Men's Basketball,1007743,0,REGULAR,15,0,0.83,33,...,Utah,Col. of Idaho,104389.0,106093.0,96.0,76.0,0.0,0.0,0.0,0.0
3,18096.0,Utah,2018-19 Men's Basketball,1007743,0,REGULAR,22,0,2.44,38,...,Utah,Col. of Idaho,104389.0,106093.0,96.0,76.0,0.0,0.0,0.0,0.0
4,17296.0,Emory,2018-19 Men's Basketball,1007998,0,REGULAR,3,0,0.75,15,...,North Georgia,Emory,104002.0,103583.0,88.0,92.0,0.0,0.0,0.0,0.0


In [5]:
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 634523 entries, 0 to 634522
Columns: 107 entries, Club ID to Basketball Matches - Match → Place If Lost
dtypes: float64(34), int64(63), object(10)
memory usage: 518.0+ MB


Club ID                                                   11
Club Name                                                 11
Competition Name                                           0
Match ID                                                   0
Period Number                                              0
                                                       ...  
Basketball Matches - Match → Away Score               177125
Basketball Matches - Match → Include In Conf Stats    167061
Basketball Matches - Match → Include In Standings     167061
Basketball Matches - Match → Place If Won             160075
Basketball Matches - Match → Place If Lost            160075
Length: 107, dtype: int64

In [6]:
# Data preprocessing

# Drop rows with NaN values
# Drop irrelevant columns
df.drop(['Club ID', 'Match ID', 'Period Number', 'Period Type', 'Biggest Lead', 'Biggest Lead Score', 'Biggest Scoring Run Score', 'Basketball Matches - Match → Status', 'Basketball Matches - Match → Away Score', 'Basketball Matches - Match → Include In Conf Stats', 'Basketball Matches - Match → Include In Standings', 'Basketball Matches - Match → Place If Won', 'Basketball Matches - Match → Place If Lost', 'Basketball Matches - Match → Attendance', 'Basketball Matches - Match → Weather', 'Basketball Matches - Match → Is Conference', 'Basketball Matches - Match → Full Time Played'], axis=1, inplace=True)
df.head()

Unnamed: 0,Club Name,Competition Name,Assists,Assists Defensive,Assists Turnover Ratio,Bench Points,Biggest Scoring Run,Blocks,Blocks Received,Defensive Points Per Possession,...,Team ID,Team Name,Basketball Matches - Match → Extra Periods Used,Basketball Matches - Match → Match Time Utc,Basketball Matches - Match → Tournament ID,Basketball Matches - Match → Home Name,Basketball Matches - Match → Away Name,Basketball Matches - Match → Home ID,Basketball Matches - Match → Away ID,Basketball Matches - Match → Home Score
0,Drake,2018-19 Men's Basketball,16,0,1.45,48,0,11,0,0.71,...,103544,Drake,0.0,2018-11-09,0.0,Drake,Buena Vista,103544.0,103383.0,98.0
1,Buena Vista,2018-19 Men's Basketball,10,0,0.67,31,0,0,11,1.34,...,103383,Buena Vista,0.0,2018-11-09,0.0,Drake,Buena Vista,103544.0,103383.0,98.0
2,Col. of Idaho,2018-19 Men's Basketball,15,0,0.83,33,11,1,4,1.25,...,106093,Col. of Idaho,0.0,2018-11-01,0.0,Utah,Col. of Idaho,104389.0,106093.0,96.0
3,Utah,2018-19 Men's Basketball,22,0,2.44,38,9,4,1,0.99,...,104389,Utah,0.0,2018-11-01,0.0,Utah,Col. of Idaho,104389.0,106093.0,96.0
4,Emory,2018-19 Men's Basketball,3,0,0.75,15,6,2,2,1.22,...,103583,Emory,0.0,2018-11-05,0.0,North Georgia,Emory,104002.0,103583.0,88.0


In [7]:
# Feature selection/engineering
# Identify relevant features and target variable
X = df.drop(['Club Name', 'Competition Name'], axis=1)  # Features
y = df['Competition Name']  # Target variable

# One-hot encode categorical features
X = pd.get_dummies(X)

In [8]:
df.head()

Unnamed: 0,Club Name,Competition Name,Assists,Assists Defensive,Assists Turnover Ratio,Bench Points,Biggest Scoring Run,Blocks,Blocks Received,Defensive Points Per Possession,...,Team ID,Team Name,Basketball Matches - Match → Extra Periods Used,Basketball Matches - Match → Match Time Utc,Basketball Matches - Match → Tournament ID,Basketball Matches - Match → Home Name,Basketball Matches - Match → Away Name,Basketball Matches - Match → Home ID,Basketball Matches - Match → Away ID,Basketball Matches - Match → Home Score
0,Drake,2018-19 Men's Basketball,16,0,1.45,48,0,11,0,0.71,...,103544,Drake,0.0,2018-11-09,0.0,Drake,Buena Vista,103544.0,103383.0,98.0
1,Buena Vista,2018-19 Men's Basketball,10,0,0.67,31,0,0,11,1.34,...,103383,Buena Vista,0.0,2018-11-09,0.0,Drake,Buena Vista,103544.0,103383.0,98.0
2,Col. of Idaho,2018-19 Men's Basketball,15,0,0.83,33,11,1,4,1.25,...,106093,Col. of Idaho,0.0,2018-11-01,0.0,Utah,Col. of Idaho,104389.0,106093.0,96.0
3,Utah,2018-19 Men's Basketball,22,0,2.44,38,9,4,1,0.99,...,104389,Utah,0.0,2018-11-01,0.0,Utah,Col. of Idaho,104389.0,106093.0,96.0
4,Emory,2018-19 Men's Basketball,3,0,0.75,15,6,2,2,1.22,...,103583,Emory,0.0,2018-11-05,0.0,North Georgia,Emory,104002.0,103583.0,88.0


In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the decision tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict probabilities for each team reaching each tournament round
y_probabilities = model.predict_proba(X)

In [10]:
# Define the tournament rounds
tournament_rounds = ['Round of 64', 'Round of 32', 'Sweet Sixteen', 'Elite Eight', 'Final Four', 'Championship Game']

# Aggregate probabilities for each team across all tournament rounds
team_probabilities = {}
for i, team in enumerate(df['Club Name']):
    team_probabilities[team] = {}
    for j, round_name in enumerate(tournament_rounds):
        team_probabilities[team][round_name] = y_probabilities[i][j]

# Example: Print probabilities for a specific team
team_name = 'Gonzaga'
print(f"Probabilities for {team_name}:")
for round_name, probability in team_probabilities[team_name].items():
    print(f"{round_name}: {probability:.2f}")

Probabilities for Gonzaga:
Round of 64: 0.04
Round of 32: 0.00
Sweet Sixteen: 0.15
Elite Eight: 0.31
Final Four: 0.34
Championship Game: 0.15


In [12]:
import json

# Define the file path for exporting the dictionary
output_file_path = 'team_probabilities.json'

# Export the team_probabilities dictionary to a JSON file
with open(output_file_path, 'w') as f:
    json.dump(team_probabilities, f)