In [1]:
# imports

import re
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [19]:
## data source: https://masseyratings.com/scores.php?s=cf2022&sub=ncaa-d1&all=1&sch=on

## read the raw score file into a list with one string per line (newlines kept)
with open('cf2022.txt') as f:
    lines = list(f)

    
def clean_data(data):
    """Parse raw score lines into a team-code lookup and a table of games.

    Each usable line must contain a ``YYYY-MM-DD`` date, two team names that
    are each followed by a number, and two scores. An ``@`` in a team name
    marks that team as the host. Lines that do not provide all three pieces
    (for example blank or header lines) are skipped; the previous version
    either crashed on them or silently reused values from the previous line.

    A game whose two scores are equal is treated as not yet played: its
    scores, score difference and winner are stored as NaN.

    Parameters
    ----------
    data : iterable of str
        Raw text lines, one game per line.

    Returns
    -------
    team_code : dict
        Maps each team name (``@`` removed) to an integer code. Codes start
        at 500 and are handed out in order of first appearance.
    games : pandas.DataFrame
        One row per parsed game with columns ``date``, ``team 1``, ``team 2``,
        ``score1``, ``score2``, ``score_diff`` (absolute margin),
        ``host_team`` (code of the hosting team, 0 when neither team is
        marked with ``@``), ``winner`` (team name or NaN), ``team 1 code``
        and ``team 2 code``.
    """
    date_pattern = re.compile(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')
    ## a run of name characters that ends right before a digit (the score)
    team_pattern = re.compile(r"['@A-Za-z\s&]+?(?=[0-9])")
    score_pattern = re.compile(r'[0-9]{1,3}')
    columns = ['date', 'team 1', 'team 2', 'score1', 'score2', 'score_diff', 'host_team', 'winner']

    games = []
    team_code = {}
    team_number = 500

    for line in data:
        dates = date_pattern.findall(line)
        if not dates:
            continue ## not a game line (blank line, header, ...)
        date = dates[0]
        line = date_pattern.sub('', line).strip() ## removing dates from line

        teams = team_pattern.findall(line)
        if len(teams) < 2:
            continue ## cannot identify both teams; do not reuse the previous line's
        team1 = teams[0].strip()
        team2 = teams[1].strip()
        line = team_pattern.sub(' ', line) ## removing teams from line

        scores = score_pattern.findall(line)
        if len(scores) < 2:
            continue ## cannot identify both scores; do not reuse the previous line's
        score1 = int(scores[0])
        score2 = int(scores[1])

        ## the '@' prefix marks the host; strip it so the name is the same home or away
        team1_is_host = '@' in team1
        team2_is_host = '@' in team2
        team1 = team1.replace('@', '')
        team2 = team2.replace('@', '')

        ## assign the next free number to teams seen for the first time
        for team in (team1, team2):
            if team not in team_code:
                team_code[team] = team_number
                team_number += 1

        if team1_is_host:
            host_team = team_code[team1]
        elif team2_is_host:
            host_team = team_code[team2]
        else:
            host_team = 0 ## neither team marked as host (neutral site)

        if score1 == score2:
            ## equal scores (e.g. 0-0 placeholders) mean the game has not happened yet
            score1 = score2 = score_diff = winner = np.nan
        else:
            score_diff = abs(score1 - score2)
            winner = team1 if score1 > score2 else team2 ## winner is the team NAME

        games.append((date, team1, team2, score1, score2, score_diff, host_team, winner))

    ## passing columns= here keeps the schema even when no line could be parsed
    games = pd.DataFrame(games, columns=columns)
    games['team 1 code'] = games['team 1'].map(team_code)
    games['team 2 code'] = games['team 2'].map(team_code)

    return team_code, games

In [20]:
## parse the raw score lines into a team -> code lookup and a table of games
team_code, games = clean_data(lines)

## games with no winner have not been played yet: these are the ones we want to predict.
## .copy() makes bowl_games an independent frame, so adding prediction columns to it
## later does not write into a slice of `games` (pandas' SettingWithCopyWarning).
bowl_games = games[games['winner'].isnull()].copy()
games = games.dropna() ## taking predicted games out of data set

In [21]:
## predicting scores for bowl games

## features: the two team codes; target: absolute score margin of completed games
pred_score_X = games[['team 1 code', 'team 2 code']]
pred_score_y = games['score_diff']

## note: X_test / y_test are held out by the split but are not used for evaluation here
X_train, X_test, y_train, y_test = train_test_split(pred_score_X, pred_score_y, random_state = 42)
scores_model = RandomForestRegressor(random_state = 42)
scores_model.fit(X_train, y_train)

y_pred = scores_model.predict(bowl_games[['team 1 code', 'team 2 code']])
## assign() returns a new frame instead of writing into a filtered slice of `games`,
## which is what raised SettingWithCopyWarning with the in-place column assignment
bowl_games = bowl_games.assign(score_diff = y_pred.round())

In [49]:
## predicting winner of bowl games

## features: both team codes plus the score margin; target: name of the winning team
games_X_train = games[['team 1 code', 'team 2 code', 'score_diff']]
games_y_train = games['winner']

## hold out 20% of the completed games for scoring the classifier
X_train, X_test, y_train, y_test = train_test_split(
    games_X_train, games_y_train, test_size = 0.2, random_state = 42
)

winner_model = DecisionTreeClassifier(random_state = 3)
winner_model.fit(X_train, y_train)

## score of the fitted tree on the held-out games
score = winner_model.score(X_test, y_test)
print(score)

0.9387096774193548


In [23]:
## silences pandas' chained-assignment warning for the rest of the session;
## kept for any later cell that relies on it (the code below no longer needs it)
pd.options.mode.chained_assignment = None

# upcoming bowl games

X_bowl_games = bowl_games[['team 1 code', 'team 2 code', 'score_diff']]
y_pred = winner_model.predict(X_bowl_games)

## assign() builds a new frame with the prediction column instead of writing into
## a column-subset slice of bowl_games
X_bowl_games = X_bowl_games.assign(predicted_winner = y_pred)

## predicted_winner already holds team names (the model is trained on the
## 'winner' column), so no code -> name mapping is needed
X_bowl_games

Unnamed: 0,team 1 code,team 2 code,score_diff,predicted_winner
1546,571,730,14.0,Samford
1547,591,743,26.0,William & Mary
1548,736,750,29.0,Incarnate Word
1549,613,636,15.0,Army
1550,605,657,20.0,Holy Cross
1551,543,626,23.0,UAB
1552,684,670,14.0,UT San Antonio
1553,763,721,28.0,Jackson St
1554,650,645,12.0,Cincinnati
1555,642,667,15.0,BYU


In [27]:
# who will win the national championship?

# row labels in X_bowl_games of the two games whose predicted winners are paired up below
semifinal_rows = [1586, 1587]
semifinal_winners = X_bowl_games.loc[semifinal_rows, 'predicted_winner']

# one-row frame for the matchup; host_team 0 means neither team is marked as host,
# and the two NaN columns are filled in by the models below
championship_game = pd.DataFrame([{
    'team 1 code': team_code[semifinal_winners.loc[semifinal_rows[0]]],
    'team 2 code': team_code[semifinal_winners.loc[semifinal_rows[1]]],
    'host_team': 0,
    'score_diff': np.nan,
    'predicted_winner': np.nan,
}])

# score differential for championship game
y_pred = scores_model.predict(championship_game[['team 1 code', 'team 2 code']])
championship_game['score_diff'] = y_pred.round()

# predicted winner, from the team codes plus the predicted score differential
X_championship_game = championship_game[['team 1 code', 'team 2 code', 'score_diff']]
y_pred = winner_model.predict(X_championship_game)
championship_game['predicted_winner'] = y_pred

In [37]:
# all bowl games predictions

# stack the predicted winners of the bowl games and of the championship game
# into a single Series with a fresh 0..n-1 index
bowl_game_winners = pd.concat(
    [X_bowl_games['predicted_winner'], championship_game['predicted_winner']],
    ignore_index=True,
)

In [38]:
# display the predicted winner of every bowl game; the championship game is the last entry
bowl_game_winners

0            Samford
1     William & Mary
2     Incarnate Word
3               Army
4         Holy Cross
5                UAB
6     UT San Antonio
7         Jackson St
8         Cincinnati
9                BYU
10         Fresno St
11              Rice
12           Florida
13          Boise St
14       Connecticut
15        E Michigan
16           Liberty
17     South Alabama
18         Air Force
19           Houston
20          Missouri
21              MTSU
22     Bowling Green
23           Buffalo
24           Memphis
25       Coastal Car
26       Oklahoma St
27               UCF
28          Arkansas
29    North Carolina
30       Mississippi
31             Texas
32         Minnesota
33        Florida St
34              Ohio
35           Clemson
36          Maryland
37        Pittsburgh
38        Notre Dame
39           Alabama
40          Michigan
41           Georgia
42              Iowa
43            Tulane
44               LSU
45           Penn St
46          Illinois
47          M