In [1]:
## imports

import re
import random
import datetime
import glob
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None 

In [2]:
## data source: masseyratings.com

In [3]:
## ranking systems (by code) that appear in the 2022 rankings file;
## clean_rankings() restricts earlier seasons to this same set

rankers = list(pd.read_csv('cf2022.csv').ranking_code.unique())

In [4]:
## cleaning txt files, used to document each season's games

team_code = {}     ## team name -> integer code, shared by every season file that is parsed
team_number = 500  ## next unused integer code

_DATE_PATTERN = re.compile(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')   ## game date
_TEAM_PATTERN = re.compile(r"['@A-Za-z\s&]+?(?=[0-9])")     ## team name running up to its score
_SCORE_PATTERN = re.compile(r'[0-9]{1,3}')                  ## one score


def clean_data(data):
    """Parse the raw lines of one season's results file into a games table.

    Every usable line carries a date, two team names (the host is prefixed
    with '@') and two scores.  The two teams are placed into the team1/team2
    slots in a pseudo-random but reproducible order, so that the slot itself
    carries no information about who won.

    Lines without a date, or without two recognisable teams and scores, are
    skipped.  A game whose two scores are equal is treated as not yet played
    and labelled 'tbd'.

    Side effects: new team names are registered in the module-level
    ``team_code`` dict and ``team_number`` is advanced accordingly.

    Parameters
    ----------
    data : iterable of str
        Raw text lines, e.g. the result of ``file.readlines()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``team1``, ``team2``, ``winner`` ('team1', 'team2' or 'tbd'),
        ``team1_code`` and ``team2_code``; one row per parsed game.  Empty
        (but with those columns) when no line could be parsed.
    """
    global team_code
    global team_number
    games = []
    seed = 3

    for line in data:
        ## one seed per input line (advanced even for skipped lines); a private
        ## generator gives the same draw as random.seed(seed) followed by
        ## random.uniform(0, 1) without disturbing the global random state
        r = random.Random(seed).uniform(0, 1)
        seed += 1

        dates = _DATE_PATTERN.findall(line)
        if not dates:
            continue  ## blank line / header: nothing to parse
        date = dates[0]
        line = _DATE_PATTERN.sub('', line).strip()

        teams = _TEAM_PATTERN.findall(line)
        scores = _SCORE_PATTERN.findall(_TEAM_PATTERN.sub(' ', line))
        if len(teams) < 2 or len(scores) < 2:
            continue  ## malformed line: skip it instead of reusing the previous game's values

        ## the coin flip decides which listed team lands in the team1 slot
        first, second = (0, 1) if r < 0.5 else (1, 0)
        team1, team2 = teams[first].strip(), teams[second].strip()
        score1, score2 = int(scores[first]), int(scores[second])

        ## variables that are available for use but are removed in current model:
        ## score differential, home team
        score_diff = abs(score1 - score2)
        if score_diff == 0:
            ## equal scores are how an unplayed game shows up
            score1 = score2 = score_diff = np.nan

        team1_host = '@' in team1
        team2_host = '@' in team2
        ## strip again after removing '@' so the dict key and the table value always agree
        team1 = team1.replace('@', '').strip()
        team2 = team2.replace('@', '').strip()

        ## team codes: assign the next free number to teams not seen before
        for team in (team1, team2):
            if team not in team_code:
                team_code[team] = team_number
                team_number = team_number + 1

        if team1_host:
            host_team = team_code[team1]
        elif team2_host:
            host_team = team_code[team2]
        else:
            host_team = 0  ## neutral site

        ## who won?  (comparisons against NaN are False, so unplayed games fall through to 'tbd')
        if score1 > score2:
            winner = 'team1'
        elif score2 > score1:
            winner = 'team2'
        else:
            winner = 'tbd'

        games.append((date, team1, team2, score1, score2, score_diff, host_team, winner,
                      team_code[team1], team_code[team2]))

    games = pd.DataFrame(games, columns=['date', 'team1', 'team2', 'score1', 'score2', 'score_diff',
                                         'host_team', 'winner', 'team1_code', 'team2_code'])
    return games[['team1', 'team2', 'winner', 'team1_code', 'team2_code']]

In [5]:
## all files but 2022 need cleaning

def clean_rankings(rankings):
    """Tidy one header-less season rankings table.

    Names the seven positional columns, keeps only the rows published on the
    latest date present in the table, tags them with ``week = 1``, strips
    whitespace from text columns and finally keeps only the ranking systems
    listed in the notebook-level ``rankers``.

    Returns a frame with columns ``year``, ``team_name``, ``ranking_name``,
    ``ranking_code``, ``rank`` and ``week``.
    """
    column_names = dict(enumerate(
        ['year', 'team_code', 'team_name', 'ranking_code', 'ranking_name', 'date_str', 'rank']))
    named = rankings.rename(columns=column_names)
    named['date'] = pd.to_datetime(named['date_str'], format='%Y%m%d')

    ## only the most recent publication date in the file is kept
    is_latest = named['date'] == named['date'].max()
    latest = named.loc[is_latest].assign(week=1)
    latest = latest[['year', 'team_name', 'ranking_name', 'ranking_code', 'rank', 'week']]

    def _strip_if_text(column):
        return column.str.strip() if column.dtype == "object" else column

    ## codes are stripped before the membership test against rankers
    latest = latest.apply(_strip_if_text)
    return latest[latest['ranking_code'].isin(rankers)]

In [6]:
## formatting to compare rankings on head-to-head matchups

def set_rankings(rankings, games, unranked=350):
    """Attach every ranking system's rank for both teams to each game.

    For each ranking code ``XYZ`` found in ``rankings`` two columns are added:
    ``XYZ_team_1`` (rank of the game's ``team1``) and ``XYZ_team_2`` (rank of
    its ``team2``).  All ``_team_1`` columns come first, then all ``_team_2``
    columns, each in order of first appearance of the code.

    Parameters
    ----------
    rankings : pandas.DataFrame
        Must have a ``ranking_code`` column.  The team name is read from the
        column at position 1 and the rank from the column at position 4.
        If a (team, code) pair occurs more than once, the last row wins.
    games : pandas.DataFrame
        Must have ``team1`` and ``team2`` columns.  It is not modified.
    unranked : number, default 350
        Value used where a team has no rank from a given system.  It is also
        used to fill any other missing value in every column except
        ``winner``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, ``team1`` and ``team2`` as the
        first two columns, the remaining game columns, then the rank columns.
    """
    ## one {team: rank} lookup per ranking code; dicts keep first-appearance order
    lookups = {}
    for code, team, rank in zip(rankings['ranking_code'], rankings.iloc[:, 1], rankings.iloc[:, 4]):
        lookups.setdefault(code, {})[team] = rank

    other_columns = [col for col in games.columns if col not in ('team1', 'team2')]
    games = games[['team1', 'team2'] + other_columns].reset_index(drop=True)

    ## vectorised lookup instead of writing cell by cell through chained indexing,
    ## which stops updating the frame once pandas copy-on-write is active
    rank_columns = {}
    for slot in ('1', '2'):
        for code, lookup in lookups.items():
            rank_columns[code + '_team_' + slot] = games['team' + slot].map(lookup).astype(float)

    games = games.drop(columns=[col for col in rank_columns if col in games.columns])
    games = pd.concat([games, pd.DataFrame(rank_columns, index=games.index)], axis=1)

    ## a large rank stands in for "not ranked by this system"
    fill_values = {col: unranked for col in games.columns
                   if col != 'winner' and games[col].isna().any()}
    if fill_values:
        games = games.fillna(value=fill_values)

    return games

In [7]:
## building final document

def compile_rankings(data_dir='.', current_year='2022'):
    """Build the modelling table from every season's games and rankings files.

    Each ``*.txt`` file in ``data_dir`` whose characters 2-5 form a four-digit
    year is parsed with ``clean_data``; the matching ``cf<year>.csv`` rankings
    file is loaded and joined onto the games with ``set_rankings``.  Rankings
    for seasons other than ``current_year`` are header-less and are passed
    through ``clean_rankings`` first; the ``current_year`` file is read with
    its header and used as is.

    Parameters
    ----------
    data_dir : str, default '.'
        Directory holding the games (.txt) and rankings (.csv) files.
    current_year : str, default '2022'
        Season whose rankings file needs no cleaning.

    Returns
    -------
    pandas.DataFrame
        All seasons stacked with a fresh index.  Columns containing any
        missing value (e.g. ranking systems absent from some season) are
        dropped.  An empty frame is returned when no season file is found.
    """
    season_frames = []

    ## NOTE: os.listdir order is arbitrary; team codes are handed out in the
    ## order files are processed, so the codes can differ between machines
    for file in os.listdir(data_dir):
        if not file.endswith('.txt'):
            continue
        year = file[2:6]
        if not year.isdigit():
            continue  ## some other text file, not a season's games

        with open(os.path.join(data_dir, file)) as f:
            lines = f.readlines()
        games = clean_data(lines)

        rankings_path = os.path.join(data_dir, 'cf' + year + '.csv')
        if year == current_year:
            rankings_temp = pd.read_csv(rankings_path)
        else:
            rankings_temp = clean_rankings(pd.read_csv(rankings_path, header = None))

        season_frames.append(set_rankings(rankings_temp, games))

    if not season_frames:
        return pd.DataFrame()

    ## concatenate once instead of growing a frame inside the loop
    rankings = pd.concat(season_frames, ignore_index = True)
    rankings = rankings.dropna(axis = 'columns')
    return rankings

In [8]:
## run the pipeline: parse every season file found and stack the per-game rows
rankings = compile_rankings()

In [9]:
rankings

Unnamed: 0,team1,team2,winner,team1_code,team2_code,AND_team_1,COL_team_1,MAS_team_1,AP_team_1,BAS_team_1,...,SAG_team_2,SOR_team_2,DOK_team_2,USA_team_2,WEL_team_2,WOB_team_2,COF_team_2,BRN_team_2,DOL_team_2,LAZ_team_2
0,California,Hawaii,team1,500,501,62.0,66.0,62.0,350.0,59.0,...,90.0,86.0,103.0,350.0,75.0,86.0,90.0,106.0,84.0,82.0
1,N Dakota St,Charleston So,team1,502,503,350.0,350.0,2.0,350.0,2.0,...,14.0,350.0,10.0,350.0,350.0,350.0,350.0,10.0,18.0,19.0
2,Presbyterian,C Michigan,team2,504,505,350.0,350.0,97.0,350.0,108.0,...,101.0,100.0,93.0,350.0,81.0,90.0,89.0,105.0,85.0,91.0
3,TN Martin,Cincinnati,team2,506,507,350.0,350.0,40.0,350.0,44.0,...,99.0,94.0,91.0,350.0,99.0,100.0,100.0,86.0,94.0,100.0
4,Connecticut,Maine,team1,508,509,112.0,117.0,117.0,350.0,118.0,...,38.0,350.0,43.0,350.0,350.0,350.0,350.0,46.0,29.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28270,Kentucky,Penn St,team1,600,615,10.0,13.0,11.0,12.0,31.0,...,11.0,17.0,15.0,17.0,24.0,20.0,13.0,12.0,18.0,17.0
28271,Washington,Ohio St,team2,691,613,18.0,18.0,19.0,13.0,15.0,...,3.0,6.0,7.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0
28272,Georgia,Texas,team2,610,754,6.0,6.0,5.0,7.0,3.0,...,17.0,16.0,25.0,9.0,21.0,15.0,24.0,14.0,14.0,11.0
28273,E Washington,N Dakota St,team2,693,502,350.0,350.0,3.0,350.0,5.0,...,1.0,350.0,1.0,350.0,350.0,350.0,350.0,1.0,1.0,1.0


In [10]:
## games with a known result: used for training and testing
X_played = rankings.loc[rankings['winner'].ne('tbd')]
## games still marked 'tbd' (the bowl games): these are the ones to predict
X_dev = rankings.loc[rankings['winner'].eq('tbd')]

In [11]:
## hold out 20% of the played games for scoring; the label is the 'winner' column
X_train, X_test, y_train, y_test = train_test_split(
    X_played,
    X_played['winner'],
    test_size = 0.2,
    random_state = 3,
)

## the label and the team-name strings are not model inputs
X_train, X_test = (frame.drop(columns = ['winner', 'team1', 'team2']) for frame in (X_train, X_test))

In [12]:
## keep the same feature columns as X_train / X_test
X_dev = X_dev.drop(['winner', 'team1', 'team2'], axis = 'columns')
X_dev.head()

Unnamed: 0,team1_code,team2_code,AND_team_1,COL_team_1,MAS_team_1,AP_team_1,BAS_team_1,PFZ_team_1,BIH_team_1,BIL_team_1,...,SAG_team_2,SOR_team_2,DOK_team_2,USA_team_2,WEL_team_2,WOB_team_2,COF_team_2,BRN_team_2,DOL_team_2,LAZ_team_2
21174,502,553,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
21175,527,514,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
21176,740,714,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
21177,574,607,94.0,82.0,91.0,350.0,72.0,67.0,101.0,81.0,...,73.0,86.0,73.0,350.0,96.0,92.0,84.0,62.0,79.0,89.0
21178,661,733,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0


In [13]:
## building + fitting decision tree classifier

## building + fitting random forest classifier
## NOTE: this cell changes X_dev in place (adds 'winner', replaces the team codes with
## team names), so it cannot be re-run on its own; re-run from the cell that builds X_dev
winner_model = RandomForestClassifier(criterion = 'log_loss', random_state = 3).fit(X_train, y_train)
score = winner_model.score(X_test, y_test) ## score = fraction of held-out games predicted correctly
print(score)

## using model for predictions
y_pred = winner_model.predict(X_dev)
X_dev['winner'] = y_pred

## invert team_code (name -> code) so the numeric codes can be shown as team names
d = {v:k for k, v in team_code.items()}
X_dev['team1_code'] = X_dev['team1_code'].map(d)
X_dev['team2_code'] = X_dev['team2_code'].map(d)

## move the predicted winner to the first column
first_column = X_dev.pop('winner')
X_dev.insert(0, 'winner', first_column)

0.7800212539851222


In [14]:
X_dev

Unnamed: 0,winner,team1_code,team2_code,AND_team_1,COL_team_1,MAS_team_1,AP_team_1,BAS_team_1,PFZ_team_1,BIH_team_1,...,SAG_team_2,SOR_team_2,DOK_team_2,USA_team_2,WEL_team_2,WOB_team_2,COF_team_2,BRN_team_2,DOL_team_2,LAZ_team_2
21174,team1,N Dakota St,Samford,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
21175,team1,Montana St,William & Mary,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
21176,team1,CS Sacramento,Incarnate Word,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
21177,team2,Army,Navy,94.0,82.0,91.0,350.0,72.0,67.0,101.0,...,73.0,86.0,73.0,350.0,96.0,92.0,84.0,62.0,79.0,89.0
21178,team2,S Dakota St,Holy Cross,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
21179,team1,UAB,Miami OH,92.0,74.0,87.0,350.0,79.0,70.0,94.0,...,103.0,108.0,105.0,350.0,93.0,98.0,104.0,103.0,101.0,104.0
21180,team2,UT San Antonio,Troy,27.0,20.0,52.0,22.0,58.0,38.0,26.0,...,56.0,30.0,64.0,24.0,11.0,21.0,22.0,51.0,25.0,25.0
21181,team1,Jackson St,NC Central,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
21182,team2,Cincinnati,Louisville,38.0,31.0,34.0,350.0,30.0,40.0,30.0,...,32.0,27.0,27.0,350.0,36.0,35.0,25.0,36.0,44.0,27.0
21183,team1,SMU,BYU,57.0,48.0,55.0,350.0,59.0,52.0,49.0,...,61.0,59.0,56.0,350.0,53.0,56.0,59.0,65.0,60.0,57.0


In [15]:
## championship game has yet to be determined, pulling winners from the semi-final games

def championship_game(team1='Michigan', team2='Georgia', unranked=350):
    """Predict a championship matchup that is not present in the games files.

    Builds a single feature row for ``team1`` vs ``team2`` using the same
    feature columns as ``X_dev`` (everything except ``winner``), filled from
    the 2022 rankings file, and scores it with ``winner_model``.

    Relies on the notebook-level ``X_dev``, ``team_code`` and ``winner_model``.

    Parameters
    ----------
    team1, team2 : str
        Team names exactly as they appear in ``team_code``.
    unranked : number, default 350
        Rank used when a ranking system has no entry for a team; the same
        placeholder ``set_rankings`` uses for the training data.

    Returns
    -------
    pandas.DataFrame
        One row: ``winner`` ('team1' or 'team2') first, then ``team1_code``
        and ``team2_code`` holding the team *names*, then the rank columns.

    Raises
    ------
    KeyError
        If either team name has not been registered in ``team_code``.
    """
    feature_columns = [col for col in X_dev.columns if col != 'winner']

    rankings2022 = pd.read_csv('cf2022.csv')
    ## positional layout of the rankings file: column 1 = team name, column 4 = rank
    rank_lookup = {
        (code, team): rank
        for code, team, rank in zip(rankings2022['ranking_code'],
                                    rankings2022.iloc[:, 1],
                                    rankings2022.iloc[:, 4])
    }

    ## codes come from team_code rather than hard-coded numbers, which are only
    ## valid for one particular file-processing order
    features = {'team1_code': team_code[team1], 'team2_code': team_code[team2]}
    finalists = {'1': team1, '2': team2}

    for col in feature_columns:
        ## rank columns are named '<ranking_code>_team_<slot>'
        code, separator, slot = col.rpartition('_team_')
        if not separator or slot not in finalists:
            continue
        rank = rank_lookup.get((code, finalists[slot]), unranked)
        features[col] = float(unranked if pd.isna(rank) else rank)

    ## column order must match the order the model was trained on
    X_dev_chmp = pd.DataFrame([features], columns = feature_columns)

    y_pred = winner_model.predict(X_dev_chmp)

    ## present names instead of numeric codes, with the prediction up front
    X_dev_chmp['team1_code'] = team1
    X_dev_chmp['team2_code'] = team2
    X_dev_chmp.insert(0, 'winner', y_pred)

    return X_dev_chmp

In [16]:
## predict the title game and display the single-row result
champs = championship_game()
champs

Unnamed: 0,winner,team1_code,team2_code,AND_team_1,COL_team_1,MAS_team_1,AP_team_1,BAS_team_1,PFZ_team_1,BIH_team_1,...,SAG_team_2,SOR_team_2,DOK_team_2,USA_team_2,WEL_team_2,WOB_team_2,COF_team_2,BRN_team_2,DOL_team_2,LAZ_team_2
0,team2,Michigan,Georgia,2.0,2.0,2.0,2.0,3.0,2.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0


In [17]:
## (team1, team2, bowl name) for every game to report on; team names are written
## as they appear in X_dev's team1_code / team2_code columns
bowl_games = [

    ('UAB', 'Miami OH', 'Hometown Lenders Bahamas Bowl'),
    ('UT San Antonio', 'Troy', 'Duluth Trading Cure Bowl'),
    ('Cincinnati', 'Louisville', 'Wasabi Fenway Bowl'),
    ('Jackson St', 'NC Central', 'Cricket Celebration Bowl'),
    ('Oregon St', 'Florida', 'SRS Distribution Las Vegas Bowl'),
    ('Washington St', 'Fresno St', 'Jimmy Kimmel LA Bowl'),
    ('Southern Miss', 'Rice', 'Lendingtree Bowl'),
    ('SMU', 'BYU', 'New Mexico Bowl'),
    ('Boise St', 'North Texas', 'Frisco Bowl'),
    ('Marshall', 'Connecticut', 'Myrtle Beach Bowl'),
    ('E Michigan', 'San Jose St', 'Famous Idaho Potato Bowl'),
    ('Toledo', 'Liberty', 'Roofclaim.com Boca Raton Bowl'),
    ('South Alabama', 'WKU', 'R+L Carriers New Orleans Bowl'),
    ('Baylor', 'Air Force', 'Lockheed Martin Armed Forces Bowl'),
    ('Louisiana', 'Houston', 'Radiance Technologies Independence Bowl'),
    ('Wake Forest', 'Missouri', 'Union Home Mortgage Gasparilla Bowl'),
    ('MTSU', 'San Diego St', 'Easypost Hawai’i Bowl'),
    ('Bowling Green', 'New Mexico St', 'Quick Lane Bowl'),
    ('Ga Southern', 'Buffalo', 'Camellia Bowl'),
    ('Utah St', 'Memphis', 'Servpro First Responder Bowl'),
    ('Coastal Car', 'East Carolina', 'Ticketsmarter Birmingham Bowl'),
    ('Wisconsin', 'Oklahoma St', 'Guaranteed Rate Bowl'),
    ('Duke', 'UCF', 'Military Bowl Presented by Peraton'),
    ('Kansas', 'Arkansas', 'Autozone Liberty Bowl'),
    ('North Carolina', 'Oregon', 'San Diego County Credit Union Holiday Bowl'),
    ('Texas Tech', 'Mississippi', 'Taxact Texas Bowl'),
    ('Syracuse', 'Minnesota', 'Bad Boy Mowers Pinstripe Bowl'),
    ('Florida St', 'Oklahoma', 'Cheez-it Bowl'),
    ('Washington', 'Texas', 'Valero Alamo Bowl'),
    ('NC State', 'Maryland', 'Duke’s Mayo Bowl'),
    ('Pittsburgh', 'UCLA', 'Tony the Tiger Sun Bowl'),
    ('Notre Dame', 'South Carolina', 'Taxslayer Gator Bowl'),
    ('Ohio', 'Wyoming', 'Barstool Sports Arizona Bowl'),
    ('Tennessee', 'Clemson', 'Capital One Orange Bowl'),
    ('Kentucky', 'Iowa', 'Transperfect Music City Bowl'),
    ('Kansas St', 'Alabama', 'Allstate Sugar Bowl'),
    ('Michigan', 'TCU', 'VRBO Fiesta Bowl (CFP Semifinal)'),
    ('Georgia', 'Ohio St', 'Chick-fil-a Peach Bowl (CFP Semifinal)'),
    ('Mississippi St', 'Illinois', 'Reliaquest Bowl'),
    ('Tulane', 'USC', 'Goodyear Cotton Bowl Classic'),
    ('LSU', 'Purdue', 'Cheez-It Citrus Bowl'),
    ('Utah', 'Penn St', 'Rose Bowl Game'),
    ## the title game is not in X_dev; it is predicted separately by championship_game()
    ('Michigan', 'Georgia', 'College Football Playoff National Championship Presented by AT&T')
]


In [18]:
## print the projected winner of every listed game
for game in bowl_games:
    print(game[2], '--', game[0], 'vs.', game[1])
    if game[2] != 'College Football Playoff National Championship Presented by AT&T':
        ## match on both teams, in either order: which side sits in the team1 slot was
        ## decided by the random draw in clean_data, not by the order written in bowl_games
        listed = (X_dev['team1_code'] == game[0]) & (X_dev['team2_code'] == game[1])
        swapped = (X_dev['team1_code'] == game[1]) & (X_dev['team2_code'] == game[0])
        temp = X_dev[listed | swapped]
    else:
        ## the title game is predicted separately and lives in champs
        temp = champs

    if len(temp) != 1:
        raise ValueError(
            f'expected exactly one prediction row for {game[0]} vs. {game[1]}, found {len(temp)}')

    ## 'winner' names a slot ('team1' / 'team2'); the matching '<slot>_code' column holds the team name
    slot = temp['winner'].item()
    if slot in ('team1', 'team2'):
        print('projected winner:', temp[slot + '_code'].item())
    print('    ')

Hometown Lenders Bahamas Bowl -- UAB vs. Miami OH
projected winner: UAB
    
Duluth Trading Cure Bowl -- UT San Antonio vs. Troy
projected winner: Troy
    
Wasabi Fenway Bowl -- Cincinnati vs. Louisville
projected winner: Louisville
    
Cricket Celebration Bowl -- Jackson St vs. NC Central
projected winner: Jackson St
    
SRS Bistribution Las Vegas Bowl -- Oregon St vs. Florida
projected winner: Oregon St
    
Jimmy Kimmel LA Bowl -- Washington St vs. Fresno St
projected winner: Washington St
    
Lendingtree Bowl -- Southern Miss vs. Rice
projected winner: Southern Miss
    
New Mexico Bowl -- SMU vs. BYU
projected winner: SMU
    
Frisco Bowl -- Boise St vs. North Texas
projected winner: Boise St
    
Myrtle Beach Bowl -- Marshall vs. Connecticut
projected winner: Marshall
    
Famous Idaho Potato Bowl -- E Michigan vs. San Jose St
projected winner: San Jose St
    
Roofclaim.com Boca Raton Bowl -- Toledo vs. Liberty
projected winner: Liberty
    
R+L Carriers New Orleans Bowl -- 