In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import json

In [2]:
# Load the game results and the per-team box scores, then join them.
# pd.merge is called without a key, so it joins on the columns the two
# frames have in common.
games = pd.read_json("data/games_2021.json")
teams = pd.read_json('data/big_10_teams_2021.json')
df = pd.merge(teams, games)

# Flatten each game's nested 'teams' list into flat columns:
# '<side>_team' holds the school name and '<side>_<category>' holds each
# stat value, where <side> is the entry's 'homeAway' value.
for row_label, team_list in df['teams'].items():
    for team in team_list:
        side = team['homeAway']
        if side not in ('home', 'away'):
            # Entries that are neither home nor away are ignored.
            continue

        df.loc[row_label, side + '_team'] = team['school']
        for stat in team['stats']:
            df.loc[row_label, side + '_' + stat['category']] = stat['stat']

# Move the home and away team columns to the beginning (right after the
# first column).
df.insert(1, 'home_team', df.pop('home_team'))
df.insert(2, 'away_team', df.pop('away_team'))

# The nested 'teams' column is no longer needed once it has been flattened.
df = df.drop(columns='teams')

# Show the populated fields of the first row as a sanity check.
df.iloc[0].dropna()

id                      401331447
home_team                    Iowa
away_team                Michigan
season                       2021
week                           14
                          ...    
home_netPassingYards          175
home_totalYards               279
home_fourthDownEff            0-1
home_thirdDownEff            5-19
home_firstDowns                15
Name: 0, Length: 93, dtype: object

In [3]:
# Keep only games where both teams are in the Big Ten and a score has been
# recorded for both sides (rows with a missing score are dropped).
big_ten_played = (
    (df["home_conference"] == "Big Ten")
    & (df["away_conference"] == "Big Ten")
    & df["home_points"].notna()
    & df["away_points"].notna()
)

# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignment below cannot raise SettingWithCopyWarning or end up
# written to a temporary object.
df = df.loc[big_ten_played].copy()

# Target column: True if home_points is greater than away_points,
# False otherwise (ties therefore count as False).
df["home_win"] = df["home_points"] > df["away_points"]

In [4]:
# Load the advanced stats file. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the lifetime of
# the kernel. The encoding is pinned to UTF-8 (the JSON default) so the team
# names decode the same way on every platform.
with open("data/2021_stats_advanced.json", encoding="utf-8") as f:
    advanced_stats = json.load(f)

In [5]:
# Per-team lookup tables keyed by team name, built from advanced_stats.
# Comprehensions are used so that no loop variable leaks into the notebook
# namespace; in particular the builtin `dict` is no longer shadowed.

# stat1: the team's offense "drives" value
stat1 = {entry["team"]: entry["offense"]["drives"] for entry in advanced_stats}

# stat2: the team's defense "totalPPA" value
stat2 = {entry["team"]: entry["defense"]["totalPPA"] for entry in advanced_stats}

# stat3: the team's offense "totalPPA" value
stat3 = {entry["team"]: entry["offense"]["totalPPA"] for entry in advanced_stats}

In [6]:
# Attach each lookup table to the games frame twice: once keyed by the home
# team and once by the away team. Columns are created in the order
# home_stat1, away_stat1, home_stat2, ... Team names that are missing from a
# lookup table produce NaN (Series.map behaviour with a dict).
for label, lookup in (("stat1", stat1), ("stat2", stat2), ("stat3", stat3)):
    for side in ("home", "away"):
        df[side + "_" + label] = df[side + "_team"].map(lookup)

df

Unnamed: 0,id,home_team,away_team,season,week,season_type,start_date,start_time_tbd,neutral_site,conference_game,...,away_kickReturnYards,away_kickReturnTDs,away_kickReturns,home_win,home_stat1,away_stat1,home_stat2,away_stat2,home_stat3,away_stat3
0,401331447,Iowa,Michigan,2021,14,regular,2021-12-05T01:00:00.000Z,False,True,True,...,,,,False,177,164,71.482638,97.728482,41.475493,257.227094
4,401282812,Ohio State,Purdue,2021,11,regular,2021-11-13T20:30:00.000Z,False,False,True,...,18,0,3,True,154,151,119.334972,130.470684,399.765062,222.133193
7,401282807,Northwestern,Rutgers,2021,7,regular,2021-10-16T16:00:00.000Z,False,False,True,...,50,0,3,True,144,164,231.835736,176.253812,84.882058,55.593657
8,401282796,Minnesota,Nebraska,2021,7,regular,2021-10-16T16:00:00.000Z,False,False,True,...,0,0,1,True,143,143,72.938192,140.067859,159.061215,233.884577
9,401282728,Indiana,Rutgers,2021,11,regular,2021-11-13T17:00:00.000Z,False,False,True,...,34,0,2,False,151,164,171.767098,176.253812,25.988369,55.593657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,401282716,Purdue,Illinois,2021,4,regular,2021-09-25T19:30:00.000Z,False,False,True,...,,,,True,151,145,130.470684,156.164201,222.133193,96.533795
95,401282729,Penn State,Indiana,2021,5,regular,2021-10-02T23:30:00.000Z,False,False,True,...,,,,True,159,151,61.059677,171.767098,136.271984,25.988369
97,401282714,Illinois,Nebraska,2021,1,regular,2021-08-28T17:20:00.000Z,False,False,True,...,30,0,2,True,145,143,156.164201,140.067859,96.533795,233.884577
98,401282815,Wisconsin,Penn State,2021,1,regular,2021-09-04T16:00:00.000Z,False,False,True,...,29,0,2,False,154,159,-7.114814,61.059677,132.394420,136.271984


In [7]:
# Feature columns, in the order the model is trained on. Any later call to
# clf.predict must supply values in this same order.
columns = ["home_stat1", "away_stat1", "home_stat2", "away_stat2", "home_stat3", "away_stat3"]

X = df[columns]
y = df["home_win"]

# A fixed random_state makes the train/test split, and therefore the score
# reported below, reproducible across Restart & Run All. Without it the
# split and the score change on every run.
# NOTE(review): the stat features come from a single 2021 stats file and
# look like season-level values, so they may already include the games being
# predicted. Confirm before trusting the score.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Linear-kernel support vector classifier predicting whether the home team wins.
clf = SVC(kernel='linear', C=1.0)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out 25% of games.
score_test = clf.score(X_test, y_test)
score_test

0.875

In [8]:
# Matchup to predict. Both names must be keys of stat1/stat2/stat3, otherwise
# the lookups below raise KeyError.
home = "Michigan"
away = "Nebraska"

# Feature values in the same order as `columns`, which the model was trained on.
todays_game = [stat1[home], stat1[away],
               stat2[home], stat2[away],
               stat3[home], stat3[away]]

# Predict on a one-row DataFrame that carries the training column names. The
# model was fitted on a DataFrame, so passing a bare list would trigger
# scikit-learn's "X does not have valid feature names" warning and leave the
# feature order unchecked.
clf.predict(pd.DataFrame([todays_game], columns=columns))



array([ True])