# Total Runs Model - Linear Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import pickle

# Raise pandas' display limits (150 columns, 500 rows) for notebook output.
pd.options.display.max_columns = 150
pd.options.display.max_rows = 500

In [2]:
# Load the training table; the feature columns and the 'total' target are
# selected from this frame.
games = pd.read_csv(filepath_or_buffer='../Data/Created/train.csv')

In [3]:
# Load the separate test file.
# NOTE(review): `val` is not referenced again in the visible cells -- confirm
# whether it was meant to be scored/predicted instead of the train split.
val = pd.read_csv(filepath_or_buffer='../Data/Created/test.csv')

In [4]:
# Model inputs: v_*/h_* stat columns (presumably visitor/home -- confirm) plus
# the two team identifier columns.
# NOTE(review): 'v_runs' has no 'h_runs' counterpart, and if it holds the
# visitor's runs for the same game it leaks part of 'total' -- confirm.
features = [
    'v_ba', 'h_ba',
    'v_obp', 'h_obp',
    'v_slg', 'h_slg',
    'v_runs',
    'v_ops', 'h_ops',
    'home_team', 'visitor_team',
]

# Design matrix and regression target.
X = games.loc[:, features]
y = games.loc[:, 'total']

In [5]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2023,
)

In [6]:
categorical_columns = ['home_team', 'visitor_team']

# One-hot encode the team columns and pass every other column through as is.
# drop='first' removes one indicator per team column. Combined with
# handle_unknown='ignore', a team not seen during fit is encoded as all zeros,
# i.e. the same encoding as the dropped (first) category.
team_encoder = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)
ct = ColumnTransformer(
    transformers=[('oh', team_encoder, categorical_columns)],
    remainder='passthrough',
)

In [7]:
# Full model: column encoding followed by a linear regression.
# The estimator step is named 'rf' although it is a LinearRegression; the name
# is part of the pipeline's parameter names (and of the pickled object), so it
# is kept unchanged here.
pipe_tot = Pipeline(steps=[
    ('ct', ct),
    ('rf', LinearRegression()),
])

In [8]:
%%time
# Fit the encoder and the regressor on the training split only.
pipe_tot.fit(X=X_train, y=y_train)

CPU times: user 979 ms, sys: 203 ms, total: 1.18 s
Wall time: 709 ms


In [9]:
# Score the fitted pipeline on the held-out validation split (for a
# LinearRegression final step, `score` is the R^2 coefficient).
pipe_tot.score(X=X_val, y=y_val)

0.8642209152950036

In [12]:
# Predict totals for the validation split and write them to CSV.
# NOTE(review): predictions are for X_val (a split of train.csv), not for the
# test file loaded into `val`, and the frame does not carry X_val's row index,
# so rows can only be matched back by position -- confirm this is intended.
preds = pd.DataFrame(pipe_tot.predict(X_val))
preds.to_csv(path_or_buf='../Data/Created/TotalPreds.csv', index=False)

In [11]:
# Persist the fitted pipeline (encoder + regressor) to disk with pickle.
# Pickle files must only be loaded from trusted sources.
file_name = '../Models/TotalRuns.pkl'
with open(file_name, mode='wb') as model_file:
    pickle.dump(pipe_tot, model_file)