In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

### Running the model with only basic metadata

I want to see how well the model performs with nearly no data aside from the game date, the two teams that played, and where the game was played. It performed horribly, as expected.

In [8]:
df = pd.read_csv('/Users/hannahwurzel/Desktop/MLB/other_data/all_transformed_metadata.csv')
df = df.drop(['away_team_score', 'home_team_score'], axis=1)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R2: {r2}")
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

Mean Squared Error: 20.451651295528332
Mean Absolute Error: 3.5978595175241783
R2: 0.0007029256560994179
Coefficients: [-1.10295710e-05 -1.94265712e-02  4.38564123e-03 -2.83317343e-02]
Intercept: 17.726888310051812


### Running the model with basic metadata and weather data

The next data points I wanted to add was weather data. Although, after I did some data analysis, I do not believe these data points will help the model in any way (look under the visualizations folder to see findings). However, I still want to test it out just to see what the results are.

In [13]:
df_with_weather = pd.read_csv('/Users/hannahwurzel/Desktop/MLB/metadata/metadata_transformed.csv')
y_with_weather = df_with_weather['total_score']
df_with_weather = df_with_weather.drop(['away_team_score', 'home_team_score', 'game_date', 'total_score'], axis=1)
X_with_weather = df_with_weather.iloc[:, :-1]
X_train_weather, X_test_weather, y_train_weather, y_test_weather = train_test_split(
    X_with_weather, y_with_weather, test_size=0.2, random_state=42)

In [14]:
model_with_weather = LinearRegression()
model_with_weather.fit(X_train_weather, y_train_weather)
y_pred_weather = model_with_weather.predict(X_test_weather)

mse_weather = mean_squared_error(y_test_weather, y_pred_weather)
mae_weather = mean_absolute_error(y_test_weather, y_pred_weather)
r2_weather = r2_score(y_test_weather, y_pred_weather)

print(f"Mean Squared Error: {mse_weather}")
print(f"Mean Absolute Error: {mae_weather}")
print(f"R2: {r2_weather}")
print(f"Coefficients: {model_with_weather.coef_}")
print(f"Intercept: {model_with_weather.intercept_}")

Mean Squared Error: 20.350096494942573
Mean Absolute Error: 3.5930247340503536
R2: 0.005665039161971475
Coefficients: [-5.10384247e-05 -1.55376302e-02  4.14387572e-03 -2.59800189e-02
  1.24047244e-02 -6.27315790e-03 -3.22093539e-03  1.30943183e-02
  3.50752158e-04]
Intercept: 9.34954158011113
