# Week 1 Jupyter Notebook – Linear Regression 1


In [75]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [76]:
# Dataset 1: Video_review

df_video_review = pd.read_csv('video_review.csv')
df_video_review.columns

Index(['Season_Year', 'GameKey', 'PlayID', 'GSISID', 'Player_Activity_Derived',
       'Turnover_Related', 'Primary_Impact_Type', 'Primary_Partner_GSISID',
       'Primary_Partner_Activity_Derived', 'Friendly_Fire'],
      dtype='object')

In [77]:
# Dataset 1: Video_review and One Hot Encoding of Categorical Variables with Linear Regression

impact_score_map = {
    'Helmet-to-helmet': 3,
    'Helmet-to-body': 1,
    'Helmet-to-ground': 2,
}
df_video_review['impact_score'] = df_video_review['Primary_Impact_Type'].map(impact_score_map).fillna(0)

categorical_features = [
    'Player_Activity_Derived',
    'Turnover_Related',
    'Primary_Partner_Activity_Derived',
    'Friendly_Fire'
]
continuous_features = ['Season_Year', 'GameKey', 'PlayID', 'GSISID', 'Primary_Partner_GSISID']

df_clean = df_video_review[~df_video_review[categorical_features].isin(['Unclear']).any(axis=1)].copy()

for col in continuous_features:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
df_clean[continuous_features] = df_clean[continuous_features].fillna(0)

X_cat = df_clean[categorical_features]
X_cont = df_clean[continuous_features]
y = df_clean['impact_score']

encoder = OneHotEncoder(sparse_output=False, drop=None)
X_cat_encoded = encoder.fit_transform(X_cat)
encoded_cols = encoder.get_feature_names_out(categorical_features)
X_cat_df = pd.DataFrame(X_cat_encoded, columns=encoded_cols, index=df_clean.index)

X_all = pd.concat([X_cat_df, X_cont], axis=1)

model = LinearRegression()
model.fit(X_all, y)

y_pred = model.predict(X_all)

print(f"Mean Squared Error (train): {mean_squared_error(y, y_pred):.3f}")
print(f"R^2 Score (train): {r2_score(y, y_pred):.3f}")


Mean Squared Error (train): 0.715
R^2 Score (train): 0.243


Small dataset (~37 rows) of football plays with categorical variables - created an impact severity score (impact_score) by mapping categories like 'Helmet-to-helmet' = 3, etc. and tried predicting it with linear regression using one-hot encoded features.

The features left after removing Primary_Impact_Type have weak or no correlation with the target. The dataset is very small, making generalization hard.

All features are categorical, and current ones don’t carry enough signal to predict impact severity well.

To move forward meaningfully, it's best to reframe the problem as a classification task - Classification is more appropriate for the nature and size of data, and models like Random Forest or Logistic Regression will handle the categorical structure effectively.


In [None]:
# Dataset 1: Video_review and Polynomial/Interaction Terms with Linear Regression

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly_train = poly.fit_transform(X_train)  

scaler = StandardScaler()
X_poly_train_scaled = scaler.fit_transform(X_poly_train)  

model = LinearRegression()
model.fit(X_poly_train_scaled, y_train)

y_pred_train = model.predict(X_poly_train_scaled)

print(f"Mean Squared Error (train): {mean_squared_error(y_train, y_pred_train):.3f}")
print(f"R^2 Score (train): {r2_score(y_train, y_pred_train):.3f}")



Mean Squared Error (train): 0.776
R^2 Score (train): 0.243


In [79]:
# Dataset 2: InjuryRecord
file_path = 'InjuryRecord.csv'
df_injury = pd.read_csv(file_path)
df_injury.columns


Index(['PlayerKey', 'GameID', 'PlayKey', 'BodyPart', 'Surface', 'DM_M1',
       'DM_M7', 'DM_M28', 'DM_M42'],
      dtype='object')

In [80]:
# Dataset 2: InjuryRecord and and Polynomial Terms/VIF



In [81]:
# Dataset 2: InjuryRecord and Interaction Terms



In [82]:
# Dataset 2: InjuryRecord and Categorical 

In [83]:
# Dataset 3: 