# Deliverable 3 - Final Project

## Imports

In [314]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

## Prediction Model

#### Preprocessing

In [308]:
# read dataframe from csv
df = pd.read_csv("./data/athletes.csv")

# drop unused columns for optimization
df = df.drop(['athlete_id', 'name', 'region', 'team', 'affiliate', 'eat', 'train', 'background', 'experience', 'schedule', 'howlong'], axis=1)

# drop bad values in gender column
df = df.dropna(subset=['gender'])
df = df[df.gender != '--']

le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])

# create an imputer with mean strategy
imputer = SimpleImputer(strategy='mean')

# imputting median values for each NaN value
clean_data = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# remove corrupt values
clean_data = clean_data[clean_data['height'] <= 202]
clean_data = clean_data[clean_data['weight'] <= 210] 

# scale the data
scaler = StandardScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(clean_data), columns=clean_data.columns)


#### Train/Test Split

In [364]:
# Split the data into two random halves
half1 = scaled_data.sample(frac=0.5, random_state=0)
half2 = scaled_data.drop(half1.index)

# Split each half into training and test sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(half1[['gender', 'weight']], half1[['fran', 'helen', 'grace', 'filthy50', 'fgonebad', 'run400', 'run5k']], test_size=0.2, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(half2[['gender', 'weight']], half2[['fran', 'helen', 'grace', 'filthy50', 'fgonebad', 'run400', 'run5k']], test_size=0.2, random_state=0)

# Concatenate the training and test sets from each half
X_train = pd.concat([X_train1, X_train2])
X_test = pd.concat([X_test1, X_test2])
y_train = pd.concat([y_train1, y_train2])
y_test = pd.concat([y_test1, y_test2])

#### Training

In [365]:
# Create a ridge regression model
ridge = Ridge()

# Set the hyperparameters to tune
parameters = {'alpha': [0.1, 1, 10, 100]}

# Use GridSearchCV to find the best hyperparameters
grid = GridSearchCV(ridge, parameters, cv=5)
grid.fit(X_train, y_train)

# Train the model on the training set with the best hyperparameters
best_ridge = grid.best_estimator_
best_ridge.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_ridge.predict(X_test)

# Evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE:", mse)
print("R2 score:", r2)

MSE: 0.025272263296622088
R2 score: -0.05673835287324141


#### Evaluate