In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv('../data/raw/train.csv')

Data Preparation

In [3]:
train_df_clean = train_df.copy()

In [4]:
train_df_clean.drop('Id', axis=1, inplace=True)

In [5]:
train_df_clean.columns = train_df_clean.columns.str.strip()

In [6]:
target = train_df_clean.pop('TARGET_5Yrs')

In [7]:
train_df_clean.head()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
0,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,2.0,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6
1,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,2.4,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4
2,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,0.4,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6
3,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,0.9,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9
4,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,0.2,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7


Scale Data

In [8]:
from sklearn.preprocessing import StandardScaler
from joblib import dump

In [37]:
scaler = StandardScaler()
train_df_clean = scaler.fit_transform(train_df_clean)
dump(scaler, '../models/shannon_scaler.joblib')

['../models/shannon_scaler.joblib']

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X_data, X_test, y_data, y_test = train_test_split (train_df_clean, target, test_size=0.2, random_state=8)

In [40]:
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=8)

Save Scaled Data

In [41]:
from src.data.shannon_sets import save_sets

In [42]:
save_sets(X_train, y_train, X_val, y_val, X_test, y_test, path='../data/processed/')

Baseline Model

In [43]:
y_mean = y_train.mean()

In [44]:
y_base = np.full((len(y_train), 1), y_mean)

In [46]:
from src.models.performance import print_reg_perf

print_reg_perf(y_preds=y_base, y_actuals=y_train, set_name='Training')

RMSE Training: 0.3692658517749907
MAE Training: 0.27271453857421873


In [47]:
X_train.shape

(5120, 19)

In [48]:
from sklearn.preprocessing import PolynomialFeatures

In [49]:
poly = PolynomialFeatures(2)

In [50]:
X_train = poly.fit_transform(X_train)

In [80]:
from sklearn.linear_model import LinearRegression 

In [82]:
pol_reg = LinearRegression()

In [83]:
pol_reg.fit(X_train, y_train)

LinearRegression()

In [88]:
dump(pol_reg,  '../models/linear_poly_2.joblib')

['../models/linear_poly_2.joblib']

In [89]:
y_train_preds = pol_reg.predict(X_train)
y_val_preds = pol_reg.predict(X_val)

In [90]:
print_reg_perf(y_preds=y_train_preds, y_actuals=y_train, set_name='Training')

RMSE Training: 0.348030186220739
MAE Training: 0.2485724364694258


In [91]:
print_reg_perf(y_preds=y_val_preds, y_actuals=y_val, set_name='Training')

RMSE Training: 0.3820524252286411
MAE Training: 0.2748706889322397


In [92]:
y_train_preds

array([0.60741781, 0.81360973, 0.80522875, ..., 0.87372025, 0.94997452,
       0.52374595])

In [93]:
test_df = pd.read_csv('../data/raw/test.csv')

In [94]:
test_df.head()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
0,1,56,9.1,4.0,1.6,3.7,43.7,0.1,0.3,7.3,0.7,1.2,63.4,1.2,0.8,1.7,0.4,0.2,0.3,0.8
1,8194,43,19.3,10.1,3.7,8.1,46.0,0.6,1.7,35.1,1.8,2.5,75.3,0.5,0.9,1.5,3.5,0.6,-0.0,1.8
2,3,82,33.9,11.3,4.9,10.6,45.6,0.5,1.9,44.8,1.8,2.7,71.2,1.3,3.3,4.5,2.5,1.3,0.3,2.0
3,8196,86,44.7,18.8,6.8,15.9,42.9,0.5,1.8,13.5,4.5,6.3,70.9,1.5,3.2,5.0,4.1,0.9,0.1,3.6
4,8197,58,12.3,4.7,1.6,4.0,40.0,0.5,1.7,38.7,1.1,1.3,76.9,0.2,0.6,0.9,1.5,0.5,-0.4,0.9


In [95]:
test_X = test_df.loc[:, test_df.columns != 'Id']

In [96]:
test_df.shape

(3799, 20)

In [101]:
test_X

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
0,56,9.1,4.0,1.6,3.7,43.7,0.1,0.3,7.3,0.7,1.2,63.4,1.2,0.8,1.7,0.4,0.2,0.3,0.8
1,43,19.3,10.1,3.7,8.1,46.0,0.6,1.7,35.1,1.8,2.5,75.3,0.5,0.9,1.5,3.5,0.6,-0.0,1.8
2,82,33.9,11.3,4.9,10.6,45.6,0.5,1.9,44.8,1.8,2.7,71.2,1.3,3.3,4.5,2.5,1.3,0.3,2.0
3,86,44.7,18.8,6.8,15.9,42.9,0.5,1.8,13.5,4.5,6.3,70.9,1.5,3.2,5.0,4.1,0.9,0.1,3.6
4,58,12.3,4.7,1.6,4.0,40.0,0.5,1.7,38.7,1.1,1.3,76.9,0.2,0.6,0.9,1.5,0.5,-0.4,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3794,84,21.2,8.7,3.4,6.7,50.2,0.0,-0.0,-4.5,1.7,2.5,68.1,1.9,2.3,3.9,1.5,0.6,0.3,2.0
3795,49,16.3,6.4,2.9,6.6,44.4,-0.1,-0.4,19.8,1.0,1.9,50.2,1.7,2.8,4.4,0.4,0.4,0.4,0.7
3796,53,9.9,2.1,0.8,1.8,43.1,-0.4,-0.6,1.9,0.6,1.0,63.9,0.7,1.0,1.7,0.4,0.4,0.2,0.5
3797,89,38.3,14.5,5.4,11.8,45.2,0.5,1.2,24.7,2.5,2.9,89.2,1.5,4.0,5.5,3.7,1.3,0.3,2.4


In [102]:
test_preds = reg.predict(X)

NameError: name 'X' is not defined

In [99]:
test_preds

array([ -43.66513064,   -6.0669782 ,  -59.47399214, ...,  -31.91545418,
       -160.82235132,  -20.82281077])

In [78]:
output = pd.DataFrame({'Id': test_df.Id,
                       'TARGET_5Yrs': test_preds})
output.to_csv('../data/submissions/SM_submission_2.csv', index=False)

In [79]:
output

Unnamed: 0,Id,TARGET_5Yrs
0,1,-43.665131
1,8194,-6.066978
2,3,-59.473992
3,8196,-170.258337
4,8197,0.938135
...,...,...
3794,8175,-153.602213
3795,8176,-17.263258
3796,8178,-31.915454
3797,8181,-160.822351
