To predict how well a user will climb a month from now, I tried out four neural networks. 

*   Model 1:  I had one layer with the features I thought would be most important. 
*   Model 2: I added more features, which improved the performance of the model. 
*   Model 3: I standardized the features first. This did not improve the model.
*   Model 4: I added an additional layer. This did not improve the model.

Training each model takes roughly half an hour using Google's GPU. I've decided to use the Random Forest Regressor, since it is much faster to train while getting a similar baseline RMSE score.


In [1]:
# Make sure the GPU can be found
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [2]:
tf.test.gpu_device_name() 

'/device:GPU:0'

In [3]:
# Import libraries
from tensorflow import keras
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [4]:
# Mount google drive to access data
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
open('/content/gdrive/My Drive/data/train_data_regr.csv').read()

',Date,Name,Month,month_year,Gender,Age,Rating,Style,Lead Style,Pitches,numerical_difficulty,max_month,avg_month,sport_ticks_this_month,first_tick,months_since_first_tick,last_month_max,last_month_ticks,last_month_diff,numerical_gender,predictions_m3\n21500,2019-03-30,Aaron Cassebeer,3,2019-03-01,Male,36.0,5.12a,Lead,Redpoint,1.0,18.0,18.0,18.0,1,2010-05-01,106,22.0,4.0,14.0,1,\n36653,2011-12-30,Adam P,12,2011-12-01,Male,39.0,5.10b,,,1.0,11.0,16.0,10.785714285714288,28,2000-08-01,136,10.0,1.0,15.0,1,\n2782,2017-06-27,Neillong Feller,6,2017-06-01,Male,30.0,5.11b,,,1.0,15.0,15.0,11.071428571428573,14,2014-01-01,41,15.0,7.0,8.0,1,\n38660,2020-09-26,Milo Lang,9,2020-09-01,Male,41.0,5.10c,TR,,1.0,12.0,12.0,9.75,8,2020-07-01,2,8.0,2.0,10.0,1,\n14064,2004-07-05,Matt Chan,7,2004-07-01,Male,43.0,5.11a,,,3.0,14.0,14.0,14.0,1,2003-07-01,12,12.0,7.0,7.0,1,\n2257,2019-06-24,Kevin L,6,2019-06-01,Male,29.0,5.12a,,,1.0,18.0,18.0,13.714285714285715,7,2015-03-01,51,16.0,5.0,13.0,1,10.711\n36143,2018-04-

In [6]:
df = pd.read_csv("/content/gdrive/My Drive/data/train_data_regr.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Name,Month,month_year,Gender,Age,Rating,Style,Lead Style,Pitches,numerical_difficulty,max_month,avg_month,sport_ticks_this_month,first_tick,months_since_first_tick,last_month_max,last_month_ticks,last_month_diff,numerical_gender,predictions_m3
0,21500,2019-03-30,Aaron Cassebeer,3,2019-03-01,Male,36.0,5.12a,Lead,Redpoint,1.0,18.0,18.0,18.0,1,2010-05-01,106,22.0,4.0,14.0,1,
1,36653,2011-12-30,Adam P,12,2011-12-01,Male,39.0,5.10b,,,1.0,11.0,16.0,10.785714,28,2000-08-01,136,10.0,1.0,15.0,1,
2,2782,2017-06-27,Neillong Feller,6,2017-06-01,Male,30.0,5.11b,,,1.0,15.0,15.0,11.071429,14,2014-01-01,41,15.0,7.0,8.0,1,
3,38660,2020-09-26,Milo Lang,9,2020-09-01,Male,41.0,5.10c,TR,,1.0,12.0,12.0,9.75,8,2020-07-01,2,8.0,2.0,10.0,1,
4,14064,2004-07-05,Matt Chan,7,2004-07-01,Male,43.0,5.11a,,,3.0,14.0,14.0,14.0,1,2003-07-01,12,12.0,7.0,7.0,1,


# Model 1: Baseline

In [10]:
X = df[['last_month_max', 'last_month_ticks']]
y = df['max_month']

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Below is a baseline model with a fully connected hidden layer. No activation function is used for the output layer because it's a regression problem. A mean squared error loss function is optimized.

In [None]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(2, input_dim=2, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [29]:
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

In [13]:
kfold = KFold(n_splits=5)

In [25]:
results = cross_val_score(estimator, X_train, y_train, cv=kfold)


In [68]:
results

array([ -9.88522053, -16.49189377, -10.10259628,  -9.67506027,
        -9.79685688])

In [69]:
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Baseline: -11.19 (2.65) MSE


Baseline: -11.19 (2.65) MSE -> 3.34 RMSE

# Model 2: More features

In [7]:
df.columns

Index(['Unnamed: 0', 'Date', 'Name', 'Month', 'month_year', 'Gender', 'Age',
       'Rating', 'Style', 'Lead Style', 'Pitches', 'numerical_difficulty',
       'max_month', 'avg_month', 'sport_ticks_this_month', 'first_tick',
       'months_since_first_tick', 'last_month_max', 'last_month_ticks',
       'last_month_diff', 'numerical_gender', 'predictions_m3'],
      dtype='object')

In [8]:
X = df[['last_month_max', 'last_month_ticks', 'numerical_gender', 'Month', 'months_since_first_tick']]
y = df['max_month']

In [9]:
def more_features_model():
    # create model
    model = Sequential()
    model.add(Dense(5, input_dim=5, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [10]:
estimator2 = KerasRegressor(build_fn=more_features_model, epochs=100, batch_size=5, verbose=0)

In [17]:
kfold = KFold(n_splits=5)

In [19]:
model = more_features_model()

In [12]:
results2 = cross_val_score(estimator2, X, y, cv=kfold)

In [13]:
results2

array([ -9.77653503,  -9.68037701, -10.12135029,  -9.86698437,
        -9.8209734 ])

In [15]:
print("More Features: %.2f (%.2f) MSE" % (results2.mean(), results2.std()))

More Features: -9.85 (0.15) MSE


More Features: -9.70 (2.65) MSE -> 3.1 RMSE

# Model 3: Standardized Features

In [74]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [75]:
def more_features_standardized_model():
    # create model
    model = Sequential()
    model.add(Dense(5, input_dim=5, kernel_initializer='normal', activation='relu'))
    model.add(Dense(5, input_dim=5, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [76]:
estimators = []

In [77]:
estimators.append(('standardize', StandardScaler()))


In [78]:
estimators.append(('mlp', KerasRegressor(build_fn=more_features_standardized_model, epochs=50, batch_size=5, verbose=0)))


In [79]:
pipeline = Pipeline(estimators)

In [81]:
results3 = cross_val_score(pipeline, X, y, cv=kfold)


In [82]:
results3

array([-9.66740322, -9.59107113, -9.72552013, -9.73739815, -9.5126524 ])

In [84]:
print("Standardized Features: %.2f (%.2f) MSE" % (results3.mean(), results.std()))

Standardized Features: -9.65 (2.65) MSE


In [None]:
More Features: -9.65 (2.65) MSE -> 3.1 RMSE

# Model 4: More Layers

In [14]:
X = df[['last_month_max', 'last_month_ticks', 'numerical_gender', 'Month', 'months_since_first_tick']]
y = df['max_month']

In [15]:
def more_layers_model():
    # create model
    model = Sequential()
    model.add(Dense(5, input_dim=5, kernel_initializer='normal', activation='relu'))
    model.add(Dense(3, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [16]:
estimator4 = KerasRegressor(build_fn=more_layers_model, epochs=100, batch_size=5, verbose=0)

In [17]:
results4 = cross_val_score(estimator4, X, y, cv=kfold)

In [18]:
results4

array([-9.64668465, -9.5955019 , -9.72338104, -9.76371479, -9.77097511])

In [20]:
print("More Layers: %.2f (%.2f) MSE" % (results4.mean(), results4.std()))

More Layers: -9.70 (0.07) MSE
