# Overfitting

In [15]:
import pandas as pd
import numpy as np

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Activation

In [17]:
# Load the earnings "meet or beat" dataset and preview the resulting frame
df = pd.read_csv(
    filepath_or_buffer='../Resources/meet_or_beat.csv',
)
df

Unnamed: 0,EPS,forecasted_eps,noOfEsts,after_total_returns,before_total_returns
0,2.01,1.67,11.0,0.051444,0.018585
1,0.17,0.19,6.0,0.112955,-0.000510
2,-0.07,0.14,4.0,0.077167,-0.046104
3,0.48,0.51,8.0,-0.006130,-0.004899
4,-0.24,-0.27,9.0,0.089762,-0.025466
...,...,...,...,...,...
71963,0.31,0.30,4.0,0.006035,0.016854
71964,-0.65,-0.66,3.0,0.179327,-0.039052
71965,0.27,0.28,3.0,0.059002,0.141599
71966,0.11,0.10,4.0,-0.035755,0.026346


### Preparing the Data

In [18]:
# Bin reported EPS into 5 quantile buckets. With `labels=False`, qcut returns
# integer bin indicators starting at 0, so add 1 to number the buckets 1-5.
df['earnings_quantile'] = pd.qcut(
    df['EPS'],
    q=5,
    labels=False,
).add(1)
# Sanity check: the bucket sizes should be roughly equal
# (small differences caused by tied EPS values are acceptable)
df.loc[:, 'earnings_quantile'].value_counts()

earnings_quantile
2    14676
1    14555
5    14268
4    14251
3    14218
Name: count, dtype: int64

In [19]:
# The earnings bucket ("quantile") of each row is the target (`y`) to predict
y_quantile = df.earnings_quantile

In [20]:
# Count the distinct bucket values; kept for later use
# (it sizes the one-hot encoding of the target)
number_of_classes = y_quantile.unique().size
number_of_classes

5

In [21]:
# Map the bucket values onto consecutive integer labels starting at 0
from sklearn.preprocessing import LabelEncoder
# `fit` returns the fitted encoder, so construction and fitting can be chained
encoder = LabelEncoder().fit(y_quantile)
encoded_y = encoder.transform(y_quantile)
encoded_y

array([4, 2, 1, ..., 2, 1, 0], dtype=int64)

In [22]:
# One-hot encode the integer labels: one column per class
# NOTE(review): this imports from `tensorflow.keras`, while the model imports
# at the top of the notebook use standalone `keras` - confirm both resolve to
# the same Keras installation.
from tensorflow.keras.utils import to_categorical
y_categorical = to_categorical(
    encoded_y,
    num_classes=number_of_classes,
)

In [23]:
# Select the predictor (X) columns and preview the first three rows
X = df[[
    'forecasted_eps',
    'before_total_returns',
    'noOfEsts',
]]
X.head(n=3)

Unnamed: 0,forecasted_eps,before_total_returns,noOfEsts
0,1.67,0.018585,11.0
1,0.19,-0.00051,6.0
2,0.14,-0.046104,4.0


In [24]:
# Split predictors and one-hot targets into training and testing sets.
# `random_state=1` fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_categorical,
    random_state=1,
)

In [25]:
# Preview the one-hot encoded training targets (`y_train`):
# each row has a single 1 marking that sample's class
y_train

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.]])

### Building the Model