# Initial Models

These are basic scikit-learn models used to test whether our initial EDA and data cleaning approaches were leading anywhere. We were also hoping that we could use these to guide us in creating our eventual CNN models. Unfortunately, none of them performed well, though surprisingly the KNN and decision tree regressors did perform better than we expected (though still poorly overall and in comparison to how CNNs perform). A summary of the results can be found [here](https://github.com/jcweaver/blackboxes/blob/master/deliverables/initial_modelling.pdf).

In [None]:
# Import the necessary packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Import SKLearn packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics

In [None]:
# Extract the lookup table and sample submission CSVs.
IdLookupTable = pd.read_csv('data/IdLookupTable.csv', header=0, sep=',', quotechar='"')

SampleSubmission = pd.read_csv('data/SampleSubmission.csv', header=0, sep=',', quotechar='"')


def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load pickles (each file handle is closed as soon as its object is read).
train_data = _load_pickle('data/train.p')

test_data = _load_pickle('data/test.p')

clean_train = _load_pickle('data/clean_train.p')

clean_test = _load_pickle('data/clean_test.p')

augmented_data = _load_pickle('data/augmented_train.p')

In [None]:
# Remove the bookkeeping columns from each frame.
# Note that clean_train drops 'level_0' where the other frames drop 'index'.
train_data = train_data.drop(columns=['index', 'check_sum'])
test_data = test_data.drop(columns=['index', 'check_sum'])
clean_train = clean_train.drop(columns=['level_0', 'check_sum'])
clean_test = clean_test.drop(columns=['index', 'check_sum'])

## Function to Plot Images and Keypoints, and Score Models

In [None]:
# def plot_img(data, indexes, columns=5, points=1):
    
#     # Determine size of image array
#     plt.figure(figsize = (15,10))
#     rows = len(indexes)//columns + 1
    
#     # Transform image strings into arrays
#     for index, value in enumerate(indexes):
#         #image_array = np.fromstring(data.loc[value, 'image'], sep = ' ').astype(int).reshape(96, 96)
#         image_array = data.loc[value, 'image'].reshape(96, 96)
#         # Optional add keypoints
#         if points == 1:
#             keypoints = train_data.loc[value].drop('image').values.astype(float).reshape(-1, 2)
#         else:
#             keypoints = []
            
#         # Plot figure matrix 
#         plt.subplot(rows, columns, index+1)
#         plt.title('Training Sample: {}'.format(index+1))
#         plt.axis('off')
#         plt.imshow(image_array, cmap='gray')
#         plt.tight_layout()
#         # Add keypoints
#         plt.scatter(keypoints[:, 0], keypoints[:, 1], s = 10, marker='.', c = 'red')
#     plt.show() 
    
#     return

In [None]:
# plot_img(train_data, range(20))

### Function to Get Model Performance Metrics

In [None]:
def regression_results(y_true, y_pred):
    """Print summary regression metrics for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values; passed together with ``y_true`` to
            the ``sklearn.metrics`` scoring functions.

    Returns:
        None. Each metric is printed rounded to 4 decimal places.
    """
    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    # The median absolute error was previously computed but never reported.
    print('MedAE: ', round(median_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

### Function to Plot Consecutive Images

In [None]:
def new_plot_img(images, labels, indexes, columns=5, points=1):
    """Plot the selected images in a grid, optionally overlaying keypoints.

    Args:
        images: Indexable collection of images; each selected entry is
            reshaped to 96x96 for display.
        labels: Indexable collection of keypoint vectors; each selected entry
            is reshaped to (-1, 2) and drawn as (x, y) pairs. Only read when
            ``points == 1``.
        indexes: Sized iterable of positions into ``images`` / ``labels``,
            plotted in iteration order.
        columns: Number of columns in the grid.
        points: Pass 1 to draw the keypoints; any other value shows the
            images only.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(15, 10))

    # Ceiling division: exactly enough rows for the images (the previous
    # ``len // columns + 1`` reserved a spare empty row whenever the count
    # was a multiple of ``columns``). Always keep at least one row.
    rows = max(1, -(-len(indexes) // columns))

    for position, value in enumerate(indexes, start=1):
        plt.subplot(rows, columns, position)
        plt.title('Training Sample: {}'.format(position))
        plt.axis('off')
        plt.imshow(images[value].reshape(96, 96), cmap='gray')

        # Optionally overlay the keypoints as red dots.
        if points == 1:
            keypoints = labels[value].reshape(-1, 2)
            plt.scatter(keypoints[:, 0], keypoints[:, 1], s=10, marker='.', c='red')

    # Lay the grid out once, after every subplot exists.
    plt.tight_layout()
    plt.show()

### Function to Plot Random Images

In [None]:
def new_random_img(images, labels, indexes, columns = 5, points=1):
    """Plot randomly chosen images in a grid, optionally overlaying keypoints.

    Args:
        images: Indexable collection of images; each chosen entry is reshaped
            to 96x96 for display.
        labels: Indexable collection of keypoint vectors; each chosen entry
            is reshaped to (-1, 2) and drawn as (x, y) pairs. Only read when
            ``points == 1``.
        indexes: Only its length is used: it sets how many images are drawn.
        columns: Number of columns in the grid.
        points: Pass 1 to draw the keypoints; any other value shows the
            images only.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Draw len(indexes) positions uniformly from [0, len(images)); positions
    # are drawn independently, so the same image can appear more than once.
    rand_list = np.random.randint(len(images), size=len(indexes))

    plt.figure(figsize=(15, 10))

    # Ceiling division: exactly enough rows for the images (the previous
    # ``len // columns + 1`` reserved a spare empty row whenever the count
    # was a multiple of ``columns``). Always keep at least one row.
    rows = max(1, -(-len(indexes) // columns))

    for position, value in enumerate(rand_list, start=1):
        plt.subplot(rows, columns, position)
        plt.title('Training Sample: {}'.format(position))
        plt.axis('off')
        plt.imshow(images[value].reshape(96, 96), cmap='gray')

        # Optionally overlay the keypoints as red dots.
        if points == 1:
            keypoints = labels[value].reshape(-1, 2)
            plt.scatter(keypoints[:, 0], keypoints[:, 1], s=10, marker='.', c='red')

    # Lay the grid out once, after every subplot exists.
    plt.tight_layout()
    plt.show()

## Prep Data

Uses a simple mean fill function. 

In [None]:
# Column roles: 'image' holds the pixel data; every other column is a target.
feature_col = 'image'
target_cols = train_data.columns.drop(feature_col).tolist()

# Mean-fill missing targets, column by column.
# NOTE(review): the means are taken over all rows, i.e. before the train/test
# split below, so held-out rows contribute to the fill values.
train_data[target_cols] = train_data[target_cols].fillna(
    train_data[target_cols].mean()
)

# Image dimensions (not referenced again in this cell).
width, height, channels = 96, 96, 1

# Stack the per-row image entries into one float array (no reshape is
# applied here) and pull out the matching target matrix.
train_images = np.asarray(train_data[feature_col].tolist(), dtype=float)
train_labels = train_data[target_cols].to_numpy()

# Scale pixel values by 1/255.
normalized_train_images = train_images / 255

# Hold out 10% of the rows; the fixed seed keeps the split repeatable.
(train_images, test_images,
 train_labels, test_labels) = train_test_split(
    normalized_train_images, train_labels, test_size=0.1, random_state=7
)

In [None]:
# Print the shape of each array produced by the train/test split, one per
# line, in the order: train images, train labels, test images, test labels.
print(
    *(arr.shape for arr in (train_images, train_labels, test_images, test_labels)),
    sep='\n',
)

# Raw Dataset with Mean Fill

## Multiple Linear Regression Models

### Ordinary Least Squares

The most basic multiple regression model.

In [None]:
# Fit ordinary least squares on the training split of the raw, mean-filled data.
LR1 = LinearRegression()
LR1_fit = LR1.fit(train_images, train_labels)

# Predict keypoints for the held-out split.
LR1_predict = LR1_fit.predict(test_images)

In [None]:
# Print hold-out regression metrics for the OLS model.
regression_results(test_labels, LR1_predict)

In [None]:
# Plot 20 randomly chosen hold-out images with the OLS-predicted keypoints
# overlaid; range(20) only determines how many images are drawn.
new_random_img(test_images, LR1_predict, range(20))

### Ridge (L2) Regression

Ridge regression is particularly useful to mitigate the problem of multicollinearity in linear regression, which commonly occurs in models with large numbers of parameters. In ridge regression, the cost function is altered by adding a penalty proportional to the sum of the squared coefficients (the L2 penalty). Ridge regression shrinks the coefficients, which helps to reduce model complexity and the effects of multicollinearity.

In [None]:
# Fit ridge regression (default settings) on the raw, mean-filled training split.
LR2 = Ridge()
LR2_fit = LR2.fit(train_images, train_labels)

# Predict keypoints for the held-out split.
LR2_predict = LR2_fit.predict(test_images)

In [None]:
# Print hold-out regression metrics for the ridge model.
regression_results(test_labels, LR2_predict)

In [None]:
# Plot 20 randomly chosen hold-out images with the ridge-predicted keypoints
# overlaid; range(20) only determines how many images are drawn.
new_random_img(test_images, LR2_predict, range(20))

### Lasso (L1) Regression

Lasso is short for least absolute shrinkage and selection operator. Instead of penalizing the squares of the coefficients, lasso penalizes their absolute values (the L1 penalty). Lasso regression not only helps in reducing over-fitting, but, because it can shrink some coefficients exactly to zero, it also helps with feature selection.

In [None]:
# Fit lasso regression (default settings) on the raw, mean-filled training split.
LR3 = Lasso()
LR3_fit = LR3.fit(train_images, train_labels)

# Predict keypoints for the held-out split.
LR3_predict = LR3_fit.predict(test_images)

In [None]:
# Print hold-out regression metrics for the lasso model.
regression_results(test_labels, LR3_predict)

## **Decision Tree Regressor**

A decision tree breaks down a dataset into smaller and smaller subsets while at the same time an associated decision tree is incrementally developed. The final result is a tree with decision nodes and leaf nodes. Though commonly used for classification problems, decision tree regression can also be achieved. When using a decision tree for classification problems, a metric like information gain can be used. To use a decision tree for regression however, an impurity metric that is suitable for continuous variables is used, such as weighted mean squared error.

In [None]:
# Fit a decision tree on the raw, mean-filled training split.
# random_state is fixed (same seed as the train/test split) so the fitted
# tree, and therefore the reported metrics, are reproducible between runs.
DT1 = DecisionTreeRegressor(random_state=7)
DT1_fit = DT1.fit(train_images, train_labels)

# Predict keypoints for the held-out split.
DT1_predict = DT1_fit.predict(test_images)

In [None]:
# Print hold-out regression metrics for the decision tree.
regression_results(test_labels, DT1_predict)

## **K-Nearest Neighbors Regressor**

The k-nearest neighbors algorithm (k-NN) is a non-parametric method that can be used for both classification and regression. In both cases, the input consists of the k closest training examples in the data set. The output depends on whether k-NN is used for classification or regression. In k-NN regression, the output is the property value for the object. This value is the average of the values of the k nearest neighbors.

### K=5 Nearest Neighbors

In [None]:
# Fit a 5-nearest-neighbors regressor on the raw, mean-filled training split.
KNR1 = KNeighborsRegressor(n_neighbors = 5)
KNR1_fit = KNR1.fit(train_images, train_labels)

# Predict keypoints for the held-out split.
KNR1_predict = KNR1_fit.predict(test_images)

In [None]:
# Print hold-out regression metrics for the K=5 model.
regression_results(test_labels, KNR1_predict)

### K=7 Nearest Neighbors

In [None]:
# Fit a 7-nearest-neighbors regressor on the raw, mean-filled training split.
KNR3 = KNeighborsRegressor(n_neighbors = 7)
KNR3_fit = KNR3.fit(train_images, train_labels)

# Predict keypoints for the held-out split.
KNR3_predict = KNR3_fit.predict(test_images)

In [None]:
# Print hold-out regression metrics for the K=7 model.
regression_results(test_labels, KNR3_predict)

## Random Forest Regression

Random forests are a supervised learning technique using ensemble learning for classification or regression problems. The trees in a random forest are built in parallel, without any interaction between them. The model constructs a multitude of decision trees during training and outputs the class that is the mode of the classes (classification) or the mean prediction (regression) of the individual trees. Random forests are meta-estimators (i.e. they combine the results of multiple predictions), aggregating many decision trees.

In [None]:
# Train model using 5 estimators (limited without a GPU, SKLearn does not support GPU functionality).
# random_state is fixed (same seed as the train/test split) so the forest,
# and therefore the reported metrics, are reproducible between runs.
RF = RandomForestRegressor(n_estimators=5, random_state=7)
RF_fit = RF.fit(train_images, train_labels)

# Predict keypoints for the held-out split.
RF_predict = RF_fit.predict(test_images)

In [None]:
# Print hold-out regression metrics for the random forest.
regression_results(test_labels, RF_predict)

# Dataset With Removed Duplicates and Mean Fill

Still uses mean fill to fill in NA values.

In [None]:
# Column roles: 'image' holds the pixel data; every other column is a target.
# NOTE(review): the Keras cell later in this notebook slices columns[1:-1]
# "to remove index", which suggests clean_train may still carry an index-like
# column that would be treated as a target here - confirm.
feature_col = 'image'
target_cols = clean_train.columns.drop(feature_col).tolist()

# Mean-fill missing targets, column by column.
# NOTE(review): the means are taken over all rows, i.e. before the train/test
# split below, so held-out rows contribute to the fill values.
clean_train[target_cols] = clean_train[target_cols].fillna(
    clean_train[target_cols].mean()
)

# Image dimensions (not referenced again in this cell).
width, height, channels = 96, 96, 1

# Stack the per-row image entries into one float array (no reshape is
# applied here) and pull out the matching target matrix.
clean_train_images = np.asarray(clean_train[feature_col].tolist(), dtype=float)
clean_train_labels = clean_train[target_cols].to_numpy()

# Scale pixel values by 1/255.
normalized_clean_images = clean_train_images / 255

# Hold out 10% of the rows; the fixed seed keeps the split repeatable.
(train_images2, test_images2,
 train_labels2, test_labels2) = train_test_split(
    normalized_clean_images, clean_train_labels, test_size=0.1, random_state=7
)

## Multiple Linear Regression Models

### Ordinary Least Squares

In [None]:
# Fit ordinary least squares on the training split of the de-duplicated data.
LR_1 = LinearRegression()
LR_1_fit = LR_1.fit(train_images2, train_labels2)

# Predict keypoints for the held-out split.
LR_1_predict = LR_1.predict(test_images2)

In [None]:
# Print hold-out regression metrics for the OLS model (de-duplicated data).
regression_results(test_labels2, LR_1_predict)

### Ridge (L2) Regression

In [None]:
# Fit ridge regression (default settings) on the de-duplicated training split.
LR_2 = Ridge()
LR_2_fit = LR_2.fit(train_images2, train_labels2)

# Predict keypoints for the held-out split.
LR_2_predict = LR_2.predict(test_images2)

In [None]:
# Print hold-out regression metrics for the ridge model (de-duplicated data).
regression_results(test_labels2, LR_2_predict)

### Lasso (L1) Regression

In [None]:
# Fit lasso regression (default settings) on the de-duplicated training split.
LR_3 = Lasso()
LR_3_fit = LR_3.fit(train_images2, train_labels2)

# Predict keypoints for the held-out split.
LR_3_predict = LR_3_fit.predict(test_images2)

In [None]:
# Print hold-out regression metrics for the lasso model (de-duplicated data).
regression_results(test_labels2, LR_3_predict)

## Decision Tree Regressor

In [None]:
# Fit a decision tree on the de-duplicated training split.
# random_state is fixed (same seed as the train/test split) so the fitted
# tree, and therefore the reported metrics, are reproducible between runs.
DT2 = DecisionTreeRegressor(random_state=7)
DT2_fit = DT2.fit(train_images2, train_labels2)

# Predict keypoints for the held-out split.
DT2_predict = DT2.predict(test_images2)

In [None]:
# Print hold-out regression metrics for the decision tree (de-duplicated data).
regression_results(test_labels2, DT2_predict)

## K-Nearest Neighbors Regressor

### K=5 Nearest Neighbors

In [None]:
# Fit a 5-nearest-neighbors regressor on the de-duplicated training split.
KNR_1 = KNeighborsRegressor(n_neighbors = 5)
KNR_1_fit = KNR_1.fit(train_images2, train_labels2)

# Predict keypoints for the held-out split.
KNR_1_predict = KNR_1.predict(test_images2)

In [None]:
# Print hold-out regression metrics for the K=5 model (de-duplicated data).
regression_results(test_labels2, KNR_1_predict)

### K=7 Nearest Neighbors

In [None]:
# Fit a 7-nearest-neighbors regressor on the de-duplicated training split.
KNR_2 = KNeighborsRegressor(n_neighbors = 7)
KNR_2_fit = KNR_2.fit(train_images2, train_labels2)

# Predict keypoints for the held-out split.
KNR_2_predict = KNR_2.predict(test_images2)

In [None]:
# Print hold-out regression metrics for the K=7 model (de-duplicated data).
regression_results(test_labels2, KNR_2_predict)

## Random Forest Regressor

In [None]:
# Fit a 5-tree random forest on the de-duplicated training split.
# random_state is fixed (same seed as the train/test split) so the forest,
# and therefore the reported metrics, are reproducible between runs.
RF2 = RandomForestRegressor(n_estimators=5, random_state=7)
RF2_fit = RF2.fit(train_images2, train_labels2)

# Predict keypoints for the held-out split.
RF2_predict = RF2_fit.predict(test_images2)

In [None]:
# Print hold-out regression metrics for the random forest (de-duplicated data).
regression_results(test_labels2, RF2_predict)

# Augmented Dataset

This uses the linear model approach to fill in missing values, as described in the EDA notebook. 

In [None]:
# Column roles: every column of augmented_data is treated as a target.
feature_col = 'image'
target_cols = list(augmented_data.columns)

# Image dimensions (not referenced again in this cell).
width, height, channels = 96, 96, 1

# Mean-fill whatever NA's remain - there were still some left with the first
# version of the linear model.
augmented_data = augmented_data.fillna(augmented_data.mean())

# Target matrix for the augmented labels.
aug_train_labels = augmented_data.to_numpy()

# Hold out 10% of the rows with a fixed seed.
# The images are normalized_train_images from the raw-data prep cell, so that
# cell must have run first.
# NOTE(review): assumes augmented_data has the same rows, in the same order,
# as the raw training frame - confirm.
(train_images3, test_images3,
 train_labels3, test_labels3) = train_test_split(
    normalized_train_images, aug_train_labels, test_size=0.1, random_state=7
)

## Multiple Linear Regression Models

### Ordinary Least Squares

In [None]:
# Fit ordinary least squares against the augmented labels.
LR_1_2 = LinearRegression()
LR_1_2_fit = LR_1_2.fit(train_images3, train_labels3)

# Predict keypoints for the held-out split.
LR_1_2_predict = LR_1_2_fit.predict(test_images3)

In [None]:
# Print hold-out regression metrics for the OLS model (augmented labels).
regression_results(test_labels3, LR_1_2_predict)

### Ridge (L2) Regression

In [None]:
# Fit ridge regression (default settings) against the augmented labels.
LR_2_2 = Ridge()
LR_2_2_fit = LR_2_2.fit(train_images3, train_labels3)

# Predict keypoints for the held-out split.
LR_2_2_predict = LR_2_2_fit.predict(test_images3)

In [None]:
# Print hold-out regression metrics for the ridge model (augmented labels).
regression_results(test_labels3, LR_2_2_predict)

### Lasso (L1) Regression

In [None]:
# Fit lasso regression (default settings) against the augmented labels.
LR_3_2 = Lasso()
LR_3_2_fit = LR_3_2.fit(train_images3, train_labels3)

# Predict keypoints for the held-out split.
LR_3_2_predict = LR_3_2_fit.predict(test_images3)

In [None]:
# Print hold-out regression metrics for the lasso model (augmented labels).
regression_results(test_labels3, LR_3_2_predict)

## Decision Tree Regressor

In [None]:
# Fit a decision tree against the augmented labels.
# random_state is fixed (same seed as the train/test split) so the fitted
# tree, and therefore the reported metrics, are reproducible between runs.
DT3 = DecisionTreeRegressor(random_state=7)
DT3_fit = DT3.fit(train_images3, train_labels3)

# Predict keypoints for the held-out split.
DT3_predict = DT3_fit.predict(test_images3)

In [None]:
# Print hold-out regression metrics for the decision tree (augmented labels).
regression_results(test_labels3, DT3_predict)

## K-Nearest Neighbors Regressor

### K=5 Nearest Neighbors

In [None]:
# Fit a 5-nearest-neighbors regressor against the augmented labels.
KNR_1_2 = KNeighborsRegressor(n_neighbors = 5)
KNR_1_2_fit = KNR_1_2.fit(train_images3, train_labels3)

# Predict keypoints for the held-out split.
KNR_1_2_predict = KNR_1_2_fit.predict(test_images3)

In [None]:
# Print hold-out regression metrics for the K=5 model (augmented labels).
regression_results(test_labels3, KNR_1_2_predict)

### K=7 Nearest Neighbors

In [None]:
# Fit a 7-nearest-neighbors regressor against the augmented labels.
KNR_1_3 = KNeighborsRegressor(n_neighbors = 7)
KNR_1_3_fit = KNR_1_3.fit(train_images3, train_labels3)

# Predict keypoints for the held-out split.
KNR_1_3_predict = KNR_1_3_fit.predict(test_images3)

In [None]:
# Print hold-out regression metrics for the K=7 model (augmented labels).
regression_results(test_labels3, KNR_1_3_predict)

## Random Forest Regressor

In [None]:
# Fit a 5-tree random forest against the augmented labels.
# random_state is fixed (same seed as the train/test split) so the forest,
# and therefore the reported metrics, are reproducible between runs.
RF3 = RandomForestRegressor(n_estimators=5, random_state=7)
RF3_fit = RF3.fit(train_images3, train_labels3)

# Predict keypoints for the held-out split.
RF3_predict = RF3_fit.predict(test_images3)

In [None]:
# Print hold-out regression metrics for the random forest (augmented labels).
regression_results(test_labels3, RF3_predict)

## Initial Simple Neural Network Model using Keras

In [None]:
# The Keras model is meant to train only on rows with no missing values,
# so every row containing an NA is dropped first.
# NOTE(review): if the "Removed Duplicates and Mean Fill" prep cell has
# already run, clean_train's target columns were mean-filled there, so
# dropna() may remove nothing - confirm this is intended.
print("Before dropping NA", clean_train.shape)
temp_df = clean_train.dropna()
print("After dropping NA", temp_df.shape)

# Stack the per-row images, divide by 255 (a 0 to 1 value, assuming 0-255
# pixel values) and store as float32.
X = (np.vstack(temp_df['image'].values) / 255).astype(np.float32)

# Targets are every column except the first and the last (the original note
# expects those two to be the index and the image), rescaled with
# (value - 48) / 48, i.e. to -1..1 for coordinates in 0..96, as float32.
y = ((temp_df[temp_df.columns[1:-1]].values - 48) / 48).astype(np.float32)

In [None]:
# Split into train (80%) and dev (20%) sets with a fixed seed.
# Note: this rebinds train_images / train_labels, replacing the arrays that
# the scikit-learn sections above created under the same names.
train_images, dev_images, train_labels, dev_labels = train_test_split(X, y, test_size=0.2, random_state=7)

In [None]:
# Model 1: a single hidden layer, following
# https://elix-tech.github.io/ja/2016/06/02/kaggle-facial-keypoints-ja.html
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD

model = Sequential([
    # Each image is 96x96, so the input has 9216 values; 100 hidden neurons.
    Dense(100, input_dim=9216),
    # ReLU activation, i.e. max(0.0, input).
    Activation('relu'),
    # 30 linear outputs (plot_sample reads them as 15 (x, y) pairs).
    Dense(30),
])

# Optimizer: stochastic gradient descent with Nesterov momentum.
# NOTE(review): newer Keras releases name this argument `learning_rate`;
# `lr` may be rejected there - confirm against the installed version.
sgd = SGD(lr=0.01, momentum=0.9, nesterov=True)
model.compile(loss="mean_squared_error", optimizer=sgd)

# Train for 20 epochs; a further 20% of the training split is held out by
# Keras for the per-epoch validation loss.
hist = model.fit(train_images, train_labels, epochs=20, validation_split=0.2)

In [None]:
# Plot the per-epoch training and validation loss recorded in hist.history
# by model.fit.
plt.plot(hist.history['loss'], linewidth=3, label='train')
plt.plot(hist.history['val_loss'], linewidth=3, label='valid')
plt.grid()
plt.legend()
plt.xlabel('epoch')
plt.ylabel('loss')
# Optional axis tweaks, currently disabled:
#plt.ylim(1e-3, 1e-2)
#plt.yscale('log')
plt.show()

In [None]:
def plot_sample(X,y,axis):
    """Draw one image on ``axis`` and overlay its keypoints.

    Args:
        X: Image pixels; reshaped to 96x96 for display.
        y: Flat keypoint vector laid out as x0, y0, x1, y1, ...; each value
            is mapped to pixel coordinates with ``value * 48 + 48``. Every
            complete (x, y) pair is drawn; a trailing unpaired value is
            ignored.
        axis: Axes-like object providing ``imshow`` and ``scatter``.
    """
    axis.imshow(X.reshape(96,96),cmap='gray')
    # Derive the pair count from y rather than hard-coding 15 keypoints.
    for i in range(len(y) // 2):
        axis.scatter(y[i*2]*48+48,y[i*2+1]*48+48,color="green",marker="+")

In [None]:
# Plot the model's predicted keypoints on the first dev images.
y_pred = model.predict(dev_images)

fig = plt.figure(figsize=(12, 12))
fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

# Show up to 16 samples in a 4x4 grid. The overlay uses y_pred so the figure
# reflects the model output; previously the ground-truth dev_labels were
# drawn here, so the predictions never appeared in the plot.
for i in range(min(16, len(dev_images))):
    axis = fig.add_subplot(4, 4, i+1, xticks=[], yticks=[])
    plot_sample(dev_images[i], y_pred[i], axis)

plt.show()

In [None]:
# Print regression metrics for the Keras model on the dev split.
# Both arrays are in the rescaled (value - 48) / 48 target space.
regression_results(dev_labels,y_pred)