# Initial Models

In [None]:
# from google.colab import drive

# drive.mount('/content/drive')

In [None]:
# Import the necessary packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Import SKLearn packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics

In [None]:
# Extract data

IdLookupTable = pd.read_csv('data/IdLookupTable.csv', header=0, sep=',', quotechar='"')

SampleSubmission = pd.read_csv('data/SampleSubmission.csv', header=0, sep=',', quotechar='"')


def _load_pickle(path):
    """Unpickle the object stored at `path` and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this project's own preprocessing steps.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load pickles

train_data = _load_pickle('data/train.p')

test_data = _load_pickle('data/test.p')

clean_train = _load_pickle('data/clean_train.p')

clean_test = _load_pickle('data/clean_test.p')

augmented_data = _load_pickle('data/augmented_train.p')

In [None]:
# Drop the extra bookkeeping columns from each frame.
# clean_train carries 'level_0' where the other three frames carry 'index'.
# This cell is not idempotent: a second run raises KeyError because the
# columns are already gone.

train_data = train_data.drop(columns=['index', 'check_sum'])
test_data = test_data.drop(columns=['index', 'check_sum'])
clean_train = clean_train.drop(columns=['level_0', 'check_sum'])
clean_test = clean_test.drop(columns=['index', 'check_sum'])

## Function to plot images and keypoints

In [None]:
# def plot_img(data, indexes, columns=5, points=1):
    
#     # Determine size of image array
#     plt.figure(figsize = (15,10))
#     rows = len(indexes)//columns + 1
    
#     # Transform image strings into arrays
#     for index, value in enumerate(indexes):
#         #image_array = np.fromstring(data.loc[value, 'image'], sep = ' ').astype(int).reshape(96, 96)
#         image_array = data.loc[value, 'image'].reshape(96, 96)
#         # Optional add keypoints
#         if points == 1:
#             keypoints = train_data.loc[value].drop('image').values.astype(float).reshape(-1, 2)
#         else:
#             keypoints = []
            
#         # Plot figure matrix 
#         plt.subplot(rows, columns, index+1)
#         plt.title('Training Sample: {}'.format(index+1))
#         plt.axis('off')
#         plt.imshow(image_array, cmap='gray')
#         plt.tight_layout()
#         # Add keypoints
#         plt.scatter(keypoints[:, 0], keypoints[:, 1], s = 10, marker='.', c = 'red')
#     plt.show() 
    
#     return

In [None]:
# plot_img(train_data, range(20))

## Visualize values with missing datapoints:

In [None]:
# indexes = np.random.choice(train_data[train_data.isnull().any(axis=1)].index, 10)
# plot_img(train_data, indexes)

Not only are there missing data points but there are also blurred images. 

It also seems that some of these samples are missing only part of their data, and that some of the images are not of real people.

## Test out some simple models for performance

In [None]:
def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    Prints explained variance, R squared, MAE, MSE and RMSE, each rounded to
    four decimal places. Nothing is returned.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, same shape as `y_true`.
    """
    # Only metrics that are actually reported are computed.
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    # Named r2_value so it does not shadow the notebook-level r2() helper.
    r2_value = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))
    print('r2: ', round(r2_value,4))
    print('MAE: ', round(mae,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [None]:
def r2(y_true, y_pred):
    """Return the R squared score of `y_pred` against `y_true` (sklearn `r2_score`)."""
    return metrics.r2_score(y_true, y_pred)

In [None]:
def new_plot_img(images, labels, indexes, columns=5, points=1):
    """Plot a grid of 96x96 grayscale images, optionally overlaying keypoints.

    Parameters
    ----------
    images : indexable of arrays
        `images[i]` must be reshapeable to (96, 96).
    labels : indexable of arrays
        `labels[i]` must be reshapeable to (-1, 2); column 0 is plotted as x
        and column 1 as y. Only read when `points == 1`.
    indexes : sequence of int
        Positions in `images` / `labels` to draw, one subplot each.
    columns : int
        Number of subplots per row.
    points : int
        1 to overlay the keypoints in red; any other value shows images only.
    """
    n_items = len(indexes)

    # Determine size of image array
    plt.figure(figsize = (15,10))
    # Ceiling division: just enough rows for every image (at least one), so an
    # exact multiple of `columns` no longer leaves an empty trailing row.
    rows = max(1, -(-n_items // columns))

    for index, value in enumerate(indexes):
        image_array = images[value].reshape(96, 96)

        # Plot figure matrix
        plt.subplot(rows, columns, index + 1)
        plt.title('Training Sample: {}'.format(index+1))
        plt.axis('off')
        plt.imshow(image_array, cmap='gray')

        # Optional add keypoints
        if points == 1:
            keypoints = labels[value].reshape(-1, 2)
            plt.scatter(keypoints[:, 0], keypoints[:, 1], s = 10, marker = '.', c = 'red')

    # Lay the grid out once, after every subplot exists, instead of on each
    # iteration.
    if n_items:
        plt.tight_layout()
    plt.show()

    return

In [None]:
def new_random_img(images, labels, indexes, columns = 5, points=1):
    """Plot randomly chosen images (optionally with keypoints) in a grid.

    Parameters
    ----------
    images, labels : indexable of arrays
        Same requirements as for `new_plot_img`.
    indexes : sized
        Used only for its length: that many positions are drawn at random from
        `range(len(images))`. Draws are independent, so the same image can
        appear more than once.
    columns : int
        Number of subplots per row.
    points : int
        1 to overlay the keypoints; any other value shows images only.
    """
    rand_list = np.random.randint(len(images), size=len(indexes))

    # The drawing logic is identical to new_plot_img, so reuse it rather than
    # keeping a second copy in sync.
    new_plot_img(images, labels, list(rand_list), columns=columns, points=points)

    return

## Prep Data

In [None]:
# Define feature and target columns: 'image' is the feature, every other
# column of train_data is a target.
feature_col, target_cols = 'image', list(train_data.drop('image', axis = 1).columns)

# Fill NA's with mean of column
# NOTE(review): the means come from the full frame, before the split below, so
# imputed values also land in the held-out labels - confirm this is intended.
train_data[target_cols] = train_data[target_cols].fillna(train_data[target_cols].mean())

# Specify image dimensions
# NOTE(review): width/height/channels are not read by any visible cell.
width  = 96
height = 96
channels = 1

# Create image array in numpy (reshaped)
# Presumably each 'image' entry is a flat sequence of 96*96 pixel values, giving
# one row per sample - TODO confirm against the pickled data.
train_images = np.array(train_data[feature_col].tolist(), dtype = 'float')
train_labels = train_data[target_cols].to_numpy()

# (Optional) Normalize?
# Presumably pixel values are 0-255, so this scales them into [0, 1] - verify.
# normalized_train_images is reused by the augmented-data section.
normalized_train_images = train_images/255

# Prepare train-test split
# 90/10 split with a fixed seed. train_images / train_labels are rebound here
# from the full arrays to the training portion only.
train_images, test_images, train_labels, test_labels = train_test_split(normalized_train_images, train_labels, test_size=0.1, random_state=7)

In [None]:
# Check shapes of train test datasets
# Printed in the order: train images, train labels, test images, test labels.

for split in (train_images, train_labels, test_images, test_labels):
    print(split.shape)

In [None]:
## Define some lists for performance scoring
# One list per model; later cells append single-entry dicts that map a
# dataset tag to that model's R squared score.

(OLS_list, ridge_list, lasso_list, DT_list,
 KNN_list, KNN_list1, RF_list) = ([] for _ in range(7))

In [None]:
# Empty every score list in place, so re-running the model cells does not
# stack duplicate entries on top of earlier ones.
for scores in (OLS_list, ridge_list, lasso_list, DT_list, KNN_list, KNN_list1, RF_list):
    scores.clear()

## **Multiple Linear Regression**

### Try OLS

In [None]:
# Ordinary least squares on the raw (mean-imputed) split: fit on the training
# images/labels, then predict the labels of the held-out images.
LR1 = LinearRegression()
LR1_fit = LR1.fit(train_images, train_labels)
LR1_predict = LR1_fit.predict(test_images)

In [None]:
regression_results(test_labels, LR1_predict)

In [None]:
OLS_list.append({'raw': r2(test_labels, LR1_predict)})

In [None]:
new_random_img(test_images, LR1_predict, range(20))

### Try Ridge

In [None]:
# Ridge regression with sklearn's default settings on the raw split.
LR2 = Ridge()
LR2_fit = LR2.fit(train_images, train_labels)
LR2_predict = LR2_fit.predict(test_images)

In [None]:
regression_results(test_labels, LR2_predict)

In [None]:
ridge_list.append({'raw':r2(test_labels, LR2_predict)})

In [None]:
new_random_img(test_images, LR2_predict, range(20))

### Try Lasso

In [None]:
# Lasso regression with sklearn's default settings on the raw split.
LR3 = Lasso()
LR3_fit = LR3.fit(train_images, train_labels)
LR3_predict = LR3_fit.predict(test_images)

In [None]:
regression_results(test_labels, LR3_predict)

In [None]:
lasso_list.append({'raw': r2(test_labels, LR3_predict)})

In [None]:
new_random_img(test_images, LR3_predict, range(20))

## **DT Regressor**

In [None]:
# Decision tree on the raw split. random_state is pinned (same seed as the
# train/test split) so the fitted tree, and the scores reported from it, are
# reproducible between runs.
DT1 = DecisionTreeRegressor(random_state=7)
DT1_fit = DT1.fit(train_images, train_labels)
DT1_predict = DT1_fit.predict(test_images)

In [None]:
regression_results(test_labels, DT1_predict)

In [None]:
DT_list.append({'raw': r2(test_labels, DT1_predict)})

In [None]:
new_random_img(test_images, DT1_predict, range(20))

## **KNN Regressor**

In [None]:
# k-nearest-neighbours regression (k = 5) on the raw split.
KNR1 = KNeighborsRegressor(n_neighbors = 5)
KNR1_fit = KNR1.fit(train_images, train_labels)
KNR1_predict = KNR1_fit.predict(test_images)

In [None]:
regression_results(test_labels, KNR1_predict)

In [None]:
KNN_list.append({'raw': r2(test_labels, KNR1_predict)})

In [None]:
# Same model as KNR1 but with k = 7 neighbours; its score goes to KNN_list1.
KNR3 = KNeighborsRegressor(n_neighbors = 7)
KNR3_fit = KNR3.fit(train_images, train_labels)
KNR3_predict = KNR3_fit.predict(test_images)

In [None]:
regression_results(test_labels, KNR3_predict)

In [None]:
KNN_list1.append({'raw': r2(test_labels, KNR3_predict)})

## **Random Forest Regression**

In [None]:
# Small random forest (5 trees) on the raw split. random_state is pinned (same
# seed as the train/test split) so bootstrap sampling, and therefore the
# reported scores, are reproducible between runs.
RF = RandomForestRegressor(n_estimators=5, random_state=7)
RF_fit = RF.fit(train_images, train_labels)
RF_predict = RF_fit.predict(test_images)

In [None]:
regression_results(test_labels, RF_predict)

In [None]:
RF_list.append({'raw': r2(test_labels, RF_predict)})

## **MLP Regression**

In [None]:
#MLPR1 = MLPRegressor(hidden_layer_sizes = 50, activation = 'relu', solver = 'adam', alpha = 0.001, batch_size = 'auto')
#MLPR1_fit = MLPR1.fit(train_images, train_labels)
#MLPR1_predict = MLPR1_fit.predict(test_images)

In [None]:
#regression_results(test_labels, MLPR1_predict)

## Try using Dataset with Removed Duplicates

### Prep Data

In [None]:
# Define feature and target columns: 'image' is the feature, every other
# column of clean_train is a target.
feature_col, target_cols = 'image', list(clean_train.drop('image', axis = 1).columns)

# Fill NA's with mean of column
# NOTE(review): as in the raw-data section, the means are computed before the
# split, so imputed values also land in the held-out labels.
clean_train[target_cols] = clean_train[target_cols].fillna(clean_train[target_cols].mean())

# Specify image dimensions
# NOTE(review): width/height/channels are not read by any visible cell.
width  = 96
height = 96
channels = 1

# Create image array in numpy (reshaped)
clean_train_images = np.array(clean_train[feature_col].tolist(), dtype = 'float')
clean_train_labels = clean_train[target_cols].to_numpy()

# (Optional) Normalize?
# Presumably pixel values are 0-255, so this scales them into [0, 1] - verify.
normalized_clean_images = clean_train_images/255

# Prepare train-test split
# 90/10 split with the same seed as the raw-data section; results are kept in
# the *2 variables so the raw split stays available.
train_images2, test_images2, train_labels2, test_labels2 = train_test_split(normalized_clean_images, clean_train_labels, test_size=0.1, random_state=7)

## **Multiple Linear Regression**

### Try OLS

In [None]:
# Ordinary least squares on the de-duplicated split. Prediction goes through
# the handle returned by fit() (the same estimator object), as in the raw-data
# and augmented-data cells.
LR_1 = LinearRegression()
LR_1_fit = LR_1.fit(train_images2, train_labels2)
LR_1_predict = LR_1_fit.predict(test_images2)

In [None]:
regression_results(test_labels2, LR_1_predict)

In [None]:
OLS_list.append({'dp_rm': r2(test_labels2, LR_1_predict)})

### Try Ridge

In [None]:
# Ridge regression (sklearn defaults) on the de-duplicated split. Prediction
# goes through the handle returned by fit(), which is the same estimator object.
LR_2 = Ridge()
LR_2_fit = LR_2.fit(train_images2, train_labels2)
LR_2_predict = LR_2_fit.predict(test_images2)

In [None]:
regression_results(test_labels2, LR_2_predict)

In [None]:
ridge_list.append({'dp_rm': r2(test_labels2, LR_2_predict)})

### Try Lasso

In [None]:
# Lasso regression with sklearn's default settings on the de-duplicated split.
LR_3 = Lasso()
LR_3_fit = LR_3.fit(train_images2, train_labels2)
LR_3_predict = LR_3_fit.predict(test_images2)

In [None]:
regression_results(test_labels2, LR_3_predict)

In [None]:
lasso_list.append({'dp_rm': r2(test_labels2, LR_3_predict)})

### Try DT Regressor

In [None]:
# Decision tree on the de-duplicated split. random_state is pinned (same seed
# as the train/test split) so the fitted tree and its scores are reproducible.
DT2 = DecisionTreeRegressor(random_state=7)
DT2_fit = DT2.fit(train_images2, train_labels2)
DT2_predict = DT2_fit.predict(test_images2)

In [None]:
regression_results(test_labels2, DT2_predict)

In [None]:
DT_list.append({'dp_rm': r2(test_labels2, DT2_predict)})

### Try KNN Regressor

In [None]:
# k-nearest-neighbours regression (k = 5) on the de-duplicated split.
# Prediction goes through the handle returned by fit(), which is the same
# estimator object.
KNR_1 = KNeighborsRegressor(n_neighbors = 5)
KNR_1_fit = KNR_1.fit(train_images2, train_labels2)
KNR_1_predict = KNR_1_fit.predict(test_images2)

In [None]:
regression_results(test_labels2, KNR_1_predict)

In [None]:
KNN_list.append({'dp_rm': r2(test_labels2, KNR_1_predict)})

In [None]:
# Same model as KNR_1 but with k = 7 neighbours; its score goes to KNN_list1.
# Prediction goes through the handle returned by fit(), which is the same
# estimator object.
KNR_2 = KNeighborsRegressor(n_neighbors = 7)
KNR_2_fit = KNR_2.fit(train_images2, train_labels2)
KNR_2_predict = KNR_2_fit.predict(test_images2)

In [None]:
regression_results(test_labels2, KNR_2_predict)

In [None]:
KNN_list1.append({'dp_rm': r2(test_labels2, KNR_2_predict)})

### Try Random Forest Regression

In [None]:
# Small random forest (5 trees) on the de-duplicated split. random_state is
# pinned (same seed as the train/test split) so the scores are reproducible.
RF2 = RandomForestRegressor(n_estimators=5, random_state=7)
RF2_fit = RF2.fit(train_images2, train_labels2)
RF2_predict = RF2_fit.predict(test_images2)

In [None]:
regression_results(test_labels2, RF2_predict)

In [None]:
RF_list.append({'dp_rm': r2(test_labels2, RF2_predict)})

## Using Augmented Data

In [None]:
# Define feature and target columns
# NOTE(review): neither name is read again by the visible cells of this section.
feature_col, target_cols = 'image', list(augmented_data.columns)

# Specify image dimensions
# NOTE(review): width/height/channels are not read by any visible cell.
width  = 96
height = 96
channels = 1

# Fill NA's with mean of column
augmented_data = augmented_data.fillna(augmented_data.mean())

# Create label array
# Every column of augmented_data becomes a target; presumably the frame holds
# label columns only (no 'image' column) - TODO confirm.
aug_train_labels = augmented_data.to_numpy()

# Prepare train-test split
# Pairs these labels with normalized_train_images from the raw-data section.
# NOTE(review): this relies on augmented_data being row-aligned with
# train_data - verify.
train_images3, test_images3, train_labels3, test_labels3 = train_test_split(normalized_train_images, aug_train_labels, test_size=0.1, random_state=7)

In [None]:
# Check shapes of the augmented-label split. The last line previously printed
# test_images3 a second time, so the test labels were never checked.
print(train_images3.shape)
print(test_images3.shape)
print(train_labels3.shape)
print(test_labels3.shape)

In [None]:
augmented_data.shape

In [None]:
train_data.shape

### Try OLS

In [None]:
# Ordinary least squares on the augmented-label split.
LR_1_2 = LinearRegression()
LR_1_2_fit = LR_1_2.fit(train_images3, train_labels3)
LR_1_2_predict = LR_1_2_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, LR_1_2_predict)

In [None]:
OLS_list.append({'reg_m': r2(test_labels3, LR_1_2_predict)})

### Try Ridge

In [None]:
# Ridge regression with sklearn's default settings on the augmented-label split.
LR_2_2 = Ridge()
LR_2_2_fit = LR_2_2.fit(train_images3, train_labels3)
LR_2_2_predict = LR_2_2_fit.predict(test_images3)

In [None]:
# Every other model/dataset pair prints its hold-out metrics before recording
# R squared; do the same for ridge on the augmented-label split.
regression_results(test_labels3, LR_2_2_predict)
ridge_list.append({'reg_m': r2(test_labels3, LR_2_2_predict)})

### Try Lasso

In [None]:
# Lasso regression with sklearn's default settings on the augmented-label split.
LR_3_2 = Lasso()
LR_3_2_fit = LR_3_2.fit(train_images3, train_labels3)
LR_3_2_predict = LR_3_2_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, LR_3_2_predict)

In [None]:
lasso_list.append({'reg_m': r2(test_labels3, LR_3_2_predict)})

In [None]:
lasso_list

### Try DT Regressor

In [None]:
# Decision tree on the augmented-label split. random_state is pinned (same seed
# as the train/test split) so the fitted tree and its scores are reproducible.
DT3 = DecisionTreeRegressor(random_state=7)
DT3_fit = DT3.fit(train_images3, train_labels3)
DT3_predict = DT3_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, DT3_predict)

In [None]:
DT_list.append({'reg_m': r2(test_labels3, DT3_predict)})

### Try KNN Regressor

In [None]:
# k-nearest-neighbours regression (k = 5) on the augmented-label split.
KNR_1_2 = KNeighborsRegressor(n_neighbors = 5)
KNR_1_2_fit = KNR_1_2.fit(train_images3, train_labels3)
KNR_1_2_predict = KNR_1_2_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, KNR_1_2_predict)

In [None]:
KNN_list.append({'reg_m': r2(test_labels3, KNR_1_2_predict)})

In [None]:
# Same model as KNR_1_2 but with k = 7 neighbours; its score goes to KNN_list1.
KNR_1_3 = KNeighborsRegressor(n_neighbors = 7)
KNR_1_3_fit = KNR_1_3.fit(train_images3, train_labels3)
KNR_1_3_predict = KNR_1_3_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, KNR_1_3_predict)

In [None]:
KNN_list1.append({'reg_m': r2(test_labels3, KNR_1_3_predict)})

### Try Random Forest Regression

In [None]:
# Small random forest (5 trees) on the augmented-label split. random_state is
# pinned (same seed as the train/test split) so the scores are reproducible.
RF3 = RandomForestRegressor(n_estimators=5, random_state=7)
RF3_fit = RF3.fit(train_images3, train_labels3)
RF3_predict = RF3_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, RF3_predict)

In [None]:
RF_list.append({'reg_m': r2(test_labels3, RF3_predict)})

## Graph Performance Metrics

In [None]:
# Dump the collected R squared entries, one model per line, in the order:
# OLS, ridge, lasso, decision tree, KNN (k = 5), KNN (k = 7), random forest.
for scores in (OLS_list, ridge_list, lasso_list, DT_list, KNN_list, KNN_list1, RF_list):
    print(scores)

In [None]:
# Chart title -> list of {dataset tag: R squared} entries for that model.
master = {
    "OLS Performance": OLS_list,
    "Ridge Performance": ridge_list,
    "Lasso Performance": lasso_list,
    "DTR Performance": DT_list,
    "KNNR(5) Performance": KNN_list,
    "KNNR(7) Performance": KNN_list1,
    "RFR Performance": RF_list,
}

In [None]:
# Plot some graphs

def plot_performance(master):
    """Draw one bar chart per entry of `master`.

    Parameters
    ----------
    master : dict
        Maps a chart title to a list of dicts. Each inner dict maps x-axis
        labels to bar heights and is drawn with its own `bar` call.
    """
    for title, entries in master.items():
        _, axis = plt.subplots(figsize=(7,4))
        axis.set(title=title, xlabel="Data Type", ylabel="R squared")
        for entry in entries:
            bar_labels = list(entry.keys())
            bar_heights = list(entry.values())
            axis.bar(bar_labels, bar_heights)

    return

In [None]:
plot_performance(master)