# Initial Models

In [None]:
# Import the necessary packages

import zipfile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Import SKLearn packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import sklearn.metrics as metrics

In [None]:
# Extract data

IdLookupTable = pd.read_csv('/home/jupyter/blackboxes/data/IdLookupTable.csv', header=0, sep=',', quotechar='"')

SampleSubmission = pd.read_csv('/home/jupyter/blackboxes/data/SampleSubmission.csv', header=0, sep=',', quotechar='"')

# Load pickles
# NOTE: unpickling runs arbitrary code, so only load files produced by this project.
# The context managers close the file handles that bare open() calls would leak.

with open('/home/jupyter/blackboxes/data/train.p', 'rb') as train_file:
    train_data = pickle.load(train_file)

with open('/home/jupyter/blackboxes/data/test.p', 'rb') as test_file:
    test_data = pickle.load(test_file)


In [None]:
# Drop the extra 'index' and 'check_sum' columns from both frames

train_data = train_data.drop(columns=['index', 'check_sum'])

test_data = test_data.drop(columns=['index', 'check_sum'])

In [None]:
# Load the "dup" pickles; the context managers close the files that bare open() calls leaked.
# NOTE: unpickling runs arbitrary code, so only load files produced by this project.
# NOTE(review): dup_train / dup_test are not referenced again in the visible notebook, while the
# "Removed Duplicates" section reads train_rm_dup / test_rm_dup - confirm whether those names
# were meant to be bound to these frames.
with open('/home/jupyter/blackboxes/data/traindup.p', 'rb') as dup_train_file:
    dup_train = pickle.load(dup_train_file)
with open('/home/jupyter/blackboxes/data/testdup.p', 'rb') as dup_test_file:
    dup_test = pickle.load(dup_test_file)

## Function to plot images and keypoints

In [None]:
def plot_img(data, indexes, columns=5, points=1):
    """Plot a grid of 96x96 grayscale images from ``data``, optionally with keypoints.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an 'image' column whose entries can be reshaped to
        (96, 96). Every other column is read as a keypoint coordinate, with
        consecutive columns forming (x, y) pairs.
    indexes : sized iterable
        Row labels (looked up with ``data.loc``) of the images to draw.
    columns : int
        Number of images per grid row.
    points : int
        1 overlays the keypoints in red; any other value draws images only.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Determine size of image array
    plt.figure(figsize=(15, 10))
    # Ceiling division: an exact multiple of `columns` must not add a blank row.
    rows = (len(indexes) + columns - 1) // columns

    for position, label in enumerate(indexes, start=1):
        image_array = data.loc[label, 'image'].reshape(96, 96)

        # Plot figure matrix
        plt.subplot(rows, columns, position)
        plt.title('Training Sample: {}'.format(position))
        plt.axis('off')
        plt.imshow(image_array, cmap='gray')

        # Optional keypoints, read from the frame that was passed in (not a
        # notebook global) and only touched when they were requested.
        if points == 1:
            keypoints = data.loc[label].drop('image').values.astype(float).reshape(-1, 2)
            plt.scatter(keypoints[:, 0], keypoints[:, 1], s=10, marker='.', c='red')

    # One layout pass after all axes exist is enough.
    if len(indexes):
        plt.tight_layout()
    plt.show()

    return

In [None]:
plot_img(train_data, range(20))

## Let's look at some of the images with missing data points:

In [None]:
# Draw up to 10 *distinct* rows that have at least one missing value; sampling with
# replacement (the np.random.choice default) could show the same image more than once.
rows_with_missing = train_data[train_data.isnull().any(axis=1)].index
indexes = np.random.choice(rows_with_missing, min(10, len(rows_with_missing)), replace=False)
plot_img(train_data, indexes)

Not only are there missing data points but there are also blurred images. 

It also seems that some of these samples are missing only a subset of their data points, and that some of the images are not of real people.

## Test out some simple models for performance

In [None]:
def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, same shape as ``y_true``.

    Returns
    -------
    None
        The metrics are printed, each rounded to 4 decimals.
    """
    # Regression metrics (short local names avoid shadowing the sklearn functions)
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    medae = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mae,4))
    # The median absolute error was computed but never reported before.
    print('MedAE: ', round(medae,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [None]:
def new_plot_img(images, labels, indexes, columns=5, points=1):
    """Plot a grid of 96x96 grayscale images taken from an array, optionally with keypoints.

    Parameters
    ----------
    images : indexable of arrays
        ``images[i]`` must be reshapeable to (96, 96).
    labels : indexable of arrays
        ``labels[i]`` is reshaped to (-1, 2) and drawn as (x, y) points.
        Only read when ``points == 1``.
    indexes : sized iterable
        Positions into ``images`` / ``labels`` to draw.
    columns : int
        Number of images per grid row.
    points : int
        1 overlays the keypoints in red; any other value draws images only.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Determine size of image array
    plt.figure(figsize=(15, 10))
    # Ceiling division: an exact multiple of `columns` must not add a blank row.
    rows = (len(indexes) + columns - 1) // columns
    show_points = points == 1

    for position, sample in enumerate(indexes, start=1):
        # Plot figure matrix
        plt.subplot(rows, columns, position)
        plt.title('Training Sample: {}'.format(position))
        plt.axis('off')
        plt.imshow(images[sample].reshape(96, 96), cmap='gray')

        # Optional keypoints
        if show_points:
            keypoints = labels[sample].reshape(-1, 2)
            plt.scatter(keypoints[:, 0], keypoints[:, 1], s=10, marker='.', c='red')

    # One layout pass after all axes exist is enough.
    if len(indexes):
        plt.tight_layout()
    plt.show()

    return

In [None]:
def new_random_img(images, labels, indexes, columns = 5, points=1):
    """Plot randomly chosen images (and optionally their keypoints) in a grid.

    Parameters
    ----------
    images : sized indexable of arrays
        ``images[i]`` must be reshapeable to (96, 96).
    labels : indexable of arrays
        Keypoints matching ``images``; only read when ``points == 1``.
    indexes : sized
        Only its length is used: it sets how many random samples are drawn.
        The samples are drawn with replacement from ``range(len(images))``.
    columns : int
        Number of images per grid row.
    points : int
        1 overlays the keypoints in red; any other value draws images only.

    Returns
    -------
    None
        The figure is shown by ``new_plot_img``.
    """
    rand_list = np.random.randint(len(images), size=len(indexes))

    # The drawing logic is identical to new_plot_img, so reuse it rather than
    # keeping a second copy in sync.
    new_plot_img(images, labels, rand_list, columns=columns, points=points)

    return

## Prep Data

In [None]:
# Define feature and target columns: 'image' is the input, every other column is a target
feature_col, target_cols = 'image', list(train_data.drop('image', axis = 1).columns)

# Fill NA's with mean of column
# NOTE(review): this overwrites train_data in place, so no NaNs remain in the target columns
# afterwards. The later call augment_missing_data(train_data) therefore receives already
# mean-filled labels - confirm whether it was meant to run on the un-filled data.
train_data[target_cols] = train_data[target_cols].fillna(train_data[target_cols].mean())

# Specify image dimensions
# (not referenced again in the visible notebook)
width  = 96
height = 96
channels = 1

# Create image array in numpy (reshaped)
# One row per sample, built from the arrays stored in the 'image' column
train_images = np.array(train_data[feature_col].tolist(), dtype = 'float')
train_labels = train_data[target_cols].to_numpy()

# (Optional) Normalize?
# Divide by 255; presumably the pixel values are 0-255 so this maps them to [0, 1] - TODO confirm
normalized_train_images = train_images/255

# Prepare train-test split
# 90/10 split with a fixed seed; train_images / train_labels are rebound to the training part
train_images, test_images, train_labels, test_labels = train_test_split(normalized_train_images, train_labels, test_size=0.1, random_state=7)

In [None]:
print("Size of training dataset: {}".format(len(train_images)))
print("Size of testing dataset: {}".format(len(test_images)))

In [None]:
print(train_images.shape)
print(train_labels.shape)
print(test_images.shape)
print(test_labels.shape)

## Multiple Linear Regression

### Try OLS

In [None]:
# Ordinary least squares baseline: fit on the training split, then predict the held-out split.
LR1 = LinearRegression()
LR1_fit = LR1.fit(train_images, train_labels)
LR1_predict = LR1_fit.predict(test_images)

In [None]:
regression_results(test_labels, LR1_predict)

### Try Ridge

In [None]:
# Ridge regression with scikit-learn's default settings, same split as the OLS baseline.
LR2 = Ridge()
LR2_fit = LR2.fit(train_images, train_labels)
LR2_predict = LR2_fit.predict(test_images)

In [None]:
regression_results(test_labels, LR2_predict)

### Try Lasso

In [None]:
# Lasso regression with scikit-learn's default settings, same split as the OLS baseline.
LR3 = Lasso()
LR3_fit = LR3.fit(train_images, train_labels)
LR3_predict = LR3_fit.predict(test_images)

In [None]:
regression_results(test_labels, LR3_predict)

### Try DT Regressor

In [None]:
# Decision tree regressor with default settings.
# NOTE(review): no random_state is passed, so results may not be exactly reproducible across runs.
DT1 = DecisionTreeRegressor()
DT1_fit = DT1.fit(train_images, train_labels)
DT1_predict = DT1_fit.predict(test_images)

In [None]:
regression_results(test_labels, DT1_predict)

### Try KNN Regressor

In [None]:
# k-nearest-neighbours regressor, k = 5.
KNR1 = KNeighborsRegressor(n_neighbors = 5)
KNR1_fit = KNR1.fit(train_images, train_labels)
KNR1_predict = KNR1_fit.predict(test_images)

In [None]:
regression_results(test_labels, KNR1_predict)

In [None]:
# k-nearest-neighbours regressor, k = 6.
KNR2 = KNeighborsRegressor(n_neighbors = 6)
KNR2_fit = KNR2.fit(train_images, train_labels)
KNR2_predict = KNR2_fit.predict(test_images)

In [None]:
regression_results(test_labels, KNR2_predict)

In [None]:
# k-nearest-neighbours regressor, k = 7.
KNR3 = KNeighborsRegressor(n_neighbors = 7)
KNR3_fit = KNR3.fit(train_images, train_labels)
KNR3_predict = KNR3_fit.predict(test_images)

In [None]:
regression_results(test_labels, KNR3_predict)

## MLP Regression

In [None]:
# Multi-layer perceptron with a single hidden layer of 50 ReLU units, trained with Adam.
# NOTE(review): no random_state is passed, so weights and results will differ between runs.
MLPR1 = MLPRegressor(hidden_layer_sizes = 50, activation = 'relu', solver = 'adam', alpha = 0.001, batch_size = 'auto')
MLPR1_fit = MLPR1.fit(train_images, train_labels)
MLPR1_predict = MLPR1_fit.predict(test_images)

In [None]:
regression_results(test_labels, MLPR1_predict)

## Try using Dataset with Removed Duplicates

### Prep Data

In [None]:
# Check dimensions of new DFs
# NOTE(review): train_rm_dup / test_rm_dup are not assigned anywhere in the visible notebook
# (the pickles loaded earlier are bound to dup_train / dup_test) - confirm where they come from.

print(train_rm_dup.shape)
print(test_rm_dup.shape)

In [None]:
# Drop the additional columns (axis = 1), modifying both frames in place.
# Note the train frame drops 'level_0' while the test frame drops 'index'.

train_rm_dup.drop(['level_0', 'check_sum'], axis = 1, inplace=True)
test_rm_dup.drop(['index', 'check_sum'], axis = 1, inplace=True)

In [None]:
# Check dimensions of new DFs with additional columns removed

print(train_rm_dup.shape)
print(test_rm_dup.shape)

### Try OLS

In [None]:
# Ordinary least squares on the duplicate-removed split.
# NOTE(review): train_images2 / train_labels2 / test_images2 / test_labels2 are not created in
# the visible notebook - a prep/split cell for the de-duplicated data appears to be missing.
LR_1 = LinearRegression()
LR_1_fit = LR_1.fit(train_images2, train_labels2)
LR_1_predict = LR_1.predict(test_images2)

In [None]:
regression_results(test_labels2, LR_1_predict)

### Try Ridge

In [None]:
# Ridge regression (default settings) on the duplicate-removed split.
LR_2 = Ridge()
LR_2_fit = LR_2.fit(train_images2, train_labels2)
LR_2_predict = LR_2.predict(test_images2)

In [None]:
regression_results(test_labels2, LR_2_predict)

### Try Lasso

In [None]:
# Lasso regression (default settings) on the duplicate-removed split.
# The estimator is bound to LR_3; the previous calls referenced an undefined name `L3`.
LR_3 = Lasso()
LR_3_fit = LR_3.fit(train_images2, train_labels2)
LR_3_predict = LR_3.predict(test_images2)

In [None]:
regression_results(test_labels2, LR_3_predict)

### Try DT Regressor

In [None]:
# Decision tree regressor (default settings) on the duplicate-removed split.
# NOTE(review): no random_state is passed, so results may not be exactly reproducible across runs.
DT2 = DecisionTreeRegressor()
DT2_fit = DT2.fit(train_images2, train_labels2)
DT2_predict = DT2.predict(test_images2)

In [None]:
regression_results(test_labels2, DT2_predict)

### Try KNN Regressor

In [None]:
# k-nearest-neighbours regressor, k = 5, on the duplicate-removed split.
KNR_1 = KNeighborsRegressor(n_neighbors = 5)
KNR_1_fit = KNR_1.fit(train_images2, train_labels2)
KNR_1_predict = KNR_1.predict(test_images2)

In [None]:
regression_results(test_labels2, KNR_1_predict)

In [None]:
# k-nearest-neighbours regressor, k = 7, on the duplicate-removed split.
KNR_2 = KNeighborsRegressor(n_neighbors = 7)
KNR_2_fit = KNR_2.fit(train_images2, train_labels2)
KNR_2_predict = KNR_2.predict(test_images2)

In [None]:
regression_results(test_labels2, KNR_2_predict)

## Try Using Rakesh's Method

In [None]:
# Fetch the most significantly-correlated features for each feature in the `data_under_cleansing` set.
def get_feature_correlations(data_under_cleansing):
    """Map each numeric feature to the features it correlates with strongly.

    Parameters
    ----------
    data_under_cleansing : pandas.DataFrame
        Frame whose numeric columns are compared pairwise. Non-numeric
        columns (for example a column of image arrays) are ignored.

    Returns
    -------
    dict
        ``{feature: Series}`` where the Series is indexed by the other
        features whose correlation with ``feature`` is strictly between
        0.5 and 1, and holds those correlation values. The upper bound
        excludes the feature's correlation with itself.
    """
    # Restrict to numeric columns explicitly: recent pandas versions no longer
    # drop non-numeric columns silently inside corr() and raise instead.
    numeric_data = data_under_cleansing.select_dtypes(include=['number', 'bool'])
    correlations = numeric_data.corr()
    max_correlations = correlations[(correlations > 0.5) & (correlations < 1)]
    return {
        column: max_correlations[column].dropna()
        for column in max_correlations
    }

# In the data set `data_under_cleansing`, this method flags the features that have fewer than
# `max_missing` (default 50) missing data-values.
# returns a bool-mask representing : <feature> :: <bool? is data dense>
def get_data_density_mask(data_under_cleansing, max_missing=50):
    """Flag each column as dense (few missing values) or not.

    Parameters
    ----------
    data_under_cleansing : pandas.DataFrame
        Frame whose columns are inspected.
    max_missing : int, optional
        A column is dense when its missing-value count is strictly below
        this number. Defaults to 50.

    Returns
    -------
    dict
        ``{column: bool}`` with True for dense columns.
    """
    # Count the NaNs of every column in one vectorised pass.
    missing_counts = data_under_cleansing.isna().sum()
    return {
        feature: bool(missing_count < max_missing)
        for feature, missing_count in missing_counts.items()
    }

def do_augment_missing_data(data_under_cleansing, density_mask):
    """Run one pass that fills sparse features from their dense, correlated features.

    For every feature flagged as not dense in ``density_mask``, a
    ``LinearRegression`` is fitted on the rows where the feature and all of
    its dense, highly-correlated predictors are present. The model is then
    used to predict the feature on the rows where only the predictors are
    present, and those predictions are written into the frame.

    Parameters
    ----------
    data_under_cleansing : pandas.DataFrame
        Frame to fill. It is modified in place.
    density_mask : dict
        ``{feature: bool}`` as returned by ``get_data_density_mask``.

    Returns
    -------
    pandas.DataFrame
        The same ``data_under_cleansing`` object, after the pass.
    """
    feat_corrs = get_feature_correlations(data_under_cleansing)
    print("Complete Features: ", len([key for key in density_mask.keys() if density_mask[key]]))
    # Sparse features that have correlation data; features without any (e.g.
    # non-numeric columns) cannot be modelled and are skipped.
    features_to_augment = [
        feature for feature, is_dense in density_mask.items()
        if not is_dense and feature in feat_corrs
    ]
    for feat_to_be_augmented in features_to_augment:
        feature_data = feat_corrs[feat_to_be_augmented]
        high_corr_full_features = [feat for feat in feature_data.index.tolist() if density_mask[feat]]
        if len(high_corr_full_features) < 2:
            # a feature_threshold to identify how many features are to be used to model
            # feature being augmented. Minimum is 2.
            continue
        print("\nfeat ..", feat_to_be_augmented)
        print("corr ..", high_corr_full_features)

        # Boolean masks instead of query strings: no dependence on the column
        # names being valid Python identifiers.
        predictors_present = data_under_cleansing[high_corr_full_features].notna().all(axis=1)
        target_present = data_under_cleansing[feat_to_be_augmented].notna()
        train_mask = predictors_present & target_present
        fill_mask = predictors_present & ~target_present
        if not train_mask.any() or not fill_mask.any():
            # Nothing to learn from, or nothing that could be filled.
            continue

        tmp_train_X = data_under_cleansing.loc[train_mask, high_corr_full_features]
        tmp_train_y = data_under_cleansing.loc[train_mask, feat_to_be_augmented]
        tmp_predict_X = data_under_cleansing.loc[fill_mask, high_corr_full_features]

        lm = LinearRegression().fit(tmp_train_X, tmp_train_y)
        model_score = lm.score(tmp_train_X, tmp_train_y)
        print("Model score: ", model_score)
        if model_score < 0.45:
            # do not use a model to augment data when its score is below 0.45. Shifting this threshold to 0.5 leads to NON-CONVERGENCE
            print("aborting augmenting..")
            continue
        print("Model coef: " , lm.coef_)

        # Single label-based assignment. The previous chained
        # `iloc[index][column] = value` wrote to a temporary row copy (and mixed
        # index labels with positions), so the frame was never updated.
        data_under_cleansing.loc[fill_mask, feat_to_be_augmented] = lm.predict(tmp_predict_X)
    return data_under_cleansing

def augment_missing_data(given_dataset):
    """Repeatedly fill missing values until every feature is dense or no progress is made.

    Parameters
    ----------
    given_dataset : pandas.DataFrame
        Source frame. It is not modified; a copy is cleansed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``given_dataset`` without a column named 'Image', with the
        missing values that could be modelled filled in.

    Notes
    -----
    The 'Image' filter is kept exactly as written. The frames in this
    notebook name that column 'image' (lower case), so it passes through
    this filter and the calling cell drops it afterwards.
    """
    data_to_be_cleansed = given_dataset.loc[:, given_dataset.columns != 'Image'].copy()
    while True:
        print("\n\n==========================================================")
        data_density_mask = get_data_density_mask(data_to_be_cleansed)
        incomplete_features = [key for key, is_dense in data_density_mask.items() if not is_dense]
        print("Incomplete Features: ", len(incomplete_features))
        if not incomplete_features:
            break

        missing_before = int(data_to_be_cleansed.isna().sum().sum())
        data_to_be_cleansed = do_augment_missing_data(data_to_be_cleansed, data_density_mask)
        missing_after = int(data_to_be_cleansed.isna().sum().sum())
        if missing_after >= missing_before:
            # A pass that filled nothing will fill nothing next time either;
            # stop instead of looping forever.
            print("No further values could be augmented; stopping.")
            break
    return data_to_be_cleansed

In [None]:
# Transform data using Rakesh method
# NOTE(review): the Prep Data cell already replaced every NaN in train_data's target columns
# with the column mean, so when the cells run in order there is nothing left to augment here
# and the labels come back unchanged - confirm whether un-filled data should be passed instead.
augment_labels = augment_missing_data(train_data)

In [None]:
augment_labels = np.array(augment_labels.drop('image', axis=1))

In [None]:
# Prepare train-test split for new data
# Same images, test_size and random_state as the first split; only the labels differ.
train_images3, test_images3, train_labels3, test_labels3 = train_test_split(normalized_train_images, augment_labels, test_size=0.1, random_state=7)

In [None]:
# Shapes of the augmented-label split; the last line printed test_images3 a second time
# instead of the test labels.
print(train_images3.shape)
print(test_images3.shape)
print(train_labels3.shape)
print(test_labels3.shape)

In [None]:
# Shapes of the original split for comparison; the last line printed test_images a second
# time instead of the test labels.
print(train_images.shape)
print(test_images.shape)
print(train_labels.shape)
print(test_labels.shape)

### Try OLS

In [None]:
# Ordinary least squares on the split built from augment_labels.
LR_1_2 = LinearRegression()
LR_1_2_fit = LR_1_2.fit(train_images3, train_labels3)
LR_1_2_predict = LR_1_2_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, LR_1_2_predict)

### Try Ridge

In [None]:
# Ridge regression (default settings) on the split built from augment_labels.
LR_2_2 = Ridge()
LR_2_2_fit = LR_2_2.fit(train_images3, train_labels3)
LR_2_2_predict = LR_2_2_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, LR_2_2_predict)

### Try Lasso

In [None]:
# Lasso regression (default settings) on the split built from augment_labels.
LR_3_2 = Lasso()
LR_3_2_fit = LR_3_2.fit(train_images3, train_labels3)
LR_3_2_predict = LR_3_2_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, LR_3_2_predict)

### Try DT Regressor

In [None]:
# Decision tree regressor (default settings) on the split built from augment_labels.
# NOTE(review): no random_state is passed, so results may not be exactly reproducible across runs.
DT3 = DecisionTreeRegressor()
DT3_fit = DT3.fit(train_images3, train_labels3)
DT3_predict = DT3_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, DT3_predict)

### Try KNN Regressor

In [None]:
# k-nearest-neighbours regressor, k = 5, on the split built from augment_labels.
KNR_1_2 = KNeighborsRegressor(n_neighbors = 5)
KNR_1_2_fit = KNR_1_2.fit(train_images3, train_labels3)
KNR_1_2_predict = KNR_1_2_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, KNR_1_2_predict)

In [None]:
# k-nearest-neighbours regressor, k = 7, on the split built from augment_labels.
KNR_1_3 = KNeighborsRegressor(n_neighbors = 7)
KNR_1_3_fit = KNR_1_3.fit(train_images3, train_labels3)
KNR_1_3_predict = KNR_1_3_fit.predict(test_images3)

In [None]:
regression_results(test_labels3, KNR_1_3_predict)