# W207 Final Project : Facial Keypoint Detection 
# Team: Joanie Weaver, Sandip Panesar, Jackie Nichols, Rakesh Walisheter
W207 Tuesday @4pm

ref: https://www.kaggle.com/c/facial-keypoints-detection

In [None]:
import pandas as pd
import numpy as np
import os
import warnings

from tqdm import tqdm
import zlib

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rc
from matplotlib.ticker import PercentFormatter
import pickle
from  sklearn.linear_model import LinearRegression


In [None]:
# Load the cleaned training data from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by this project.
with open("data/clean_train.p", "rb") as train_file:
    # The context manager closes the file handle as soon as loading finishes.
    train_data = pickle.load(train_file)

# Rename the 'level_0' column (if present) to 'index'.
train_data.rename(columns={'level_0': 'index'}, inplace=True)


print("Train shape is: ", train_data.shape)



# Potential Approach for missing data
# Filling In Missing Data Using Linear-Models

Notes:

* 'y' : An incomplete feature in the data set which is being augmented

* 'X' : a collection of features which are dense in data and are highly-correlated to 'y'.

* Features with fewer than 50 missing data points are considered `Dense`. Features with no missing data points are considered `full`.

* Augmentation uses at least two reference points that are `significantly correlated` with the feature being augmented; this gives a `triangulation` in the image rather than depending on a single point.

* Correlations of more than 0.5 are considered as `significant correlations`. The corresponding-features are earmarked to be used to triangulate a predicted location.

* Data-points which are full in ('X' and 'y') are used to train the corresponding linear-model.

* Data-points which are full in 'X' but empty in 'y' are augmented by this model.

* This augmentation was made possible by setting the minimum acceptable score (R^2) of the generated linear models to 0.45. With any higher threshold the augmentation does not converge; for a few features, the models that came up scored below 0.5.


In [None]:
#CODE CELL FOR RAKESH



def get_feature_correlations(data_under_cleansing):
    """Map every feature to the features it is significantly correlated with.

    A pairwise correlation is kept only when it is strictly greater than 0.5
    and strictly less than 1, so negative correlations and perfect
    correlations (such as a feature with itself) are left out.

    Returns a dict keyed by column name. Each value is a Series, named after
    that column, holding the retained correlation scores indexed by the
    correlated feature.
    """
    corr_matrix = data_under_cleansing.corr()
    is_significant = (corr_matrix > 0.5) & (corr_matrix < 1)
    # Scores outside the band become NaN here and are dropped per column below.
    significant_only = corr_matrix.where(is_significant)
    return {
        scores.name: scores.dropna()
        for _, scores in significant_only.items()
    }


def get_data_density_mask(data_under_cleansing, dense_threshold=50):
    """Flag which features of `data_under_cleansing` are dense.

    A feature is dense when it has strictly fewer than `dense_threshold`
    missing (NaN) values.

    Parameters:
        data_under_cleansing: DataFrame whose columns are the features.
        dense_threshold: exclusive upper bound on the number of missing
            values a dense feature may have (default 50).

    Returns:
        dict mapping <feature> :: <bool? is data dense>, in column order.
    """
    # Count the missing values of every column in one vectorised pass.
    missing_counts = data_under_cleansing.isna().sum()
    return (missing_counts < dense_threshold).to_dict()

def do_augment_missing_data(data_under_cleansing, density_mask, plot_correlations):
    """Run one augmentation pass over `data_under_cleansing`.

    For every feature that `density_mask` reports as not dense, a linear
    regression is fitted on the dense features that are significantly
    correlated with it (see `get_feature_correlations`). Rows where the
    feature is missing but all of those predictors are present are then
    filled with the model's prediction.

    A feature is skipped in this pass when:
      * fewer than two dense, significantly-correlated predictors exist,
      * there are no rows to train on or no rows to fill, or
      * the fitted model's R^2 on its own training rows is below 0.45.

    Parameters:
        data_under_cleansing: DataFrame to augment; modified in place.
        density_mask: dict of <feature> :: <bool? is data dense>.
        plot_correlations: when true, scatter-plot the feature being
            augmented against each predictor before fitting.

    Returns:
        The same `data_under_cleansing` object, with the filled values.
    """
    feat_corrs = get_feature_correlations(data_under_cleansing)
    print("Complete Features: ", sum(1 for is_dense in density_mask.values() if is_dense))

    # Correlation series of every feature that is reported as not dense.
    all_features_to_augment = [feat_corrs[feature] for feature in density_mask if not density_mask[feature]]

    for feature_data in all_features_to_augment:
        feat_to_be_augmented = feature_data.name

        high_corr_full_features = [feat for feat in feature_data.index.tolist() if density_mask[feat]]
        if len(high_corr_full_features) < 2:
            # At least two predictors are required to model the feature.
            continue

        print("\nfeat ..", feat_to_be_augmented)
        print("corr ..", high_corr_full_features)

        # Boolean row masks replace string-built query() expressions, so
        # column names do not have to be valid Python identifiers.
        predictors_present = data_under_cleansing[high_corr_full_features].notna().all(axis=1)
        target_missing = data_under_cleansing[feat_to_be_augmented].isna()
        train_rows = predictors_present & ~target_missing
        predict_rows = predictors_present & target_missing

        # Training set: predictors and the feature itself are all present.
        tmp_train_data = data_under_cleansing[train_rows]
        tmp_train_X = tmp_train_data[high_corr_full_features]
        tmp_train_y = tmp_train_data[feat_to_be_augmented]

        if plot_correlations:
            print("Plotting y against each X.... \n\n ")
            for x in high_corr_full_features:
                tmp_train_data.plot(x=x, y=feat_to_be_augmented, style='o')
                plt.show()

        # Prediction set: predictors present, feature missing.
        tmp_predict_X = data_under_cleansing.loc[predict_rows, high_corr_full_features]

        if tmp_train_X.empty or tmp_predict_X.empty:
            # Fitting or predicting on zero rows would raise; nothing can be
            # filled for this feature in this pass.
            print("no rows to train on or to fill, skipping..")
            continue

        lm = LinearRegression().fit(tmp_train_X, tmp_train_y)
        model_score = lm.score(tmp_train_X, tmp_train_y)
        print("Model score: ", model_score)
        if model_score < 0.45:
            # do not use a model to augment data when model is less than 45% accurate. Shifting this threshold to 50% leads to NON-CONVERGENCE
            print("aborting augmenting..")
            continue

        print("Model coef: ", lm.coef_)
        # Write every prediction with a single .loc assignment. The previous
        # `iloc[index][column] = value` form was chained assignment, which can
        # write to a temporary copy, and it treated index labels as positions.
        data_under_cleansing.loc[predict_rows, feat_to_be_augmented] = lm.predict(tmp_predict_X)

    return data_under_cleansing


def augment_missing_data(given_dataset, plot_correlations=False):
    '''
    Utility Method which takes a data set of size `n` with `m`-features and augments
    features which are missing using linear-regression models.

    The 'Image' column, if present, is excluded, and the work is done on a copy, so
    `given_dataset` itself is not modified. Augmentation passes are repeated until
    every feature is reported dense by `get_data_density_mask`.

    There is no guarantee that this augmentation process will converge for all
    data-sets. If a pass fills no missing value, a RuntimeWarning is issued and the
    partially augmented data is returned instead of looping forever. It is known to
    work for the train-data from this project.

    Parameters:
        given_dataset: DataFrame to augment.
        plot_correlations: passed to `do_augment_missing_data`; when true, each
            feature being augmented is plotted against its predictors.

    Returns:
        A new DataFrame without the 'Image' column, holding the augmented data.
    '''
    # Explicit copy: the passes below write into this frame.
    data_to_be_cleansed = given_dataset.loc[:, given_dataset.columns != 'Image'].copy()

    previous_missing_total = None
    while True:
        print("\n\n==========================================================")
        data_density_mask = get_data_density_mask(data_to_be_cleansed)
        incomplete_features = [key for key, is_dense in data_density_mask.items() if not is_dense]
        print("Incomplete Features: ", len(incomplete_features))
        if not incomplete_features:
            break

        # If the last pass filled nothing, the data is unchanged and every
        # further pass would give the same result, so stop.
        missing_total = int(data_to_be_cleansed.isna().sum().sum())
        if missing_total == previous_missing_total:
            warnings.warn(
                "Augmentation did not converge; still incomplete features: "
                + ", ".join(str(feature) for feature in incomplete_features),
                RuntimeWarning,
            )
            break
        previous_missing_total = missing_total

        data_to_be_cleansed = do_augment_missing_data(data_to_be_cleansed, data_density_mask, plot_correlations)

    return data_to_be_cleansed

In [None]:
#CODE CELL FOR RAKESH

## Augment the training data, leaving out any column named 'image', and keep the result.
## NOTE(review): augment_missing_data additionally filters a column named 'Image'
## (capitalised) - confirm which spelling the data actually uses.
augmented_data = augment_missing_data(
    given_dataset=train_data.loc[:, train_data.columns != 'image'],
)


## Save Augmented Data to a Pickle file

In [None]:
#CODE CELL FOR JACKIE
print(augmented_data.shape)

# Pickle the augmented training data so that later work can start from it.
# The context manager closes (and flushes) the file once the dump completes.
with open("data/aug_train.p", "wb") as aug_file:
    pickle.dump(augmented_data, aug_file)
