In [1]:
import pandas as pd
import numpy as np
import os
import warnings
import sys

from tqdm import tqdm
import zlib

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rc
from matplotlib.ticker import PercentFormatter

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go

from sklearn.linear_model import LinearRegression

# Session-wide configuration for the notebook.
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Seed NumPy's global random generator so stochastic steps are repeatable.
np.random.seed(42)
# Raise the summarisation threshold to the maximum so printed arrays are never truncated.
np.set_printoptions(threshold=sys.maxsize)

In [3]:
def load_data(data_path):
    """Load the four CSV files of the data set into a dict of DataFrames.

    Each file is read with explicit column names and dtypes, and its own
    header row is skipped. Any frame that has an ``image`` column gets that
    column converted from a space-separated string of integers into a NumPy
    integer array per row.

    Parameters
    ----------
    data_path : str or os.PathLike
        Location of the CSV files. If it names an existing directory the file
        names are joined to it with the platform separator (a trailing
        separator is optional). Otherwise it is used as a raw prefix and the
        file name is appended to it unchanged.

    Returns
    -------
    dict
        Maps ``'id_lookup'``, ``'sample_submission'``, ``'test'`` and
        ``'train'`` to the corresponding DataFrame.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when a file does not exist.
    ValueError
        Propagated from ``int`` when an image string holds a token that is not
        an integer (for example an empty token caused by a double space).
    """
    # The training file stores an x and a y column for each of these keypoints,
    # in exactly this order, followed by the image column.
    keypoints = ['left_eye_center', 'right_eye_center',
                 'left_eye_inner_corner', 'left_eye_outer_corner',
                 'right_eye_inner_corner', 'right_eye_outer_corner',
                 'left_eyebrow_inner_end', 'left_eyebrow_outer_end',
                 'right_eyebrow_inner_end', 'right_eyebrow_outer_end',
                 'nose_tip', 'mouth_left_corner', 'mouth_right_corner',
                 'mouth_center_top_lip', 'mouth_center_bottom_lip']
    coord_cols = ['%s_%s' % (k, axis) for k in keypoints for axis in ('x', 'y')]

    # One entry per file: (file name, key in the result, column names, dtypes).
    schema = [
        ('IdLookupTable.csv', 'id_lookup',
         ['row_id', 'image_id', 'feature_name', 'location'],
         {'row_id': 'uint16', 'image_id': 'uint16', 'location': 'float32'}),
        ('SampleSubmission.csv', 'sample_submission',
         ['row_id', 'location'],
         {'row_id': 'uint16', 'location': 'float32'}),
        ('test.csv', 'test',
         ['image_id', 'image'],
         {'image_id': 'uint16', 'image': 'object'}),
        ('training.csv', 'train',
         coord_cols + ['image'],
         dict([(c, 'float32') for c in coord_cols] + [('image', 'object')])),
    ]

    df = {}
    for f, l, n, t in schema:
        if os.path.isdir(data_path):
            # A real directory: join properly so a missing trailing separator is fine.
            path = os.path.join(data_path, f)
        else:
            # Not a directory: keep the historical "prefix + file name" behaviour.
            path = "".join((os.fspath(data_path), f))
        print("Loading file '%s'..." % path)
        df[l] = pd.read_csv(path, names=n, dtype=t, skiprows=1)
        if "image" in df[l]:
            print("\tProcessing %d images..." % df[l].shape[0])
            # "12 34 56" -> array([12, 34, 56])
            df[l]['image'] = df[l]["image"].map(lambda x: np.array(list(map(int, x.split(" ")))))
        print("\tDone!  shape:", df[l].shape, "\n")

    print("All data loaded in to dataframe 'df'.")
    return df  # keys: 'id_lookup', 'sample_submission', 'test', 'train'


In [None]:
def flag_extreme(df_img):
    """Flag the rows that hold the minimum or maximum of any coordinate column.

    Every column whose name ends in ``_x`` or ``_y`` is scanned. The first row
    (in frame order) holding that column's smallest value and the first row
    holding its largest value are flagged; NaN entries are ignored. Columns
    with no non-NaN value are skipped.

    Parameters
    ----------
    df_img : pandas.DataFrame
        Frame with numeric ``*_x`` / ``*_y`` columns. It may have any length
        and any index; rows are addressed by position.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df_img``, modified in place, with a float column
        ``is_extreme_image`` holding 1.0 for flagged rows and 0.0 otherwise.
    """
    coord_cols = [c for c in df_img.columns if c.endswith(('_x', '_y'))]

    # first preprocessing flag: one slot per row, sized from the frame itself
    flags = np.zeros(len(df_img))
    for c in coord_cols:
        values = df_img[c].to_numpy(dtype=float)
        if np.isnan(values).all():
            # No recorded value (or an empty frame): there is no extreme to flag.
            continue
        # nanargmin / nanargmax give the position of the first occurrence, so
        # tied extremes resolve to the earliest row.
        flags[np.nanargmin(values)] = 1
        flags[np.nanargmax(values)] = 1

    df_img['is_extreme_image'] = flags
    return df_img

In [4]:
def flag_missing_pixel(df):
    """Attach the ``is_missing_pixel`` column to ``df`` and return it.

    This is the second preprocessing flag. The image data is not inspected
    here: every row receives 0.0. The frame is modified in place and the
    same object is returned.
    """
    row_count = len(df.index)
    df['is_missing_pixel'] = np.zeros(row_count)
    return df

In [5]:
def flag_missing_keypoints(df):
    """Count, per row, how many coordinate values are missing.

    This is the third preprocessing flag. Every column whose name ends in
    ``_x`` or ``_y`` takes part; each null cell counts once, so a keypoint
    with both coordinates missing contributes 2.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``*_x`` / ``*_y`` columns. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, modified in place, with an integer column
        ``num_missing_keypoints``.
    """
    coord_cols = [c for c in df.columns if c.endswith(('_x', '_y'))]
    # The row-wise sum keeps df's own index, so the assignment lines up
    # whatever the index labels are.
    df['num_missing_keypoints'] = df[coord_cols].isnull().sum(axis=1)
    return df

In [6]:
def flag_duplicate(df):
    """Mark rows whose image repeats the image of an earlier row.

    This is the fourth preprocessing flag. Images are compared through the
    Adler-32 checksum of their array buffer. The first row carrying a given
    checksum gets ``is_duplicate == 0``; every later row with the same
    checksum gets 1. A one-line summary of the duplicate groups is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``image`` column of NumPy arrays. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and an integer ``is_duplicate``
        column. Any ``index`` column is dropped from the result.

    Notes
    -----
    When the input already contains an ``index`` column, ``reset_index``
    stores the previous index under ``level_0`` and that column remains in
    the result. NOTE(review): kept only to preserve the previous return
    shape; confirm whether callers actually want it.
    """
    # reset_index returns a new frame, so the caller's frame is never touched.
    out = df.reset_index()
    hashes = out['image'].map(zlib.adler32)

    # Rows belonging to any checksum group of size > 1 (first occurrence included).
    in_dup_group = hashes.duplicated(keep=False)
    print("TRAIN has %d duplicate rows (%d unique images)" % (in_dup_group.sum(), hashes[in_dup_group].nunique()))

    # Only the repeats are flagged; the first occurrence of each image stays 0.
    out['is_duplicate'] = hashes.duplicated(keep='first').astype('int64')
    return out.drop(columns=['index', 'hash_image'], errors='ignore')