# Data Cleaning File

## Setup
Imports, reading in files, etc.

In [1]:
import pandas as pd
import numpy as np
import os
import warnings

from tqdm import tqdm
import zlib

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rc
from matplotlib.ticker import PercentFormatter
import pickle


In [2]:
#Load the pickle files
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# Each file is opened in a `with` block so the handle is closed as soon as it is read.

with open("data/train.p", "rb") as pickle_file:
    train = pickle.load(pickle_file)
with open("data/test.p", "rb") as pickle_file:
    test = pickle.load(pickle_file)

train.rename(columns = {'level_0' : 'index'}, inplace = True)

with open("data/traindup.p", "rb") as pickle_file:
    train_duplicates = pickle.load(pickle_file)
with open("data/testdup.p", "rb") as pickle_file:
    test_duplicates = pickle.load(pickle_file)

# Index the duplicates by their 'index' column (kept as a column too) so rows
# can later be dropped by label.
train_duplicates.set_index('index', inplace=True, drop=False)
print("Test shape is: ", test.shape)
print("Train shape is: ", train.shape)

print("Test duplicates shape is: ", test_duplicates.shape)
print("Train duplicates shape is: ", train_duplicates.shape)


Test shape is:  (1783, 4)
Train shape is:  (7049, 33)
Test duplicates shape is:  (60, 3)
Train duplicates shape is:  (1098, 3)


In [3]:
#HELPER FUNCTIONS to reset the train and test dataframes

def reset_train_df(df=None):
    """Return a re-indexed copy of the train frame with a fresh ``check_sum`` column.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to reset. Must have an ``image`` column whose values are accepted
        by ``zlib.adler32`` and an ``index`` column (so that ``reset_index``
        stores the old index under ``level_0``). Defaults to the notebook-level
        ``train``.

    Returns
    -------
    pandas.DataFrame
        New frame with a 0..n-1 index; the input frame is not modified.
    """
    source = train if df is None else df
    new_train = source.reset_index()
    # Compute the checksum from the re-indexed frame itself. Assigning a Series
    # built from the source frame would align on the *old* index labels, which
    # shifts or blanks checksums whenever rows have been dropped.
    # https://www.geeksforgeeks.org/zlib-adler32-in-python/
    new_train['check_sum'] = new_train['image'].map(zlib.adler32)
    # The previous index is not needed; the 'index' column already identifies rows.
    new_train.pop('level_0')
    return new_train

def reset_test_df(df=None):
    """Return a re-indexed copy of the test frame with a fresh ``check_sum`` column.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to reset. Must have an ``image`` column whose values are accepted
        by ``zlib.adler32`` and an ``index`` column (so that ``reset_index``
        stores the old index under ``level_0``). Defaults to the notebook-level
        ``test``.

    Returns
    -------
    pandas.DataFrame
        New frame with a 0..n-1 index; the input frame is not modified.
    """
    source = test if df is None else df
    new_test = source.reset_index()
    # Compute the checksum from the re-indexed frame so values cannot be
    # misaligned by index-label alignment against the source frame.
    # https://www.geeksforgeeks.org/zlib-adler32-in-python/
    new_test['check_sum'] = new_test['image'].map(zlib.adler32)
    # The previous index is not needed; the 'index' column already identifies rows.
    new_test.pop('level_0')
    return new_test

def get_coordinate_columns():
    """List the columns of the notebook-level ``train`` whose names end in ``_x`` or ``_y``."""
    return [name for name in train.columns if name.endswith(('_x', '_y'))]

In [4]:
#Snapshot of the train data as it is at this point, kept so columns can be added back in from the original frame.
#find_outliers() reads this snapshot rather than `train`.
train_data=train.copy(deep=True)
train_data

Unnamed: 0,index,left_eye_center_x,left_eye_center_y,right_eye_center_x,right_eye_center_y,left_eye_inner_corner_x,left_eye_inner_corner_y,left_eye_outer_corner_x,left_eye_outer_corner_y,right_eye_inner_corner_x,...,mouth_left_corner_x,mouth_left_corner_y,mouth_right_corner_x,mouth_right_corner_y,mouth_center_top_lip_x,mouth_center_top_lip_y,mouth_center_bottom_lip_x,mouth_center_bottom_lip_y,image,check_sum
0,0,66.033562,39.002274,30.227007,36.421677,59.582077,39.647423,73.130348,39.969997,36.356571,...,61.195309,79.970169,28.614496,77.388992,43.312603,72.935455,43.130707,84.485771,"[238, 236, 237, 238, 240, 240, 239, 241, 241, ...",3990298755
1,1,64.332939,34.970078,29.949276,33.448715,58.856171,35.274349,70.722725,36.187164,36.034725,...,56.421448,76.351997,35.122383,76.047661,46.684597,70.266556,45.467915,85.480171,"[219, 215, 204, 196, 204, 211, 212, 200, 180, ...",1359000491
2,2,65.057053,34.909641,30.903790,34.909641,59.411999,36.320969,70.984421,36.320969,37.678104,...,60.822948,73.014313,33.726315,72.732002,47.274948,70.191788,47.274948,78.659370,"[144, 142, 159, 180, 188, 188, 184, 180, 167, ...",3822334647
3,3,65.225739,37.261772,32.023094,37.261772,60.003338,39.127178,72.314713,38.380966,37.618645,...,65.598885,72.703720,37.245495,74.195480,50.303165,70.091690,51.561184,78.268379,"[193, 192, 193, 194, 194, 194, 193, 192, 168, ...",3001797594
4,4,66.725304,39.621262,32.244808,38.042030,58.565891,39.621262,72.515930,39.884468,36.982380,...,60.671410,77.523239,31.191755,76.997299,44.962749,73.707390,44.227142,86.871162,"[147, 148, 160, 196, 215, 214, 216, 217, 219, ...",2161940314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7044,7044,67.402550,31.842550,29.746750,38.632942,,,,,,...,,,,,,,50.426636,79.683922,"[71, 74, 85, 105, 116, 128, 139, 150, 170, 187...",585994373
7045,7045,66.134399,38.365501,30.478626,39.950199,,,,,,...,,,,,,,50.287395,77.983025,"[60, 60, 62, 57, 55, 51, 49, 48, 50, 53, 56, 5...",191806966
7046,7046,66.690735,36.845222,31.666420,39.685043,,,,,,...,,,,,,,49.462570,78.117119,"[74, 74, 74, 78, 79, 79, 79, 81, 77, 78, 80, 7...",3036254932
7047,7047,70.965080,39.853664,30.543285,40.772339,,,,,,...,,,,,,,50.065186,79.586449,"[254, 254, 254, 254, 254, 238, 193, 145, 121, ...",260257816


## Outlier Identification
As we saw in the EDA, there are a variety of types of images with a variety of keypoints. Below, we will remove some of the images we saw as outliers in the EDA.

Outlier types:
- Mislabelled images
- Weird/bad images
- All outliers (i.e. images that contain keypoints that are greater than 3 standard deviations away from the mean for that keypoint)

In [5]:
#CODE CELL FOR JOANIE

#This block is identifying and counting all outliers
#Outliers are images that contain keypoints > 3std from mean
def find_outliers(data=None):
    """Return the index labels of rows with at least one extreme keypoint coordinate.

    A row counts as an outlier when any of its coordinate columns (names ending
    in ``_x`` or ``_y``) lies more than 3 standard deviations from that column's
    mean. Missing coordinates (NaN) never count as outliers.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with an ``image`` column plus the coordinate columns. Defaults to
        the notebook-level ``train_data`` snapshot.

    Returns
    -------
    numpy.ndarray
        Sorted, unique index *labels* of the outlier rows, suitable for
        ``DataFrame.drop(index=...)``.
    """
    source = train_data if data is None else data
    keypoints = source.drop(columns=["image"])

    # Only keypoint coordinates are tested; bookkeeping columns such as 'index'
    # or 'check_sum' are numeric too but are not keypoints.
    coordinate_columns = [c for c in keypoints.columns if c.endswith(("_x", "_y"))]
    coordinates = keypoints[coordinate_columns]

    # Outlier boundaries per column: mean +/- 3 sample standard deviations.
    mean = coordinates.mean()
    std = coordinates.std()
    outlier_low = mean - 3 * std
    outlier_high = mean + 3 * std

    # Comparisons against NaN are False, so missing keypoints are ignored.
    too_low = coordinates.lt(outlier_low, axis="columns")
    too_high = coordinates.gt(outlier_high, axis="columns")
    is_outlier = (too_low | too_high).any(axis="columns").to_numpy()

    # Report index labels (not positions) so callers can drop by label even when
    # the frame's index is not 0..n-1. Each row is counted once.
    outliers = np.unique(keypoints.index[is_outlier])

    n_rows = keypoints.shape[0]
    percent = len(outliers) / n_rows * 100 if n_rows else 0.0
    print("Finding points 3 standard deviations away from the mean results in ",len(outliers),
        "images being classified as outliers")
    print("This represents",percent,"% of our total data")

    print(keypoints.shape)

    return outliers




In [6]:
#CODE CELL FOR JOANIE

#This function is removing the worst outliers
#The worst outliers are the mislabelled images and the weird/bad images
def drop_worst_outliers():
    """Drop the hand-picked worst rows from ``train`` and ``train_duplicates`` in place.

    The rows are identified by index label; labels that are already absent are
    ignored. Shapes are printed before and after the drop.
    """
    # Images with keypoints that are not right
    mislabelled_rows = [1747, 1877, 1907, 2199]
    # The two collages and the two cartoons
    unusable_rows = [6492, 6493, 2430, 3697]
    rows_to_drop = [*mislabelled_rows, *unusable_rows]

    print("Before dropping worst outliers train shape is: ", train.shape)
    print("Before dropping worst outliers train duplicates shape is: ", train_duplicates.shape)

    # Both notebook-level frames are modified in place.
    for frame in (train, train_duplicates):
        frame.drop(index=rows_to_drop, inplace=True, errors='ignore')

    print("After dropping worst outliers train shape is: ", train.shape)
    print("After dropping worst outliers train duplicates shape is: ", train_duplicates.shape)

#This function is for removing all outliers as defined above
def drop_all_outliers():
    """Drop every row reported by ``find_outliers()`` from ``train`` and ``train_duplicates``.

    Both notebook-level frames are modified in place; labels that are not
    present are ignored. The shape of ``train`` is printed before and after.
    """
    print("Before dropping all outliers train shape is: ", train.shape)
    outliers = find_outliers()
    train.drop(index=outliers,inplace=True, errors='ignore')
    # Drop the same outliers from the duplicates table. The previous code used
    # `worst_outliers`, a local of drop_worst_outliers(), which raised NameError.
    train_duplicates.drop(index=outliers,inplace=True,errors='ignore')
    print("After dropping all outliers train shape is: ", train.shape)





## Helper Code for Duplicate Data in Train and Test datasets

In [10]:
#CODE CELL FOR JACKIE
# Remove duplicates in the train dataset by taking the mean of all values for that image in each label 
def remove_train_duplicates(verbose=True):
    """Collapse duplicated train images into one row each and return the new frame.

    For every checksum listed in ``train_duplicates``, the matching train rows
    are replaced by a single row holding the mean of each coordinate column and
    the image of the first matching row.

    Parameters
    ----------
    verbose : bool, default True
        Print the frame shape before the delete, after it, and after the
        merged rows are appended.

    Returns
    -------
    pandas.DataFrame
        Non-duplicated rows followed by the merged rows. The 'index' column is
        dropped; the positional index from the final ``reset_index`` is kept as
        a 'level_0' column. Merged rows carry only coordinates and 'image', so
        their other columns (e.g. 'check_sum') are NaN.
    """
    # Work on a freshly re-indexed copy; the notebook-level frame is untouched.
    train = reset_train_df()

    #Get all of the coordinates
    coordinates = get_coordinate_columns()

    # One single-row frame per duplicated checksum, concatenated once at the end
    # (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
    merged_rows = []
    for check_sum in train_duplicates.check_sum.unique():
        #Get all of the duplicates with the same check_sum
        duplicate_ids = train_duplicates[(train_duplicates.check_sum == check_sum)]['index'].values
        group = train[train['index'].isin(duplicate_ids)]
        if group.empty:
            # Every copy of this image was already removed from train, so
            # there is nothing to merge.
            continue

        #Take the mean of all the coordinate columns - this is what we will use for the final single image
        fixed = pd.DataFrame(group.reindex(columns=coordinates).mean(axis = 0)).T
        #Use the first image that appears in train for this check_sum
        fixed['image'] = [group.image.values[0]]
        merged_rows.append(fixed)

    if merged_rows:
        final_images = pd.concat(merged_rows, ignore_index = True)
    else:
        # No duplicates to merge: empty frame with the coordinate columns.
        final_images = train.iloc[:0][coordinates].copy()

    #For reporting purposes:
    if verbose: print("="*13 + "Train" + "="*13)
    if verbose: print("Before delete:     %s" % str(train.shape))

    #Remove the duplicates from train - they are replaced by the merged rows below
    train = train[~(train['index'].isin(train_duplicates['index'].values))]
    if verbose: print("After  delete:     %s" % str(train.shape))

    #Replace removed duplicates with final_images
    if final_images.empty:
        train = train.reset_index(drop = True)
    else:
        train = pd.concat([train, final_images], ignore_index = True)
    # 'index' already exists as a column, so reset_index stores the positional
    # index under 'level_0'; the stale 'index' column is then removed.
    train = train.reset_index()
    train = train.drop(columns=['index'])
    if verbose: print("After  append:     %s" % str(train.shape))
    return train



In [7]:
#CODE CELL FOR JACKIE
##########Test Data set

#Now do the same for test, this will be easier since we don't need
#to deal with points and taking the mean
def remove_test_duplicates(verbose=True):
    """Return the test frame with only the first row kept for each image checksum.

    No keypoints need averaging for test data, so a duplicate is simply any
    later row whose ``check_sum`` has already been seen.

    Parameters
    ----------
    verbose : bool, default True
        Print the frame shape before and after the duplicates are removed.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows in their original order with a fresh 0..n-1 index.
    """
    test = reset_test_df()
    if verbose: print("="*13 + "Test=" + "="*13)
    if verbose: print("Before delete:     %s" % str(test.shape))

    # Keep the first occurrence of every checksum. This replaces a row-by-row
    # DataFrame.append loop (removed in pandas 2.0, quadratic in the row count).
    final_test_images = test.drop_duplicates(subset='check_sum', keep='first').reset_index(drop=True)

    if verbose: print("After  delete:     %s" % str(final_test_images.shape))
    return final_test_images


## Remove worst outliers

In [8]:
#CODE CELL FOR JOANIE
#only drop the worst outliers for now
#(drop_all_outliers() is defined above but is not called in this cell)
drop_worst_outliers()

Before dropping worst outliers train shape is:  (7049, 33)
Before dropping worst outliers train duplicates shape is:  (1098, 3)
After dropping worst outliers train shape is:  (7041, 33)
After dropping worst outliers train duplicates shape is:  (1095, 3)


## Remove duplicates

In [11]:
#CODE CELL FOR JACKIE
#Rebinds the notebook-level `train` and `test` to the de-duplicated frames returned by the helpers.
print("Applying EDA fix for duplicates")
print()
train = remove_train_duplicates()
print()
test = remove_test_duplicates()

Applying EDA fix for duplicates

Before delete:     (7041, 33)
After  delete:     (5946, 33)
After  append:     (6488, 33)

Before delete:     (1783, 4)
After  delete:     (1752, 4)


## Save Clean Data to a Pickle file

In [12]:
#CODE CELL FOR JACKIE
print(train.shape)
print(test.shape)
#Pickle train and test so that later work can start from the cleaned data.
#Each file is written inside a `with` block so it is flushed and closed right away.
with open("data/clean_train.p", "wb") as pickle_file:
    pickle.dump(train, pickle_file)
with open("data/clean_test.p", "wb") as pickle_file:
    pickle.dump(test, pickle_file)


(6488, 33)
(1752, 4)
