## Imports

In [44]:
import pandas as pd
import numpy as np
import os
import warnings

from tqdm import tqdm
import zlib

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rc
from matplotlib.ticker import PercentFormatter

In [None]:
# Print the pandas version report so the environment this notebook ran in is recorded.
pd.show_versions()

### Load Data For EDA
There are 4 files found in the data directory:

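- IdLookupTable - one row per requested prediction, loaded with the columns `row_id`, `image_id`, `feature_name` and `location`
- SampleSubmission - the submission template, loaded with the columns `row_id` and `location`
- test - the test images, loaded with the columns `image_id` and `image` (space-separated pixel values)
- training - the training images: 30 keypoint coordinate columns (`*_x` / `*_y`) plus the `image` column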

We must load this data in order to perform EDA. 

TBD ADD MORE DETAILS HERE ABOUT GENERAL EDA APPROACH. 

EDA
1. DUPLICATES
2. OUTLIERS
3. MISSING DATA


In [None]:

#https://realpython.com/python-zip-function/#:~:text=%20Using%20the%20Python%20zip%20()%20Function%20for,zip%20()%20function%20works%20differently%20in...%20More

# Load every CSV in the data directory into the `df` dictionary, keyed by a short name.
# Each file is described once as (file name, key in `df`, column names, column dtypes).
df = {}
git_path = 'data/'

test_columns = ['image_id', 'image']
train_columns = [
    'left_eye_center_x', 'left_eye_center_y',
    'right_eye_center_x', 'right_eye_center_y',
    'left_eye_inner_corner_x', 'left_eye_inner_corner_y',
    'left_eye_outer_corner_x', 'left_eye_outer_corner_y',
    'right_eye_inner_corner_x', 'right_eye_inner_corner_y',
    'right_eye_outer_corner_x', 'right_eye_outer_corner_y',
    'left_eyebrow_inner_end_x', 'left_eyebrow_inner_end_y',
    'left_eyebrow_outer_end_x', 'left_eyebrow_outer_end_y',
    'right_eyebrow_inner_end_x', 'right_eyebrow_inner_end_y',
    'right_eyebrow_outer_end_x', 'right_eyebrow_outer_end_y',
    'nose_tip_x', 'nose_tip_y',
    'mouth_left_corner_x', 'mouth_left_corner_y',
    'mouth_right_corner_x', 'mouth_right_corner_y',
    'mouth_center_top_lip_x', 'mouth_center_top_lip_y',
    'mouth_center_bottom_lip_x', 'mouth_center_bottom_lip_y',
    'image',
]
id_lookup_columns = ['row_id', 'image_id', 'feature_name', 'location']
sample_submission_columns = ['row_id', 'location']

# Every training column is a float32 coordinate except the raw image string.
train_dtypes = {
    column: ('object' if column == 'image' else 'float32')
    for column in train_columns
}

file_specs = [
    ('test.csv', 'test', test_columns,
     {'image_id': 'uint16', 'image': 'object'}),
    ('training.csv', 'train', train_columns, train_dtypes),
    ('IdLookupTable.csv', 'id_lookup', id_lookup_columns,
     {'row_id': 'uint16', 'image_id': 'uint16', 'location': 'float32'}),
    ('SampleSubmission.csv', 'sample_submission', sample_submission_columns,
     {'row_id': 'uint16', 'location': 'float32'}),
]

for file_name, file_ref, column_names, column_types in file_specs:
    csv_path = git_path + file_name
    print("Load files.")
    print("Begin loading file '%s' " % csv_path)
    # skiprows=1 drops the header line of the file; our own column names are used instead.
    frame = pd.read_csv(csv_path, names=column_names, dtype=column_types, skiprows=1)
    df[file_ref] = frame

    # Files with an image column store each image as one space-separated string;
    # turn every string into a numpy array of integer pixel values.
    if "image" in frame:
        print("\tFound %d images. Processing. " % frame.shape[0])
        frame['image'] = frame['image'].map(
            lambda pixels: np.array([int(value) for value in pixels.split(" ")])
        )
    print("\tFile", file_ref, " with shape:", frame.shape, " load complete\n")

print("Load files complete")

In [None]:
# Start with image-only frames; later cells rebuild `train` and `test` through the reset helpers.
train = df['train'][['image']]
test = df['test'][['image']]


In [None]:
#HELPER FUNCTIONS to reset the train and test dataframes

def reset_train_df():
    """Return a fresh working copy of the loaded training data.

    The copy comes from ``df['train']`` after ``reset_index()``, so the original
    row labels are available as a column, and it gains a ``check_sum`` column
    holding the Adler-32 checksum of each image
    (https://www.geeksforgeeks.org/zlib-adler32-in-python/).
    """
    fresh = df['train'].reset_index().copy()
    fresh['check_sum'] = fresh['image'].map(zlib.adler32)
    return fresh

def reset_test_df():
    """Return a fresh working copy of the loaded test data.

    The copy comes from ``df['test']`` after ``reset_index()``, so the original
    row labels are available as a column, and it gains a ``check_sum`` column
    holding the Adler-32 checksum of each image
    (https://www.geeksforgeeks.org/zlib-adler32-in-python/).
    """
    fresh = df['test'].reset_index().copy()
    fresh['check_sum'] = fresh['image'].map(zlib.adler32)
    return fresh

def get_coordinate_columns():
    """List the columns of the module-level ``train`` frame whose names end in ``_x`` or ``_y``."""
    return [name for name in train.columns if name.endswith(('_x', '_y'))]

In [None]:
# Check for duplicate images in the train and test datasets.
# Two images are treated as duplicates when their `check_sum` values (added by the
# reset helpers) are equal.

def _duplicate_rows(frame):
    """Return the rows of `frame` whose `check_sum` occurs more than once.

    `frame` must have `index` and `check_sum` columns. The result has the columns
    `check_sum`, `number_found` (how many rows share that checksum) and `index`,
    sorted by `number_found` and then `check_sum`, both descending.
    """
    # Count how many rows share each checksum
    counts = pd.DataFrame(frame.groupby(by='check_sum')['index'].count().sort_values()).reset_index()
    counts.columns = ['check_sum', 'number_found']
    # Keep only the checksums that appear more than once
    counts = counts[counts.number_found > 1]
    # Left join back onto the frame so every duplicated row is listed with its `index`
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
    listed = pd.merge(counts, frame[['index', 'check_sum']], how='left', on=['check_sum'])
    return listed.sort_values(by=['number_found', 'check_sum'], ascending=False)


#Get the reset train df and its duplicates
train = reset_train_df()
train_duplicates = _duplicate_rows(train)

#Now do the same for test
test = reset_test_df()
test_duplicates = _duplicate_rows(test)

# Totals use len() (number of rows = number of images); DataFrame.size would count
# rows x columns and overstate the number of images.
print("EDA on duplicate data in train and test datasets: ")
print("The train dataset has %d unique images out of the %d duplicate images from the total of %d images" % (train_duplicates.check_sum.nunique(), len(train_duplicates), len(train)))
print("The test dataset has %d unique images out of %d duplicate images from the total of %d images" % (test_duplicates.check_sum.nunique(), len(test_duplicates), len(test)))

#Clean up:
#We don't really need the check_sum column anymore...so drop it
train.drop(columns=['check_sum'], inplace=True)
test.drop(columns=['check_sum'], inplace=True)

In [None]:
# DataFrame.size is rows x columns, so these are cell counts rather than image counts.
for frame in (train, train_duplicates):
    print(frame.size)

In [None]:

## TRAIN
# Let's view some of these duplicated train images
fig = plt.figure(figsize=(18,18))
fig.suptitle('Sample of duplicate images from the Train dataset\n n= 35', size = 20,  y = 1.04, weight = 'bold')
#Get the point coordinate columns, for example: mouth_center_top_lip_x
coordinates = get_coordinate_columns()
#Get the first 35 rows of the duplicate table
idx = train_duplicates.head(35)['index'].values
#For testing, these are the duplicate ID's
print(idx)

# Tally: number of labelled points -> number of sampled images with that many points
points_tally = {}

#Loop through and plot each of the sampled images.
# The loop variable has its own name so `idx` keeps holding the full array.
for i, image_index in enumerate(idx):
    plt.subplot(7,5,i+1)
    # Look the row up once and reuse it for both the pixels and the labels
    row = train[train['index'] == image_index]
    img = row.image.values[0].reshape(96,96)
    #These are the points that have been identified on the image (x, y, x, y, ...)
    points = row[coordinates].values[0].astype(float)
    plt.imshow(img, cmap = 'gray')
    plt.axis('off')

    # A point counts only when both its x and its y coordinate are present
    x_points, y_points = points[0:30:2], points[1:30:2]
    labelled = ~(np.isnan(x_points) | np.isnan(y_points))
    matching_pts = int(labelled.sum())
    #Add the labelled points to the plot
    plt.plot(x_points[labelled], y_points[labelled], 'o', color = "red", markersize = 5)

    plt.title("Image #:[%d]\n#Points:[%d]" % (image_index, matching_pts))
    points_tally[matching_pts] = points_tally.get(matching_pts, 0) + 1

# Built in one step from the tally; DataFrame.append is not available in pandas 2.x
match_pts = pd.DataFrame(list(points_tally.items()), columns=['Points_Found', 'Count'])

plt.tight_layout()
plt.show()

In [None]:
# Count, for every duplicated train image, how many points have both an x and a y
# label, then chart how many images have each point count.
idx = train_duplicates['index'].values
#For testing, these are the duplicate ID's
print(idx)
coordinates = get_coordinate_columns()

# Tally: number of labelled points -> number of duplicate images with that many points
points_tally = {}

# The loop variable has its own name so `idx` keeps holding the full array.
for image_index in idx:
    #These are the points that have been identified on the image (x, y, x, y, ...)
    points = train[train['index'] == image_index][coordinates].values[0].astype(float)
    # A point counts only when both its x and its y coordinate are present
    x_points, y_points = points[0:30:2], points[1:30:2]
    matching_pts = int((~(np.isnan(x_points) | np.isnan(y_points))).sum())
    points_tally[matching_pts] = points_tally.get(matching_pts, 0) + 1

# Built in one step from the tally; DataFrame.append is not available in pandas 2.x
match_pts = pd.DataFrame(list(points_tally.items()), columns=['Points_Found', 'Count'])
print(match_pts)

fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111)

ax.set_title('Number of Points Found on Train Duplicate Data', fontsize = 20, fontweight = 'bold')
ax.bar(match_pts.Points_Found, match_pts.Count, width = 1.7)
ax.set_xticks(range(0,18,1))
# Write each bar's count just above it
for _, r in match_pts.iterrows():
    plt.text(r.Points_Found, r.Count + 25, format(int(r.Count), ",d"),
        horizontalalignment = 'center', verticalalignment = 'center', fontweight ='bold')
ax.spines["top"].set_alpha(.0)
ax.spines["bottom"].set_alpha(.3)
ax.spines["right"].set_alpha(.0)
ax.spines["left"].set_alpha(.0)
ax.set_xlabel("Number Of Points Found On Image", fontsize = 12, horizontalalignment='center')
ax.set_ylabel("Number of Duplicate Train Images", fontsize = 12, horizontalalignment='center')

plt.tight_layout()
plt.show()


In [None]:
# Show which columns the duplicate table carries before selecting from it below.
print(train_duplicates.columns)

In [None]:
#Do the duplicate Train images have the same labels? Let's test one out.

#Take the checksum of the first row of the duplicate table...
duplicate_image_chksum = train_duplicates.iloc[0, train_duplicates.columns.get_loc('check_sum')]

#...and collect the `index` values of every row that shares it
duplicate_image_index = train_duplicates.loc[(train_duplicates.check_sum == duplicate_image_chksum)]['index'].values

#Create an array of all of the columns ending in _x or _y
coordinate_columns = get_coordinate_columns()

#Get the labels of those rows so we can display something meaningful
duplicate_image_df = train.loc[(train['index'].isin(duplicate_image_index))][coordinate_columns]

#https://mode.com/example-gallery/python_dataframe_styling/
# Missing values are rendered through format(na_rep=...); the earlier
# set_na_rep("N/A") call was overridden by it and does not exist in pandas 2.x.
(
    duplicate_image_df.style
    .highlight_max(subset=coordinate_columns, color='green')
    .format(None, na_rep="Missing")
    .highlight_null('yellow')
)
    

In [None]:
#The labels do not match exactly in the duplicate Train images.  The challenge would be to determine which of the images to keep if we remove all but one of the duplicate images. We have two options:

#1 - Keep the first duplicate and disregard the others - Easy to do, low cost but we risk losing data.
#2 - Take the average for all coordinates across the duplicate image and apply those coordinates moving forward. A little more work involved and risk of introducing more errors to the labels.

#If we were to do #2 this is how the above image would reconcile:
#Take the mean of the columns and create a new DF
duplicate_image_df = pd.DataFrame(train.loc[(train['index'].isin(duplicate_image_index))][coordinate_columns].mean())

#Display results
# Missing values are rendered through format(na_rep=...); the earlier
# set_na_rep("N/A") call was overridden by it and does not exist in pandas 2.x.
(
    duplicate_image_df.T.style
    .format(None, na_rep="Missing")
    .highlight_null('yellow')
)




## Test Dataset Duplicate Data

In [None]:
test = reset_test_df()
print(test.size)
print(test_duplicates.size)
# Share of test rows that appear in the duplicate table, as a percentage.
# Row counts (len) are used because DataFrame.size counts rows x columns, and the
# duplicates go in the numerator so the figure really is "% that are duplicates".
duplicate_share = 100 * len(test_duplicates) / len(test) if len(test) else 0.0
print(duplicate_share, "% of test data is duplicates")

In [None]:
## TEST
# Let's view some of these duplicated test images
fig = plt.figure(figsize=(18,18))
fig.suptitle('Sample of duplicate images from the Test dataset\n n= 35', size = 20,  y = 1.04, weight = 'bold')


#Get the first 35 rows of the duplicate table
idx = test_duplicates.head(35)['index'].values
#For testing, these are the duplicate ID's
print(idx)

#Loop through and plot each of the sampled images.
# The loop variable has its own name so `idx` keeps holding the full array.
for i, image_index in enumerate(idx):
    plt.subplot(7,5,i+1)
    img = test[(test['index'] == image_index)].image.values[0].reshape(96,96)
    plt.imshow(img, cmap = 'gray')
    plt.axis('off')
    plt.title("Image #:[%d]" % (image_index))

plt.tight_layout()
plt.show()

## Duplication Conclusions -EDA on duplicate data in train and test datasets:
Train:

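1. The train dataset has 543 distinct images that occur more than once, covering 1,098 duplicated rows out of a total of 7,049 images (the 232,617 printed by the notebook is `train.size`, i.e. 7,049 rows × 33 columns, not an image count)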

2. Of the 1098 duplicate images:
    - 1096 of them had 4 points
    - 1 had  13 points
    - 1 had 15 points

Test:

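1. The test dataset has 29 distinct images that occur more than once, covering 60 duplicated rows out of a total of 1,783 images (the 7,132 printed by the notebook is `test.size`, i.e. 1,783 rows × 4 columns, not an image count)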


In [None]:
#CODE CELL FOR JACKIE
# Remove duplicates in the train dataset by taking the mean of all values for that image in each label 
def remove_train_duplicates(verbose=True):
    """Collapse every group of duplicate training images into a single row.

    Starts from a fresh ``reset_train_df()`` frame and uses the module-level
    ``train_duplicates`` table to find the groups. For each duplicated checksum
    the coordinate columns are averaged (NaNs skipped) and paired with the image
    of the first row in the group. All duplicated rows are then removed from the
    frame and the averaged rows are added at the end.

    Args:
        verbose: when true, print the frame shape before the delete, after the
            delete and after the averaged rows are added.

    Returns:
        The de-duplicated training DataFrame.
    """
    # First let's reset the frame since we've been working on the df
    train = reset_train_df()

    #Get all of the coordinate columns (read from the module-level `train` frame)
    coordinates = get_coordinate_columns()

    # One averaged single-row frame per duplicated checksum. They are collected in
    # a list and concatenated once: DataFrame.append is not available in pandas 2.x
    # and re-copied the whole frame on every iteration.
    merged_rows = []
    for check_sum in train_duplicates.check_sum.unique():
        #Get all of the duplicates with the same check_sum
        duplicates = train_duplicates[train_duplicates.check_sum == check_sum]['index'].values
        group = train[train['index'].isin(duplicates)]
        #Take the mean of all the coordinate columns - this is what we will use for the final single image
        fixed = pd.DataFrame(group[coordinates].mean(axis = 0)).T
        #Keep the first image that appears in the train dataset with this check_sum
        fixed['image'] = [group.image.values[0]]
        merged_rows.append(fixed)

    if merged_rows:
        final_images = pd.concat(merged_rows, ignore_index = True)
    else:
        # No duplicates: an empty frame that still has the coordinate columns
        final_images = train.iloc[0:0][coordinates].copy()

    #For reporting purposes:
    if verbose: print("Applying EDA fix for duplicates")
    if verbose: print("="*13 + "Train" + "="*13)
    if verbose: print("Before delete:     %s" % str(train.shape))

    #Remove the duplicates from train - danger, danger, must replace them
    train = train[~(train['index'].isin(train_duplicates['index'].values))]
    if verbose: print("After  delete:     %s" % str(train.shape))

    #Replace removed duplicates with final_images
    # NOTE(review): an `index` column already exists here, so reset_index() is expected
    # to store the new row positions as `level_0`; that column and the leftover
    # `check_sum` column are kept so the returned layout matches the previous
    # implementation - confirm whether later cells need them.
    train = pd.concat([train, final_images], ignore_index = True).reset_index()
    train.drop(columns=['index'], inplace = True)
    if verbose: print("After  append:     %s" % str(train.shape))
    return train



In [None]:
#CODE CELL FOR JACKIE
##########Test Data set

#Now do the same for test, this will be easier since we don't need
#to deal with points and taking the mean
def remove_test_duplicates(verbose=True):
    """Return the test data with only the first row kept for each distinct image.

    This is simpler than the train case because there are no labels to average:
    a row is kept only if its ``check_sum`` has not been seen earlier in the
    frame. The result is renumbered from 0.

    Args:
        verbose: when true, print the frame shape before and after the duplicates
            are removed.

    Returns:
        A DataFrame with the columns of ``reset_test_df()`` and one row per
        distinct ``check_sum``.
    """
    test = reset_test_df()
    if verbose: print("="*13 + "Test=" + "="*13)
    if verbose: print("Before delete:     %s" % str(test.shape))

    # keep='first' keeps the earliest row for each checksum
    final_test_images = test.drop_duplicates(subset='check_sum', keep='first').reset_index(drop=True)

    # Report the shape of the de-duplicated frame, not of the untouched input
    if verbose: print("After  delete:     %s" % str(final_test_images.shape))
    return final_test_images



In [None]:
#CODE CELL FOR JACKIE
# Replace the working frames with their de-duplicated versions, printing the shape
# report of each step with a blank line between the two reports.
train = remove_train_duplicates(verbose=True)
print()
test = remove_test_duplicates(verbose=True)

In [None]:
#CODE CELL FOR JACKIE

In [None]:
#CODE CELL FOR JOANIE

In [None]:
#CODE CELL FOR JOANIE

In [None]:
#CODE CELL FOR JOANIE

In [None]:
#CODE CELL FOR JOANIE

In [None]:
#CODE CELL FOR JOANIE

In [None]:
#CODE CELL FOR JOANIE

In [None]:
#CODE CELL FOR JOANIE

In [None]:
#CODE CELL FOR JOANIE

In [None]:
#CODE CELL FOR JOANIE

In [None]:
#CODE CELL FOR JOANIE

In [None]:
#CODE CELL FOR RAKESH

In [None]:
#CODE CELL FOR RAKESH

In [None]:
#CODE CELL FOR RAKESH

In [None]:
#CODE CELL FOR RAKESH

In [None]:
#CODE CELL FOR RAKESH

In [None]:
#CODE CELL FOR RAKESH

In [None]:
#CODE CELL FOR RAKESH

In [None]:
#CODE CELL FOR RAKESH

In [None]:
#CODE CELL FOR RAKESH

In [None]:
#CODE CELL FOR RAKESH

In [None]:
#CODE CELL FOR SANDIP

In [None]:
#CODE CELL FOR SANDIP

In [None]:
#CODE CELL FOR SANDIP

In [None]:
#CODE CELL FOR SANDIP

In [None]:
#CODE CELL FOR SANDIP

In [None]:
#CODE CELL FOR SANDIP

In [None]:
#CODE CELL FOR SANDIP

In [None]:
#CODE CELL FOR SANDIP

In [None]:
#CODE CELL FOR SANDIP

In [None]:
#CODE CELL FOR SANDIP