In [1]:
import numpy as np
import pandas as pd

import pandas_profiling as pp
from sklearn.model_selection import train_test_split

import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torchvision

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Path of the training CSV; it is relative, so it is resolved against the
# notebook's current working directory.
TRAINPATH = 'training.csv'
# The test-set path is currently disabled.
#TESTPATH = '/home/izinizik/kagle_twarz/test.csv'

In [3]:
# Load the training CSV found at TRAINPATH into a DataFrame.
df_train = pd.read_csv(TRAINPATH)
# Loading of the test set is currently disabled.
#df_test = pd.read_csv(TESTPATH)

In [4]:
# Turn each space-separated pixel string in 'Image' into a 1-D numeric array.
# The column is overwritten in place, so on a second run of this cell the values
# are already arrays; anything that is not a string is passed through untouched
# to keep the cell safe to re-run.
df_train['Image'] = df_train['Image'].apply(
    lambda im: np.fromstring(im, sep=' ') if isinstance(im, str) else im
)

In [5]:
# Per-column variance computed on the rows that have no missing values at all.
# numeric_only=True restricts the reduction to numeric columns, so the
# object-dtype 'Image' column is skipped explicitly rather than relying on
# pandas to drop it silently (newer pandas releases raise instead).
df_train.dropna().var(axis=0, numeric_only=True)

left_eye_center_x             4.358422
left_eye_center_y             5.262562
right_eye_center_x            4.208961
right_eye_center_y            4.992247
left_eye_inner_corner_x       4.022554
left_eye_inner_corner_y       4.139191
left_eye_outer_corner_x       7.298855
left_eye_outer_corner_y       7.204727
right_eye_inner_corner_x      3.322540
right_eye_inner_corner_y      4.038109
right_eye_outer_corner_x      7.666276
right_eye_outer_corner_y      7.048508
left_eyebrow_inner_end_x      7.951913
left_eyebrow_inner_end_y      8.220442
left_eyebrow_outer_end_x     10.973629
left_eyebrow_outer_end_y     13.156485
right_eyebrow_inner_end_x     6.810261
right_eyebrow_inner_end_y     8.078207
right_eyebrow_outer_end_x    11.141585
right_eyebrow_outer_end_y    13.281230
nose_tip_x                   10.732525
nose_tip_y                   20.508537
mouth_left_corner_x          13.323456
mouth_left_corner_y          19.700859
mouth_right_corner_x         12.924764
mouth_right_corner_y     

In [6]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7049 entries, 0 to 7048
Data columns (total 31 columns):
left_eye_center_x            7039 non-null float64
left_eye_center_y            7039 non-null float64
right_eye_center_x           7036 non-null float64
right_eye_center_y           7036 non-null float64
left_eye_inner_corner_x      2271 non-null float64
left_eye_inner_corner_y      2271 non-null float64
left_eye_outer_corner_x      2267 non-null float64
left_eye_outer_corner_y      2267 non-null float64
right_eye_inner_corner_x     2268 non-null float64
right_eye_inner_corner_y     2268 non-null float64
right_eye_outer_corner_x     2268 non-null float64
right_eye_outer_corner_y     2268 non-null float64
left_eyebrow_inner_end_x     2270 non-null float64
left_eyebrow_inner_end_y     2270 non-null float64
left_eyebrow_outer_end_x     2225 non-null float64
left_eyebrow_outer_end_y     2225 non-null float64
right_eyebrow_inner_end_x    2270 non-null float64
right_eyebrow_inner_end_y 

In [9]:
# Share of missing values in every column: the mean of the boolean NaN mask
# equals (number of NaNs) / (number of rows).
df_train.isna().mean(axis=0)

left_eye_center_x            0.001419
left_eye_center_y            0.001419
right_eye_center_x           0.001844
right_eye_center_y           0.001844
left_eye_inner_corner_x      0.677827
left_eye_inner_corner_y      0.677827
left_eye_outer_corner_x      0.678394
left_eye_outer_corner_y      0.678394
right_eye_inner_corner_x     0.678252
right_eye_inner_corner_y     0.678252
right_eye_outer_corner_x     0.678252
right_eye_outer_corner_y     0.678252
left_eyebrow_inner_end_x     0.677969
left_eyebrow_inner_end_y     0.677969
left_eyebrow_outer_end_x     0.684352
left_eyebrow_outer_end_y     0.684352
right_eyebrow_inner_end_x    0.677969
right_eyebrow_inner_end_y    0.677969
right_eyebrow_outer_end_x    0.682792
right_eyebrow_outer_end_y    0.682792
nose_tip_x                   0.000000
nose_tip_y                   0.000000
mouth_left_corner_x          0.678110
mouth_left_corner_y          0.678110
mouth_right_corner_x         0.677969
mouth_right_corner_y         0.677969
mouth_center

In [None]:
# Build a pandas-profiling report for the training frame and write it out as
# an HTML file named train.html.
# NOTE(review): if the 'Image' parsing cell has been run, df_train carries one
# array per row in 'Image'; profiling that column may be slow — consider
# excluding it first.
# NOTE(review): the keyword accepted by to_file() may differ between
# pandas_profiling releases — confirm against the installed version.
profile = pp.ProfileReport(df_train)
profile.to_file(outputfile="train.html")

In [16]:
# Unique keypoint prefixes: strip the trailing character ('x' or 'y') from every
# column name except the last column, which is skipped.
feature_names_wo_xy = {column[:-1] for column in df_train.columns[:-1]}

In [11]:
# For every keypoint, print the fraction of rows in which the x and y
# coordinates are either both present or both missing (1.0 means the two are
# never labelled independently).
# sorted() gives a stable print order; iterating the set directly can produce a
# different order in every kernel session.
for name in sorted(feature_names_wo_xy):
    same_missingness = df_train[name + 'x'].isna() == df_train[name + 'y'].isna()
    # The mean of a boolean Series is the share of True values; this stays
    # vectorised instead of looping element by element with builtin sum().
    print(name, same_missingness.mean())

left_eye_inner_corner_ 1.0
nose_tip_ 1.0
right_eyebrow_inner_end_ 1.0
mouth_center_bottom_lip_ 1.0
right_eye_inner_corner_ 1.0
left_eye_center_ 1.0
right_eye_outer_corner_ 1.0
left_eye_outer_corner_ 1.0
right_eye_center_ 1.0
mouth_left_corner_ 1.0
right_eyebrow_outer_end_ 1.0
mouth_center_top_lip_ 1.0
left_eyebrow_inner_end_ 1.0
left_eyebrow_outer_end_ 1.0
mouth_right_corner_ 1.0


In [48]:
# A keypoint is kept when the variances of its x and y columns sum to more
# than this value.
BIG_VAR_THRESHOLD = 40

# Prefixes (e.g. 'nose_tip_') of the keypoints that pass the threshold.
big_var_prefixes = {
    name
    for name in feature_names_wo_xy
    if df_train[name + 'x'].var() + df_train[name + 'y'].var() > BIG_VAR_THRESHOLD
}

# Collect the matching columns in the frame's own column order rather than in
# set-iteration order, so the resulting column layout is identical on every run.
big_var_names = [col for col in df_train.columns if col[:-1] in big_var_prefixes]

# The pixel data always travels together with the selected targets.
big_var_names.append('Image')

In [5]:
# One-column frame ('ratio') holding the share of missing values per column of
# df_train, indexed by column name.
na_ratio = pd.DataFrame({'ratio': df_train.isna().mean(axis=0)})

In [12]:
# Keep only the columns whose missing-value ratio is below 0.0015.
df_train_low_na = df_train.loc[
    :, na_ratio[na_ratio['ratio'] < 0.0015].index
]

In [49]:
# Restrict the frame to the selected columns and discard every row that has a
# missing value in any of them.
df_train_big_var = df_train.loc[:, big_var_names].dropna(axis=0, how='any')

In [50]:
# Print the summary of the filtered frame: remaining rows, non-null counts, dtypes.
df_train_big_var.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7016 entries, 0 to 7048
Data columns (total 5 columns):
nose_tip_x                   7016 non-null float64
nose_tip_y                   7016 non-null float64
mouth_center_bottom_lip_x    7016 non-null float64
mouth_center_bottom_lip_y    7016 non-null float64
Image                        7016 non-null object
dtypes: float64(4), object(1)
memory usage: 328.9+ KB


In [36]:
# Fraction of rows in the low-NA frame that contain no missing values at all.
df_train_low_na.dropna().shape[0] / df_train_low_na.shape[0]

NameError: name 'df_train_low_na' is not defined

In [15]:
# Number of columns kept in the low-NA frame.
df_train_low_na.shape[1]

5

In [51]:
# Number of columns kept in the high-variance frame.
df_train_big_var.shape[1]

5