In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tabulate
import keras
from time import time

In [5]:
# Load the raw train/test files (paths are relative to the notebook's working directory).
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
# Single-argument print(...) calls emit the same text on Python 2 and Python 3.
print("train size: {}".format(train.shape))
print("test size: {}".format(test.shape))

# Keep what is needed to split the combined frame back into train/test later:
# train is concatenated first with ignore_index=True, so its rows occupy the
# first len(train) positions of `data` (presumably matching train.index, which
# is the default 0..n-1 index since read_csv is called without index_col).
train_indices = train.index
train_labels = train.Footfall
columns = train.columns
# Drop the label by name instead of assuming it is the last column, so the
# feature frame stays correct even if the column order of Train.csv changes.
data = pd.concat([train.drop('Footfall', axis=1), test], ignore_index=True)

train size: (114539, 18)
test size: (39420, 17)


### data cleaning
- datatypes
- missing values
- zero-variance variables
- transformation (rescale, normalization, ...)

In [9]:
# methods
def validate_data_set(df):
    """
    Print a per-column summary of a dataset.
    For every column the table shows its dtype, the number of unique values
    (as counted by pd.unique) and the percentage of missing values rounded to
    two decimals.
    input:
    df (pandas.DataFrame): dataset to summarise
    output:
    None, the table is printed to stdout
    """
    n_rows = df.shape[0]
    summary = []
    for col in df.columns:
        # count NaNs directly on the column instead of filtering the whole frame
        n_missing = int(df[col].isnull().sum())
        # an empty frame has no missing values; avoid dividing by zero
        pct_missing = round(100 * n_missing / float(n_rows), 2) if n_rows else 0.0
        summary.append((col, df[col].dtype, len(pd.unique(df[col])), pct_missing))
    # one header per row field (the former fifth header 'Var' had no matching column)
    headers = ['Label', 'dtype', '# unique values', '# missing values (%)']
    print(tabulate.tabulate(summary, headers))
    
def string2date(x):
    """
    convert a string date of format dd-mm-yyyy to a date object
    input:
    x (str): date, e.g. '01-09-1990'
    output:
    y (datetime.date): the parsed calendar date (no time component)
    raises:
    ValueError if x does not match the dd-mm-yyyy format
    """
    # Use the standard library directly: the pd.datetime alias is deprecated
    # and no longer exists in recent pandas releases.
    import datetime
    return datetime.datetime.strptime(x, '%d-%m-%Y').date()

def impute_nans(x,method):
    """
    replace NaN with the mean, median or most frequent value
    input:
    x (pandas.Series): array to be imputed (not modified, a filled copy is returned)
    method {'mean','median','most_frequent'}: method to be used in the imputation. Note that
    'mean' and 'median' need numeric data, while 'most_frequent' also works on categorical data
    output:
    y1 (pandas.Series): array with NaNs replaced by the method
    y2 (float,int,str): the value used to replace NaNs (NaN when x has no valid value)
    raises:
    ValueError if method is not one of the supported names
    """
    if method == 'mean':
        val = x.mean()
    elif method == 'median':
        val = x.median()
    elif method == 'most_frequent':
        modes = x.mode()
        # mode() is empty when every entry is NaN; fall back to NaN so the
        # result matches what 'mean'/'median' give for an all-NaN series
        val = modes.values[0] if len(modes) > 0 else float('nan')
    else:
        # previously an unknown method fell through and failed on the unset `val`
        raise ValueError("unknown imputation method: %r "
                         "(expected 'mean', 'median' or 'most_frequent')" % (method,))
    return x.fillna(val), val

# Summarise dtype, unique-value count and missing-value percentage for every
# column of the combined train + test feature frame.
validate_data_set(data)


Label                         dtype      # unique values    # missing values (%)
----------------------------  -------  -----------------  ----------------------
ID                            int64               153959                    0
Park_ID                       int64                   28                    0
Date                          object                5600                    0
Direction_Of_Wind             float64                360                    3.52
Average_Breeze_Speed          float64                188                    3.52
Max_Breeze_Speed              float64                 30                    3.53
Min_Breeze_Speed              float64                 19                    3.52
Var1                          float64                493                    7.28
Average_Atmospheric_Pressure  float64                178                   34.66
Max_Atmospheric_Pressure      float64                176                   34.66
Min_Atmospheric_Pressure      float64

In [14]:
# datatypes
tic = time()
# convert the Date strings (dd-mm-yyyy) to date objects
data['Date'] = data.Date.apply(string2date)

# treat missing values: each column is imputed separately per park, using the
# statistic (median or mean) under which it is listed below
methods = {
    'median':['Direction_Of_Wind','Average_Breeze_Speed','Max_Breeze_Speed','Min_Breeze_Speed',
              'Var1','Min_Ambient_Pollution','Max_Ambient_Pollution','Average_Moisture_In_Park',
              'Max_Moisture_In_Park','Min_Moisture_In_Park'],
    'mean':['Average_Atmospheric_Pressure','Max_Atmospheric_Pressure','Min_Atmospheric_Pressure',
            ]
}
# nan_substitutes[park_id][column] -> value used to fill that column's NaNs for that park
nan_substitutes = {}
park_ids = sorted(pd.unique(data.Park_ID))
for pid in park_ids:
    # Create the per-park dict once. It used to be reset for every method, which
    # discarded the substitutes recorded for the method processed first.
    nan_substitutes[pid] = {}
    # rows of this park; computed once and reused for every method and column
    indices = data[data.Park_ID==pid].index
    # .items() works on both Python 2 and Python 3 (unlike .iteritems())
    for k,v in methods.items():
        for col in v:
            park_values = data.loc[indices,col]
            if park_values.notnull().any():
                imputed, substitute = impute_nans(park_values,k)
                data.loc[indices,col] = imputed
            else:
                # the park has no valid observation for this column: fall back to 0
                substitute = 0
                data.loc[indices,col] = substitute
            nan_substitutes[pid][col] = substitute
print("Processing time: {}".format(round(time()-tic,2)))

Processing time: 9.98
