# 1. Imports
Useful libraries.

## 1.1 General
Utilities useful for handling data, displaying, etc.

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1.2. sklearn
Machine Learning library with models, methods for partitioning and preprocessing data.

In [19]:
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn import linear_model, datasets, metrics
from sklearn.cross_validation import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder

## 1.3. scipy
Mathematical functions.

In [4]:
from scipy.ndimage import convolve

## 1.4. seaborn
Data visualisations, pretty graphs and statistics.

In [55]:
import seaborn as sns

# 2. Data preprocessing
Loading data, splitting into sets, normalisations.

## 2.1. Load data
Training and testing sets:

In [7]:
# Training data; it contains the 'label' target column that is split off below.
# The path is relative to the notebook's working directory.
FILE_TRAIN = 'mnist_kaggle/train.csv'
train = pd.read_csv(FILE_TRAIN)

In [9]:
# Test data; the path is relative to the notebook's working directory.
FILE_TEST = 'mnist_kaggle/test.csv'
test = pd.read_csv(FILE_TEST)

Split features and target variable:

In [10]:
def split_variables(df, target_label):
    """Splits a pandas dataframe into features and target variables.

    Parameters
    ----------
    df : a dataframe containing target_label column
    target_label : a string name of the column

    Returns
    -------
    X : a dataframe with every column of df except target_label
    Y : a series with target values (the target_label column)

    Raises
    ------
    KeyError : if df has no column named target_label
    """

    # Boolean mask over the columns; .loc replaces the removed .ix indexer.
    feature_mask = df.columns != target_label
    X = df.loc[:, feature_mask]
    Y = df[target_label]
    return X, Y

In [21]:
# Name of the target column in the training data.
TARGET_LABEL = 'label'
# X: all remaining columns (features); y: the target column.
X, y = split_variables(train, TARGET_LABEL)

In [14]:
# The test frame is used as the feature matrix as-is.
# NOTE(review): presumably the test CSV has no 'label' column -- confirm.
X_test = test

## 2.2. Split into training and validation

### 2.2.1 Stratified K-Fold for classification problems
Stratification is used because every category of *y* should be present in similar proportions in both the training and the validation set.

In [44]:
# Fraction of the rows held out for validation.
EVAL_FRAC = 0.1
# The number of folds is 1 / EVAL_FRAC; int(...) because round() returns a
# float on Python 2 and the fold count must be an integer.
kf = StratifiedKFold(y, int(round(1./EVAL_FRAC)))
# Only the first fold is used.
train_idx, valid_idx = next(iter(kf))
# The fold indices are row positions, so select positionally with .iloc
# (.ix is deprecated and was removed from pandas).
X_train = X.iloc[train_idx, :]
y_train = y.iloc[train_idx]
X_valid = X.iloc[valid_idx, :]
y_valid = y.iloc[valid_idx]

### 2.2.2 Usual K-Fold for regression problems

In [45]:
# Fraction of the rows held out for validation.
EVAL_FRAC = 0.1
# The number of folds is 1 / EVAL_FRAC; int(...) because round() returns a
# float on Python 2 and the fold count must be an integer.
kf = KFold(len(y), int(round(1./EVAL_FRAC)))
# Only the first fold is used.
train_idx, valid_idx = next(iter(kf))
# The fold indices are row positions, so select positionally with .iloc
# (.ix is deprecated and was removed from pandas).
X_train = X.iloc[train_idx, :]
y_train = y.iloc[train_idx]
X_valid = X.iloc[valid_idx, :]
y_valid = y.iloc[valid_idx]

### 2.2.3 Splitting function
Do the above in one line.

In [49]:
def get_next_split(X, y, eval_frac, kfold):
    """Splits data into training and validation sets

    Parameters
    ----------
    X : a dataframe containing features
    y : a series with target variable
    eval_frac : fraction of data used for validation (currently unused:
        the size of the validation set is determined by kfold; the
        parameter is kept so existing calls keep working)
    kfold : (Stratified)KFold object, or any iterable yielding
        (train_idx, valid_idx) pairs of positional row indices.
        Objects exposing a ``split(X, y)`` method are also accepted.

    Returns
    -------
    X_train, y_train, X_valid, y_valid : data split into training
    and validation

    Notes
    -----
    The first pair produced by iterating kfold is used. For re-iterable
    objects this is the first fold on every call; if kfold is itself an
    iterator, each call consumes the next fold.
    """
    # Use the kfold argument (not a notebook-level global). Splitters that
    # are not iterable themselves provide their folds through .split(X, y).
    if hasattr(kfold, 'split'):
        folds = kfold.split(X, y)
    else:
        folds = kfold
    train_idx, valid_idx = next(iter(folds))

    # Fold indices are row positions, hence .iloc rather than label lookup.
    X_train = X.iloc[train_idx, :]
    y_train = y.iloc[train_idx]
    X_valid = X.iloc[valid_idx, :]
    y_valid = y.iloc[valid_idx]
    return X_train, y_train, X_valid, y_valid

In [50]:
# Same kind of split as in the cells above, done through the helper;
# EVAL_FRAC and kf are the values defined in the preceding cells.
X_train, y_train, X_valid, y_valid = get_next_split(X, y,
                                                    EVAL_FRAC, kf)

## 2.3. Preprocess

### 2.3.1 Categorical variables
Converting categorical variables to a one-hot encoding and decoding them back.

Load Titanic data to show some examples.

In [66]:
# wrapped in a function so that the notebook-level train, X and y are not shadowed
def titanic_demo():
    """Print the first rows of the Titanic features (all columns except 'Survived')."""
    titanic = pd.read_csv('titanic_kaggle/train.csv')
    features, _ = split_variables(titanic, 'Survived')
    print(features.head())


titanic_demo()

   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  female  38.0      1      0          PC 17599  71.2833   C85        C  
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3  female  35.0      1      0            113803  53.1000  C123        S  
4    male  35.0      0      0            373450   8.0500   NaN        S  


Extract the categorical features by looking at the type of each variable. This is not always sufficient -- variables represented by numbers can also be categorical (e.g. *y* in MNIST).

In [61]:
# Columns stored with dtype 'object' are treated as categorical.
categorical_features = X.dtypes[X.dtypes == 'object'].index
# NOTE(review): this immediately replaces the dtype-based result above with a
# hard-coded list. 'label' is the target column that was split out of `train`,
# so it is not a column of X -- confirm which of the two assignments is intended.
categorical_features = ['label']

Index([], dtype='object')

In [59]:
ohe = OneHotEncoder()
# The original `Y[]` was a syntax error and `Y` is not defined at notebook
# level; the target series is `y`. OneHotEncoder expects a 2-D input, so the
# series is reshaped into a single column.
# NOTE(review): assumes the target labels are what should be encoded -- confirm.
ohe.fit(y.values.reshape(-1, 1))



OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

### 2.3.2 Numerical variables

In [54]:
# Names of all columns of `train` whose dtype is not 'object'.
# Note: a numerically encoded target column is selected as well.
numerical_features = train.columns[(train.dtypes != 'object').values]