# 1. Imports
Useful libraries.

## 1.1 General
Utilities useful for handling data, displaying, etc.

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1.2. sklearn
Machine Learning library with models, methods for partitioning and preprocessing data.

In [2]:
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn import linear_model, datasets, metrics
from sklearn.cross_validation import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

## 1.3. scipy
Mathematical functions.

In [3]:
from scipy.ndimage import convolve

## 1.4. seaborn
Data visualisations, pretty graphs and statistics.

In [4]:
import seaborn as sns

# 2. Data preprocessing
Loading data, splitting into sets, normalisations.

## 2.1. Load data
Training and testing sets:

In [5]:
# Training CSV; the path is relative to the notebook's working directory.
FILE_TRAIN = 'mnist_kaggle/train.csv'
train = pd.read_csv(FILE_TRAIN)

In [6]:
# Testing CSV; the path is relative to the notebook's working directory.
FILE_TEST = 'mnist_kaggle/test.csv'
test = pd.read_csv(FILE_TEST)

Split features and target variable:

In [7]:
def split_variables(df, target_label):
    """Splits a pandas dataframe into features and target variables.

    Parameters
    ----------
    df : a dataframe containing target_label column
    target_label : a string name of the column

    Returns
    -------
    X : a dataframe with every column of df except target_label
    y : a series with target values

    Raises
    ------
    KeyError : if target_label is not a column of df
    """
    # Boolean column mask with .loc; this replaces the deprecated .ix indexer.
    feature_mask = df.columns != target_label
    X = df.loc[:, feature_mask]
    y = df[target_label]
    return X, y

In [8]:
# Name of the target column in the training frame.
TARGET_LABEL = 'label'
X, y = split_variables(train, TARGET_LABEL)

In [9]:
# The test frame is used as the feature matrix as-is; no target column is
# split off. X_test is the same object as test, not a copy.
X_test = test

## 2.2. Split into training and validation

### 2.2.1 Stratified K-Fold for classification problems
Because all categories of *y* should be present in similar proportions in the data sets.

In [10]:
EVAL_FRAC = 0.1
# Use 1/EVAL_FRAC folds so each validation fold holds about EVAL_FRAC of the rows.
# int() keeps the fold count an integer where round() returns a float (Python 2).
kf = StratifiedKFold(y, int(round(1. / EVAL_FRAC)))
train_idx, valid_idx = next(iter(kf))
# The fold indices are row positions, so select positionally with .iloc
# instead of the deprecated .ix indexer.
X_train = X.iloc[train_idx, :]
y_train = y.iloc[train_idx]
X_valid = X.iloc[valid_idx, :]
y_valid = y.iloc[valid_idx]

### 2.2.2 Usual K-Fold for regression problems

In [11]:
EVAL_FRAC = 0.1
# Use 1/EVAL_FRAC folds so each validation fold holds about EVAL_FRAC of the rows.
# int() keeps the fold count an integer where round() returns a float (Python 2).
kf = KFold(len(y), int(round(1. / EVAL_FRAC)))
train_idx, valid_idx = next(iter(kf))
# The fold indices are row positions, so select positionally with .iloc
# instead of the deprecated .ix indexer.
X_train = X.iloc[train_idx, :]
y_train = y.iloc[train_idx]
X_valid = X.iloc[valid_idx, :]
y_valid = y.iloc[valid_idx]

### 2.2.3 Splitting function
Do the above in one line.

In [12]:
def get_next_split(X, y, eval_frac, kfold):
    """Splits data into training and validation sets

    Parameters
    ----------
    X : a dataframe containing features
    y : a series with target variable
    eval_frac : fraction of data used for validation; not used by this
        function (the fold sizes come from kfold), kept so that existing
        calls keep working
    kfold : (Stratified)KFold object, or any iterable yielding
        (train_idx, valid_idx) pairs of row positions

    Returns
    -------
    X_train, y_train, X_valid, y_valid : data split into training
    and validation

    Notes
    -----
    iter(kfold) is called on every invocation, so a re-iterable kfold
    always yields its first fold; pass an iterator to step through folds.
    """
    # Read the folds from the kfold argument, not from a notebook-level variable.
    train_idx, valid_idx = next(iter(kfold))
    # Fold indices are positions, hence .iloc (the .ix indexer is deprecated).
    X_train, X_valid = X.iloc[train_idx, :], X.iloc[valid_idx, :]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    return X_train, y_train, X_valid, y_valid

In [13]:
# Same split as the cells above, obtained through the helper function.
X_train, y_train, X_valid, y_valid = get_next_split(X, y,
                                                    EVAL_FRAC, kf)

## 2.3. Preprocess

### 2.3.1 Deal with non-full columns
Replace NaNs or missing data

### 2.3.2 Categorical variables

Load Titanic data to show some examples.

In [14]:
# dummy function so that the notebook-level variables are not shadowed
def titanic_demo():
    """Demonstrates one-hot encoding of categorical columns on Titanic data.

    Reads 'titanic_kaggle/train.csv', splits off the 'Survived' target,
    drops the Name, Ticket and Cabin columns and one-hot encodes the
    remaining object-typed columns with pandas.get_dummies.

    Nothing is returned or printed; uncomment the print lines to inspect
    the intermediate results.
    """
    f = 'titanic_kaggle/train.csv'
    train = pd.read_csv(f)
    target = 'Survived'
    X, y = split_variables(train, target)

    # Remove these columns before looking for categorical ones.
    X_tmp = X.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    X_cat = X_tmp.select_dtypes(include=['object'])
    cat_labels = X_cat.columns.values
    #print(cat_labels)

    # pandas way
    # Encode the reduced frame (X_tmp) so the dropped columns stay out of
    # the encoded result.
    X_1h = pd.get_dummies(X_tmp,
                          prefix=cat_labels,
                          columns=cat_labels)
    #print(X_1h.head())

    #print(X_cat[cat_labels])

    # LabelEncoder + OneHotEncoder way (sketch only, not executed)
    #feature = 'Sex'
    #lbl = LabelEncoder()
    #lbl.fit(X[feature])
    #X_lbl = lbl.transform(X[feature])
    #
    #print(X_lbl)

    #ohe = OneHotEncoder()
    #ohe.fit(X_lbl.reshape(-1, 1))
    #X_1h = ohe.transform(X_lbl.reshape(-1, 1))
    #print(X_1h.toarray())


titanic_demo()

Extract features by looking at the types of the variables. This is not always sufficient -- variables represented by numbers can be categorical (e.g. *y* in MNIST) -- have a look at Titanic data above.

In [15]:
# Columns with dtype object are taken as the categorical candidates.
X_cat = X.select_dtypes(include=['object'])
print(X_cat.columns.values)

[]


### One hot encoding
For pandas use *get_dummies* (see Titanic example).

Otherwise use sklearn's *OneHotEncoder* -- it is not run in this notebook, only sketched in the commented-out part of the Titanic example.

### Decoding
No general way. Fiddle with *df.stack* or have a solution specific
to data.

### 2.3.3 Numerical variables
Bear in mind that numerical variables can be used to represent categories, thus automatic extraction is not fool-proof.

In [16]:
# Keep only the columns whose dtype is one of the listed float / signed-integer types.
X_num = X.select_dtypes(include=['float', 'int8', 'int16', 'int32', 'int64'])

### Normalise data
- $x = \frac{x - \min(x)}{\max(x) - \min(x)}$ - linearly scale data into range $[0, 1]$
- $x = x - min(x)$ - shift data to zero
- $x = \log(1+x)$ - for numerical stability and mathematical reasons (easier to catch trends) - only defined for $x > -1$, so use it on non-negative values!

In [17]:
def scale_01(X, eps=0.001):
    """Linearly rescales X towards the range [0, 1].

    Parameters
    ----------
    X : a dataframe or series (anything providing min(), max() and
        element-wise arithmetic); a dataframe is scaled column by column
    eps : constant added to the range (max - min) in the denominator so
        that a constant column does not divide by zero. The default 0.001
        keeps the previous behaviour, where the maximum maps to slightly
        less than 1. With eps=0 the maximum maps to exactly 1, but a
        constant column then divides by zero.

    Returns
    -------
    An object of the same shape as X holding (X - min) / (max - min + eps).
    """
    lowest = X.min()
    span = X.max() - lowest
    return (X - lowest) / (span + eps)

# Rescale the selected numeric columns with scale_01.
X_scaled = scale_01(X_num)

In [18]:
# Element-wise log(1 + x) of the numeric columns.
X_log1p = np.log1p(X_num)

In [19]:
# Centre each numeric column by subtracting its mean.
X_zeroed = X_num - X_num.mean()

In [20]:
def normalise_labels(X, labels):
    """Returns a copy of X with the given columns scaled and log-transformed.

    Parameters
    ----------
    X : a dataframe
    labels : column label(s) of X to normalise

    Returns
    -------
    A new dataframe in which the columns in labels hold
    log1p(scale_01(column)); all other columns are copied unchanged.
    X itself is not modified.
    """
    X_tmp = X.copy()
    # Apply log1p to the scaled values so the scaling step is not overwritten.
    X_tmp[labels] = np.log1p(scale_01(X[labels]))
    return X_tmp

# 3. Models

## 3.1 Restricted Boltzmann Machine
Binary