In [1]:
import logging
import random
from pprint import pformat

import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adagrad
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.regularizers import l1, l2, activity_l2

from bokeh.io import output_notebook
from bokeh.plotting import figure, show

Using Theano backend.




In [2]:
logging.basicConfig(
    level=logging.DEBUG,
    format='%(levelname)-8s %(message)s'
)

In [3]:
output_notebook()
np.random.seed(0)

In [4]:
# Load training data
def load_data(input_data):
    """
    Loads data as pandas dataframe
    input_data: a single csv file with an ID column
    """
    df_train = pd.read_csv(input_data, index_col="ID")

    # Fill categorical categories with NA value and convert them to the right
    # type
    for col in df_train.select_dtypes(include=['object']).columns:
        df_train[col] = df_train[col].fillna(value='NA', axis=0)
        df_train[col] = df_train[col].astype('category')

    # Fill the other columns with 0 as the fill value
    for col in df_train.select_dtypes(exclude=['category']).columns:
        df_train[col] = df_train[col].fillna(value=-1, axis=0)

    old_length = df_train.shape[0]
    df_train = df_train.dropna(axis=0, how='any')
    row_diff = old_length - df_train.shape[0]
    logging.debug(
        "Dropped {} rows with NAs {:.1%}".format(
            row_diff,
            float(row_diff)/old_length
        )
    )
    return df_train


def categorical_to_front(input_df):
    cat_columns = list(input_df.select_dtypes(include=['category']).columns)

    logging.debug("Number of categorical columns: {}".format(len(cat_columns)))

    other_columns = list(input_df.select_dtypes(exclude=['category']).columns)

    new_column_order = cat_columns + other_columns
    train_df = input_df[new_column_order]

    return train_df


def categorical_analysis(input_data):
    categories = []
    for col in input_data.columns:
        if str(input_data[col].dtype) == "category":
            cat = {
                "col_lbl": col,
                "cat_count": input_data[col].cat.categories.shape[0]
            }
            categories.append(cat)
    return categories


def convert_category_to_columns(input_data, column_name):
    if not isinstance(input_data, pd.DataFrame):
        raise TypeError("Input data must be a Pandas DataFrame")
    if str(input_data[column_name].dtype) != "category":
        raise RuntimeError("Can only run this on categorical columns")
    categories = input_data[column_name].cat.categories

    for cat in categories:
        new_col_name = "{col}_{cat}".format(col=column_name, cat=cat)
        input_data[new_col_name] = np.where(
            input_data[column_name] == cat,
            1,
            0
        )


def convert_categories_to_columns(input_data, cat_thres=130):
    cols_to_remove = []
    for col in input_data.columns:
        if str(input_data[col].dtype) == 'category':
            if input_data[col].cat.categories.shape[0] < cat_thres:
                convert_category_to_columns(input_data, col)
                cols_to_remove.append(col)
    input_data.drop(cols_to_remove, axis=1, inplace=True)

In [5]:
def remove_categorical(input_df):
    """
    Removes categorical and only leaves numerical variables in the dataframe
    Returns a pandas dataframe
    input: pandas dataframe with categorical & numerical data
    """
    cat_columns = list(input_df.select_dtypes(include=['category']).columns)

    logging.debug("Number of categorical columns: {}".format(len(cat_columns)))

    other_columns = list(input_df.select_dtypes(exclude=['category']).columns)

    new_column_order = other_columns
    train_df = input_df[new_column_order]

    return train_df

In [6]:
def train_test(input_data):
    """
    Splits data to training & testing sets
    Splits columns to input & output for training & testing set respectively
    Returns ndarrays
    input: a pandas dataframe with a "target" column
    """
    # Reorder the columns, categorical go first
    train_df = categorical_to_front(input_data)
    logging.debug(
        "Categories and label counts: {}".format(
            pformat(categorical_analysis(train_df))
        )
    )

    convert_categories_to_columns(train_df)

    logging.debug(train_df.get_dtype_counts())

    # Temporary
    for col in train_df.select_dtypes(include=['category']).columns:
        train_df[col] = train_df[col].astype('category').cat.codes

    train_inp = train_df.drop('target', axis=1).as_matrix()
    train_out = train_df['target'].as_matrix()

    logging.debug(
        "Train 0s/1s: {:.2%} / {:.2%}".format(
            1.0 - np.average(train_out),
            np.average(train_out)
        )
    )

    x_train, x_test, y_train, y_test = train_test_split(train_inp,
                                                        train_out,
                                                        test_size=0.33,
                                                        random_state=42)
    y_train = np_utils.to_categorical(y_train)
    y_test = np_utils.to_categorical(y_test)

    return (x_train, x_test, y_train, y_test)

In [7]:
def nn_model(x_train, y_train, epochs, batch):
    
    # learning_rate = .1
    # sgd = SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
    error_fun = Adagrad()
    
    model = Sequential()

    model.add(Dense(128, input_shape=(x_train.shape[1],)))
    model.add(Activation('relu'))
    model.add(BatchNormalization())

    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))

    model.add(Dense(output_dim=2))
    model.add(Activation('softmax'))
    model.compile(class_mode='binary', loss='binary_crossentropy', optimizer=error_fun)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        verbose=0,
        mode='auto'
    )    
    
    hist = model.fit(
        x_train,
        y_train,
        nb_epoch=epochs,
        batch_size=batch,
        validation_split=0.1,
        show_accuracy=True,
        shuffle=True,
        callbacks=[early_stopping]
    )
    
    return model, hist

In [8]:
def nn_model2(x_train, y_train, epochs, batch):
    
    # learning_rate = .1
    # sgd = SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
    error_fun = Adagrad()
    
    model = Sequential()

    model.add(Dense(512, input_shape=(x_train.shape[1],)))
    model.add(Activation('linear'))
    model.add(BatchNormalization())

    model.add(Dense(128))
    model.add(Activation('linear'))
    model.add(Dropout(0.5))

    model.add(Dense(64))
    model.add(Activation('linear'))
    model.add(Dropout(0.5))
    
    model.add(Dense(output_dim=2))
    model.add(Activation('softmax'))
    model.compile(class_mode='binary', loss='binary_crossentropy', optimizer=error_fun)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        verbose=0,
        mode='auto'
    )    
    
    hist = model.fit(
        x_train,
        y_train,
        nb_epoch=epochs,
        batch_size=batch,
        validation_split=0.1,
        show_accuracy=True,
        shuffle=True,
        callbacks=[early_stopping]
    )
    
    return model, hist

In [9]:
def epochs_perf_plot(hist):
    """
    Create plot of model performance by epoch
    input: nn history object, # epochs
    returns bokeh line plot
    """
    epochs = len(hist.history['acc'])
    p = figure(title="Model Performance (Training Set)", plot_width=600, plot_height=600)

    p.line(x=range(0, epochs), y=hist.history['loss'],
           color="firebrick", line_width=4, legend="Loss")
    p.line(x=range(0, epochs), y=hist.history['acc'],
           color="navy", line_width=4, legend="Accuracy")
    
    p.legend.orientation = "bottom_left"
    p.xaxis.axis_label = "Epoch"
    
    show(p)

In [10]:
def undersample(X, class_column):
    """
    Undersamples a dataset to obtain equal number of classes from imbalanced data
    input: initial dataset as pandas dataframe, column with class labels as string
    """
    counts = X[class_column].value_counts(ascending=True)
    print("The frequency of each class: {}.".format(X[class_column].value_counts(normalize=True)))
    classes = pd.unique(X[class_column].ravel())
    l = []
    for value in classes:
        class_indices = X[X[class_column] == value].index
        random_index = random.sample(class_indices, counts[0])
        l.extend(random_index)
    return X.ix[l]

In [None]:
# Load data & split into testing & training set
train_df = load_data("train.csv")
x_train, x_test, y_train, y_test = train_test(train_df)

logging.debug("SHAPES: IN Train [{}], Test [{}]".format(x_train.shape, x_test.shape))
logging.debug("SHAPES: OUT Train [{}], Test [{}]".format(y_train.shape, y_test.shape))

# Create NN for 2-layer unidimensional regression
batch = 1024
epochs = 100

model, hist = nn_model(x_train, y_train, epochs, batch)

In [None]:
# Plot model training performance
epochs_perf_plot(hist)

In [None]:
# Test set predictions
predicted = model.predict(x_test)
logging.info("Predicted 0s/1s: {:.2%} {:.2%}".format(np.average(predicted[:, 0]), np.average(predicted[:, 1])))

score = model.evaluate(x_test, y_test, show_accuracy=True, batch_size=batch)

print('Test score (log loss): {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))

In [None]:
# Get Precision & Recall metrics on test set
precision, recall, thresholds = metrics.precision_recall_curve(y_test[:,1], predicted[:,1])
# Plot PRC
p = figure(title="Model Metrics (PRC)", plot_width=600, plot_height=600)

p.line(x=recall, y=precision, color="firebrick", line_width=4)
p.xaxis.axis_label = "Recall"
p.yaxis.axis_label = "Precision"
    
show(p)

In [None]:
# Get true positive rate & false positive rate
fpr, tpr, thresholds = metrics.roc_curve(y_test[:,1], predicted[:,1])
# Plot ROC
p = figure(title="Model Metrics (ROC)", plot_width=600, plot_height=600)

p.line(x=fpr, y=tpr, color="navy", line_width=4)
p.xaxis.axis_label = "False Positive Rate"
p.yaxis.axis_label = "True Positive Rate"
    
show(p)

In [None]:
# Compute AUC
auc = metrics.roc_auc_score(y_test[:,1], predicted[:,1])
print("The AUC score is: {}".format(auc))

In [None]:
# NN with undersampled data (# class 1 == # class 0)
data_df = load_data("train.csv")
data_df = undersample(data_df, "target")
x_train, x_test, y_train, y_test = train_test(train_df)

logging.debug("SHAPES: IN Train [{}], Test [{}]".format(x_train.shape, x_test.shape))
logging.debug("SHAPES: OUT Train [{}], Test [{}]".format(y_train.shape, y_test.shape))

model, hist = nn_model(x_train, y_train, epochs, batch)

In [None]:
# Plot undersampled model training performance
epochs_perf_plot(hist)

In [None]:
# Undersampled Test set predictions
predicted = model.predict(x_test)
logging.info("Predicted 0s/1s: {:.2%} {:.2%}".format(np.average(predicted[:, 0]), np.average(predicted[:, 1])))

score = model.evaluate(x_test, y_test, show_accuracy=True, batch_size=batch)

print('Test score (log loss): {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))

In [None]:
model3, hist3 = nn_model2(x_train2, y_train2, epochs, batch)

# Plot model 3 training performance
epochs_perf_plot(hist3)

# Test set 3 predictions
predicted3 = model3.predict(x_test2)
logging.info("Predicted 0s/1s: {:.2%} {:.2%}".format(np.average(predicted3[:, 0]), np.average(predicted3[:, 1])))

score3 = model3.evaluate(x_test2, y_test2, show_accuracy=True, batch_size=batch)

print('Test score (log loss): {}'.format(score3[0]))
print('Test accuracy: {}'.format(score3[1]))

In [None]:
train_df = remove_categorical(load_data("train.csv"))
train_df = undersample(train_df, "target")

x_train, x_test, y_train, y_test = train_test(train_df)

logging.debug("SHAPES: IN Train [{}], Test [{}]".format(x_train.shape, x_test.shape))
logging.debug("SHAPES: OUT Train [{}], Test [{}]".format(y_train.shape, y_test.shape))

model, hist = nn_model2(x_train, y_train, epochs, batch)
# Plot model training performance
epochs_perf_plot(hist)
# Test set predictions
predicted = model.predict(x_test)
logging.info("Predicted 0s/1s: {:.2%} {:.2%}".format(np.average(predicted[:, 0]), np.average(predicted[:, 1])))

score = model.evaluate(x_test, y_test, show_accuracy=True, batch_size=batch)

print('Test score (log loss): {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))
print(predicted)

In [None]:
train_df = load_data("train.csv")
train_df = undersample(train_df, "target")

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction import DictVectorizer

X = train_df.drop("target", axis=1).drop("v22", axis=1)
dv = DictVectorizer()

X = dv.fit_transform(X.T.to_dict().values())
y = train_df["target"].as_matrix()

lda = LinearDiscriminantAnalysis(n_components=5)

X = lda.fit_transform(X.toarray(), y)

train_df = pd.DataFrame(X)
train_df["target"] = y

x_train, x_test, y_train, y_test = train_test(train_df)

logging.debug("SHAPES: IN Train [{}], Test [{}]".format(x_train.shape, x_test.shape))
logging.debug("SHAPES: OUT Train [{}], Test [{}]".format(y_train.shape, y_test.shape))

In [None]:
# Run model on LDA output
model, hist = nn_model(x_train, y_train, epochs, batch)
# Plot model training performance
epochs_perf_plot(hist)
# Test set predictions
predicted = model.predict(x_test)
logging.info("Predicted 0s/1s: {:.2%} {:.2%}".format(np.average(predicted[:, 0]), np.average(predicted[:, 1])))

score = model.evaluate(x_test, y_test, show_accuracy=True, batch_size=batch)

print('Test score (log loss): {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))
print(predicted)

In [20]:
# Create SVM
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.cross_validation import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#train_df = load_data("train.csv")
#x_train, x_test, y_train, y_test = train_test(train_df)

train_df = load_data("train.csv")

X = train_df.drop("target", axis=1).drop("v22", axis=1)

X = pd.get_dummies(X).as_matrix()

y = train_df["target"].as_matrix()

lda = LinearDiscriminantAnalysis(n_components=10)

X = lda.fit_transform(X, y)


train_df = pd.DataFrame(X)
train_df["target"] = y

x_train, x_test, y_train_d, y_test_d = train_test(train_df)

y_train = y_train_d[:, 1]
y_test = y_test_d[:, 1]

# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors
C = 1.0  # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=C, cache_size=1024)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C, cache_size=1024)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C, cache_size=1024)
print("ok")

svc_score = cross_val_score(svc, x_train, y_train, cv=3)
svc_pred = svc.fit(x_train, y_train)
svc_loss = metrics.log_loss(y_test, svc_pred.predict(x_test))
print("Accuracy training SVC(linear): {} (+/- {})".format(svc_score.mean(), svc_score.std() * 2))
print("Accuracy test set SVC(linear): {}".format(svc_pred.score(x_test, y_test)))
print("Logloss test set SVC(linear): {}".format(svc_loss))

rbf_svc_score = cross_val_score(rbf_svc, x_train, y_train, cv=3)
rbf_svc_pred = rbf_svc.fit(x_train, y_train)
rbf_svc_loss = metrics.log_loss(y_test, rbf_svc_pred.predict(x_test))
print("Accuracy training SVC(rbf): {} (+/- {})".format(rbf_svc_score.mean(), rbf_svc_score.std() * 2))
print("Accuracy test set SVC(rbf): {}".format(rbf_svc.score(x_test, y_test)))
print("Logloss test set SVC(rbf): {}".format(rbf_svc_loss))
      
poly_svc_score = cross_val_score(poly_svc, x_train, y_train, cv=3)
poly_svc_pred = poly_svc.fit(x_train, y_train)
poly_svc_loss = metrics.log_loss(y_test, poly_svc_pred.predict(x_test))
print("Accuracy training SVC(poly): {} (+/- {})".format(poly_svc_score.mean(), poly_svc_score.std() * 2))
print("Accuracy test set SVC(poly): {}".format(poly_svc.score(x_test, y_test)))
print("Logloss test set SVC(poly): {}".format(poly_svc_loss))
      

print("done")

ok
Accuracy training SVC(linear): 0.761368235784 (+/- 2.81154472936e-05)
Accuracy test set SVC(linear): 0.760854583046
Logloss test set SVC(linear): 8.25998130209
Accuracy training SVC(rbf): 0.772596112158 (+/- 0.0014309906858)
Accuracy test set SVC(rbf): 0.773074272385
Logloss test set SVC(rbf): 7.83790277176
Accuracy training SVC(poly): 0.766538289118 (+/- 0.000574905518871)
Accuracy test set SVC(poly): 0.768223506335
Logloss test set SVC(poly): 8.00545855216
done


In [21]:
train_df = load_data("train.csv")

correlations = pd.DataFrame(train_df.corr())
print(correlations)
target_correl = correlations['target']
#print(target_correl.order())

from bokeh.charts import HeatMap, output_file, show
from bokeh.sampledata.unemployment1948 import data

# pandas magic
print(data)

df = data[data.columns[:-2]]
df2 = df.set_index(df[df.columns[0]].astype(str))
df2.drop(df.columns[0], axis=1, inplace=True)
df3 = df2.transpose()

output_file("cat_heatmap.html")

hm = HeatMap(df3, title="categorical heatmap", width=800)

show(hm)

          target        v1        v2        v4        v5        v6        v7  \
target  1.000000 -0.018843 -0.003288 -0.002915 -0.014713 -0.011823 -0.011511   
v1     -0.018843  1.000000  0.696461  0.779529  0.755411  0.818256  0.835146   
v2     -0.003288  0.696461  1.000000  0.917658  0.827596  0.856378  0.896183   
v4     -0.002915  0.779529  0.917658  1.000000  0.882361  0.946102  0.955000   
v5     -0.014713  0.755411  0.827596  0.882361  1.000000  0.891806  0.878692   
v6     -0.011823  0.818256  0.856378  0.946102  0.891806  1.000000  0.948777   
v7     -0.011511  0.835146  0.896183  0.955000  0.878692  0.948777  1.000000   
v8     -0.018114  0.478706  0.282083  0.412737  0.481223  0.471503  0.419458   
v9     -0.020321  0.811641  0.846393  0.889921  0.851149  0.923412  0.925890   
v10     0.147620  0.016779  0.043768  0.047789  0.039626  0.036445  0.031098   
v11    -0.017669  0.836102  0.888219  0.947135  0.902813  0.965504  0.962327   
v12     0.049938  0.030596  0.052794  0.