In [70]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from keras.callbacks import EarlyStopping

from keras.layers import advanced_activations
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, Conv1D, MaxPooling1D, BatchNormalization, Activation, Flatten
from sklearn import tree, base
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import *
from sklearn.svm import *
import scipy.stats

*Loading data*

In [4]:
# The first line of the training file is a header (presumably one name per
# column -- TODO confirm); the remaining lines are numeric rows.
with open("training_data.txt") as f:
    # split() with no argument splits on any run of whitespace, matching how
    # np.loadtxt tokenises the rest of the file. split(' ') would leave the
    # trailing newline attached to the last token and emit empty strings for
    # repeated spaces.
    words = f.readline().split()
training_data = np.loadtxt("training_data.txt", skiprows=1)
test_data = np.loadtxt("test_data.txt", skiprows=1)

# Column 0 of the training matrix is the label; all remaining columns are features.
unprocessed_x_train_original = training_data[:, 1:]
y_train_original = training_data[:, 0]

*Normalizing data with TfidfTransformer*

In [27]:
# Learn the IDF weights from the raw training matrix and re-weight that same
# matrix in one call (fit_transform is fit followed by transform).
tf = TfidfTransformer()
x_train_tf = tf.fit_transform(unprocessed_x_train_original)

# The test matrix is re-weighted with the weights learned on the training data.
x_test_tf = tf.transform(test_data)

*Splitting data into training and validation sets*

In [28]:
# Split tf normalized data
# A fixed random_state makes this 80/20 split reproducible across kernel
# restarts; random_state=None drew a different validation set on every run.
# Seeding every split of this dataset with the same value also selects the
# same row positions each time.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_tf, y_train_original, test_size=0.2, random_state=42
)

In [29]:
# Split regular data (not tf-normalized)
# A fixed random_state makes this 80/20 split reproducible across kernel
# restarts; random_state=None drew a different validation set on every run.
# Seeding every split of this dataset with the same value also selects the
# same row positions each time.
x_train_reg, x_val_reg, y_train_reg, y_val_reg = train_test_split(
    unprocessed_x_train_original, y_train_original, test_size=0.2, random_state=42
)


# Neural Network

In [26]:
# Train and validate the neural network on the tf-normalized split.
# NOTE(review): `neural_network` is not defined in any cell visible in this
# notebook; unless it is defined elsewhere, this call fails with NameError on
# a fresh kernel -- TODO restore or import the definition.
neural_network(x_train, y_train, x_val, y_val)

Train on 16000 samples, validate on 4000 samples
Epoch 1/10
 - 3s - loss: 0.4938 - acc: 0.7647 - val_loss: 0.3393 - val_acc: 0.8568
Epoch 2/10
 - 2s - loss: 0.3311 - acc: 0.8613 - val_loss: 0.3410 - val_acc: 0.8575
Epoch 3/10
 - 2s - loss: 0.3219 - acc: 0.8662 - val_loss: 0.3458 - val_acc: 0.8548
Epoch 4/10
 - 2s - loss: 0.3140 - acc: 0.8698 - val_loss: 0.3488 - val_acc: 0.8552
Epoch 5/10
 - 2s - loss: 0.3082 - acc: 0.8732 - val_loss: 0.3516 - val_acc: 0.8548
Epoch 6/10
 - 2s - loss: 0.3044 - acc: 0.8764 - val_loss: 0.3543 - val_acc: 0.8508
Epoch 7/10
 - 2s - loss: 0.2986 - acc: 0.8801 - val_loss: 0.3564 - val_acc: 0.8480
Epoch 8/10
 - 2s - loss: 0.2940 - acc: 0.8801 - val_loss: 0.3587 - val_acc: 0.8472
Epoch 9/10
 - 2s - loss: 0.2869 - acc: 0.8847 - val_loss: 0.3615 - val_acc: 0.8468
Epoch 10/10
 - 2s - loss: 0.2818 - acc: 0.8871 - val_loss: 0.3640 - val_acc: 0.8425
Validation Accuracy: 84.250000


0.3640069779753685

In [25]:
# Train and validate the neural network on the regular (not tf-normalized) split.
# NOTE(review): `neural_network` is not defined in any cell visible in this
# notebook; unless it is defined elsewhere, this call fails with NameError on
# a fresh kernel -- TODO restore or import the definition.
neural_network(x_train_reg, y_train_reg, x_val_reg, y_val_reg)

Train on 16000 samples, validate on 4000 samples
Epoch 1/10
 - 3s - loss: 0.4442 - acc: 0.7957 - val_loss: 0.3600 - val_acc: 0.8430
Epoch 2/10
 - 2s - loss: 0.3316 - acc: 0.8622 - val_loss: 0.3604 - val_acc: 0.8430
Epoch 3/10
 - 2s - loss: 0.3093 - acc: 0.8756 - val_loss: 0.3641 - val_acc: 0.8458
Epoch 4/10
 - 2s - loss: 0.2891 - acc: 0.8863 - val_loss: 0.3701 - val_acc: 0.8430
Epoch 5/10
 - 2s - loss: 0.2688 - acc: 0.8969 - val_loss: 0.3754 - val_acc: 0.8442
Epoch 6/10
 - 2s - loss: 0.2490 - acc: 0.9084 - val_loss: 0.3859 - val_acc: 0.8433
Epoch 7/10
 - 2s - loss: 0.2335 - acc: 0.9167 - val_loss: 0.3953 - val_acc: 0.8397
Epoch 8/10
 - 2s - loss: 0.2161 - acc: 0.9255 - val_loss: 0.4052 - val_acc: 0.8387
Epoch 9/10
 - 2s - loss: 0.2020 - acc: 0.9314 - val_loss: 0.4129 - val_acc: 0.8400
Epoch 10/10
 - 2s - loss: 0.1877 - acc: 0.9378 - val_loss: 0.4261 - val_acc: 0.8380
Validation Accuracy: 83.800000


0.42610613971948624

# Random Forest

In [38]:
def classification_err(y, real_y):
    """
    Return the classification error between two equally-sized vectors of
    labels: the fraction of samples for which the labels differ.

    Inputs:
        y: (N, ) shaped array-like of predicted labels
        real_y: (N, ) shaped array-like of true labels
    Output:
        Scalar classification error as a Python float in [0, 1]
    Raises:
        ValueError: if the two vectors differ in length or are empty
    """
    predicted = np.asarray(y).ravel()
    actual = np.asarray(real_y).ravel()

    # A length mismatch used to be either silently ignored (extra true labels
    # were never looked at) or surfaced as an IndexError; fail loudly instead.
    if predicted.size != actual.size:
        raise ValueError(
            "y and real_y must have the same length, got %d and %d"
            % (predicted.size, actual.size)
        )
    if predicted.size == 0:
        raise ValueError("classification error is undefined for empty label vectors")

    # Count the positions where the labels disagree, then normalise by N.
    mismatches = np.count_nonzero(predicted != actual)
    return float(mismatches) / predicted.size

def random_forest_predict(clf, max_depth, X_train, y_train, X_test, y_test):
    """
    Fit a clone of the given classifier at a single max_depth and report its
    classification error on the training data and on the test data.

    Inputs:
        clf: classifier object to clone (e.g. a decision tree or random forest);
             it must accept a `max_depth` parameter via set_params
        max_depth: value assigned to the clone's `max_depth` parameter
        X_train: (N, D) matrix of training samples.
        y_train: (N, ) vector of training labels.
        X_test: (M, D) matrix of test samples
        y_test: (M, ) vector of test labels
    Output:
        train_err: scalar classification error on the training data
        val_err: scalar classification error on the test data
    """
    # Work on a clone so the estimator passed in by the caller is neither
    # reconfigured nor fitted by this function.
    model = base.clone(clf)
    model.set_params(max_depth=max_depth)
    model.fit(X_train, y_train)

    # Score the fitted clone on each (samples, labels) pair in turn.
    train_err = classification_err(model.predict(X_train), y_train)
    val_err = classification_err(model.predict(X_test), y_test)

    return train_err, val_err

In [42]:
# Random forest evaluated on the tf-normalized split.
n_estimators = 200
clf = RandomForestClassifier(criterion='gini', n_estimators=n_estimators)
train_err, test_err = random_forest_predict(
    clf, max_depth=49, X_train=x_train, y_train=y_train, X_test=x_val, y_test=y_val
)

# Accuracy is the complement of the classification error.
train_accuracy, test_accuracy = 1 - train_err, 1 - test_err

# One value per line: training accuracy first, then validation accuracy.
print(train_accuracy, test_accuracy, sep="\n")


0.992875
0.827


In [43]:
# Random forest evaluated on the regular (not tf-normalized) split.
n_estimators = 200
clf = RandomForestClassifier(criterion='gini', n_estimators=n_estimators)
train_err_reg, test_err_reg = random_forest_predict(
    clf,
    max_depth=49,
    X_train=x_train_reg,
    y_train=y_train_reg,
    X_test=x_val_reg,
    y_test=y_val_reg,
)

# Accuracy is the complement of the classification error.
train_accuracy_reg, test_accuracy_reg = 1 - train_err_reg, 1 - test_err_reg

# One value per line: training accuracy first, then validation accuracy.
print(train_accuracy_reg, test_accuracy_reg, sep="\n")

0.9859375
0.8342499999999999


# SVM

In [None]:
def random_forest_predict(clf, max_depth, X_train, y_train, X_test, y_test):
    """
    Fit a clone of the given classifier at a single max_depth and return its
    classification error on the training data and on the test data.

    NOTE(review): an earlier cell in this notebook already defines
    `random_forest_predict` with the same signature; running this cell rebinds
    the name, so this later definition is the one in effect afterwards.

    Inputs:
        clf: classifier object to clone (e.g. a decision tree or random forest);
             it must accept a `max_depth` parameter via set_params
        max_depth: value assigned to the clone's `max_depth` parameter
        X_train: (N, D) matrix of training samples.
        y_train: (N, ) vector of training labels.
        X_test: (M, D) matrix of test samples
        y_test: (M, ) vector of test labels
    Output:
        train_err: scalar classification error on the training data
        val_err: scalar classification error on the test data
    """

    # Work on a clone so the classifier passed in by the caller is neither
    # reconfigured nor fitted by this function
    clf_copy = base.clone(clf)
    clf_copy.set_params(max_depth= max_depth)
    clf_copy.fit(X_train, y_train)

    # Predict on both sets with the fitted clone
    train_predict = clf_copy.predict(X_train)
    test_predict = clf_copy.predict(X_test)

    # Fraction of mislabelled samples on each set
    train_err = classification_err(train_predict, y_train)
    val_err = classification_err(test_predict, y_test)

    return train_err, val_err #, test_predict

# Ensemble Model

In [78]:
def random_forest_validation(x_train, y_train, x_test,
                             n_estimators=200, max_depth=49, random_state=None):
    """
    Fit a gini random forest on the training data and predict labels for x_test.

    Inputs:
        x_train: matrix of training samples
        y_train: vector of training labels
        x_test: matrix of samples to predict labels for
        n_estimators: number of trees in the forest (default 200)
        max_depth: maximum depth of each tree (default 49)
        random_state: seed forwarded to RandomForestClassifier; the default
                      None keeps the forest unseeded, pass an int for
                      reproducible predictions
    Output:
        Vector of predicted labels, one per row of x_test
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion='gini',
        max_depth=max_depth,
        random_state=random_state,
    )
    forest.fit(x_train, y_train)

    return forest.predict(x_test)

In [52]:
def neural_network_validation(x_train, y_train, x_test, epochs=10):
    """
    Train a small fully-connected binary classifier and predict labels for x_test.

    The network is Dense(100, relu) -> Dense(30, sigmoid) -> Dense(30, sigmoid)
    -> Dense(1, sigmoid), with Dropout(0.05) after each hidden layer, trained
    with binary cross-entropy and the rmsprop optimizer.

    Inputs:
        x_train: 2-D matrix of training samples (rows are samples)
        y_train: vector of training labels
        x_test: 2-D matrix of samples to predict labels for
        epochs: number of training epochs (default 10)
    Output:
        1-D integer array of 0/1 predictions, one per row of x_test
    """
    # Take the feature count from the matrix shape. len(x_train[0]) raises for
    # scipy sparse matrices and returns the row count for a pandas DataFrame.
    n_features = np.shape(x_train)[1]

    model = Sequential()
    model.add(Dense(100, input_shape=(n_features,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.05))

    model.add(Dense(30))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.05))

    model.add(Dense(30))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.05))

    # Single sigmoid unit: the output is read as the probability of class 1.
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # compile network
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    # fit network (verbose=2 prints one line per epoch)
    model.fit(x_train, y_train, epochs=epochs, verbose=2)

    # Generate predictions and threshold the probabilities at 0.5
    probabilities = model.predict(x_test, verbose=1).flatten()
    return (probabilities >= 0.5).astype(int)

In [None]:
# Sanity check: dimensions of the raw training feature matrix (label column removed).
unprocessed_x_train_original.shape

In [68]:
def SVC_validation(x_train_tf, y_train, x_test_tf):
    """
    Fit an RBF-kernel SVC on the given training data and predict labels for
    x_test_tf.

    Inputs:
        x_train_tf: matrix of training samples
        y_train: vector of training labels
        x_test_tf: matrix of samples to predict labels for
    Output:
        Vector of predicted labels, one per row of x_test_tf
    """
    # max_iter caps the solver's iterations.
    svc = SVC(kernel='rbf', C=1, gamma=0.95, max_iter=20000)
    return svc.fit(x_train_tf, y_train).predict(x_test_tf)

In [None]:
def ensemble_model(x_train, y_train, x_test):
    """
    This function generates an ensemble with majority voting of the three models:
        (1) SVC with RBF Kernel (trained on TF-IDF weighted features)
        (2) Random Forest Classifier (trained on the features as given)
        (3) Neural network (trained on the features as given)

    Inputs:
        x_train: matrix of training samples (not tf-normalized)
        y_train: vector of training labels
        x_test: matrix of samples to predict labels for (not tf-normalized)
    Output:
        DataFrame with one row per test sample and the columns
        'y_val_neural', 'y_val_rf', 'y_val_SVC' (each model's prediction) and
        'combined' (the most common prediction in that row)
    """
    # Fit the TF-IDF weights on the training matrix passed in instead of
    # relying on a transformer that happens to exist as a notebook global.
    # For the same training matrix this yields the same weights.
    tfidf = TfidfTransformer().fit(x_train)
    x_train_tf = tfidf.transform(x_train)
    x_test_tf = tfidf.transform(x_test)

    y_val_nt = neural_network_validation(x_train, y_train, x_test)
    print("Neural Network finished")

    y_val_rf = random_forest_validation(x_train, y_train, x_test)
    print("Random Forest finished")

    y_val_SVC = SVC_validation(x_train_tf, y_train, x_test_tf)
    print("SVC finished")

    # Combining the y classification: one column per model
    vote_columns = ['y_val_neural', 'y_val_rf', 'y_val_SVC']
    df_results = pd.DataFrame({
        'y_val_neural': y_val_nt,
        'y_val_rf': y_val_rf,
        'y_val_SVC': y_val_SVC,
    })

    # Majority vote = row-wise mode over the three vote columns only.
    # mode(axis=1) returns a DataFrame with one column per tied mode; column 0
    # is the first mode. With three binary voters there is never a tie, so
    # this is the strict majority.
    df_results["combined"] = df_results[vote_columns].mode(axis=1)[0]

    return df_results

In [None]:
# Train all three models on the full training set and collect their
# predictions (plus the majority vote) for the test set.
df_final = ensemble_model(unprocessed_x_train_original, y_train_original, test_data)

Epoch 1/10
 - 4s - loss: 0.4265 - acc: 0.8121
Epoch 2/10
 - 2s - loss: 0.3304 - acc: 0.8628
Epoch 3/10
 - 2s - loss: 0.3110 - acc: 0.8729
Epoch 4/10
 - 2s - loss: 0.2893 - acc: 0.8860
Epoch 5/10
 - 2s - loss: 0.2683 - acc: 0.8972
Epoch 6/10
 - 2s - loss: 0.2508 - acc: 0.9051
Epoch 7/10
 - 2s - loss: 0.2325 - acc: 0.9168
Epoch 8/10
 - 2s - loss: 0.2157 - acc: 0.9231
Epoch 9/10
 - 2s - loss: 0.2020 - acc: 0.9284
Epoch 10/10
 - 2s - loss: 0.1838 - acc: 0.9361
Neural Network finished
Random Forest finished


In [73]:
# Display the per-model predictions next to the combined (majority) vote.
df_final

Unnamed: 0,y_val_neural,y_val_rf,y_val_SVC,combined
0,1,1.0,1.0,1.0
1,1,1.0,1.0,1.0
2,0,0.0,0.0,0.0
3,0,0.0,0.0,0.0
4,0,1.0,0.0,0.0
5,0,0.0,0.0,0.0
6,1,1.0,1.0,1.0
7,1,1.0,1.0,1.0
8,1,1.0,1.0,1.0
9,0,0.0,0.0,0.0


In [76]:
# Column-wise sums of the predictions; assuming the labels are 0/1, each sum
# is the number of test samples that column assigns to class 1.
df_sum = df_final.sum()
df_sum

y_val_neural    5236.0
y_val_rf        5157.0
y_val_SVC       5165.0
combined        5179.0
dtype: float64