In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from keras.callbacks import EarlyStopping

from keras.layers import advanced_activations
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, Conv1D, MaxPooling1D, BatchNormalization, Activation, Flatten
from sklearn import tree, base
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import *
from sklearn.svm import *
import scipy.stats

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


*Loading data*

In [4]:
# The first line of the training file is a header row; keep its tokens
# (presumably the column/word names -- TODO confirm) instead of parsing them
# as numbers.
with open("training_data.txt") as f:
    # split() with no argument drops the trailing newline from the last token
    # and tolerates repeated spaces, matching np.loadtxt's whitespace
    # delimiting used below.
    words = f.readline().split()

# Numeric rows start on the second line, hence skiprows=1.
training_data = np.loadtxt("training_data.txt", skiprows=1)
test_data = np.loadtxt("test_data.txt", skiprows=1)

# Column 0 of the training matrix is the label; the remaining columns are features.
unprocessed_x_train_original = training_data[:, 1:]
y_train_original = training_data[:, 0]

*Normalizing data with TfidfTransformer*

In [5]:
# Learn the idf weights from the raw training features and normalize the
# training matrix in a single step.
tf = TfidfTransformer()
x_train_tf = tf.fit_transform(unprocessed_x_train_original)

# Apply the idf weights learned on the training data to the test matrix.
x_test_tf = tf.transform(test_data)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


*Splitting data into training and validation sets*

In [6]:
# Split tf normalized data (80% train / 20% validation).
# A fixed random_state makes the split repeatable across kernel restarts;
# random_state=None drew a different split on every run.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_tf, y_train_original, test_size=0.2, random_state=42
)

In [7]:
# Split regular data (not tf-normalized) into 80% train / 20% validation.
# A fixed random_state makes the split -- and therefore the validation
# accuracies computed from it -- repeatable; random_state=None drew a
# different split on every run.
x_train_reg, x_val_reg, y_train_reg, y_val_reg = train_test_split(
    unprocessed_x_train_original, y_train_original, test_size=0.2, random_state=42
)

# Ensemble Model

In [8]:
def random_forest_validation(x_train, y_train, x_test):
    """
    Fit a random forest on the training data and classify x_test.

    The forest uses 200 Gini-criterion trees, each limited to depth 49.

    Parameters:
        x_train: training feature matrix.
        y_train: training labels.
        x_test: feature matrix to classify.

    Returns:
        The forest's predicted labels for x_test.
    """
    forest = RandomForestClassifier(n_estimators=200, criterion='gini', max_depth=49)
    forest.fit(x_train, y_train)
    return forest.predict(x_test)

In [9]:
def neural_network_validation(x_train, y_train, x_test):
    """
    Train a small fully-connected binary classifier and label x_test.

    Architecture: Dense(100)+ReLU -> Dense(30)+sigmoid -> Dense(30)+sigmoid
    -> Dense(1)+sigmoid, with 5% dropout after each hidden activation.
    The network is trained for 10 epochs with RMSprop on binary cross-entropy.

    Parameters:
        x_train: training feature matrix (indexable so that x_train[0] is one row).
        y_train: training labels.
        x_test: feature matrix to classify.

    Returns:
        1-D integer array with 1 where the predicted probability is >= 0.5, else 0.
    """
    n_features = len(x_train[0])

    # Layers listed in the order they are stacked.
    layer_stack = [
        Dense(100, input_shape=(n_features,)),
        Activation('relu'),
        Dropout(0.05),

        Dense(30),
        Activation('sigmoid'),
        Dropout(0.05),

        Dense(30),
        Activation('sigmoid'),
        Dropout(0.05),

        Dense(1),
        Activation('sigmoid'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    # Compile and train.
    network.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    network.fit(x_train, y_train, epochs=10, verbose=2)

    # Threshold the sigmoid outputs at 0.5 to obtain hard class labels.
    probabilities = network.predict(x_test, verbose=1).flatten()
    return (probabilities >= 0.5).astype(int)

In [10]:
# Inspect the (rows, columns) dimensions of the raw training feature matrix.
unprocessed_x_train_original.shape

(20000, 1000)

In [11]:
def SVC_validation(x_train_tf, y_train, x_test_tf):
    """
    Fit an RBF-kernel support vector classifier and classify x_test_tf.

    Parameters:
        x_train_tf: training feature matrix (the callers in this notebook
            pass TF-IDF-normalized features).
        y_train: training labels.
        x_test_tf: feature matrix to classify, normalized the same way.

    Returns:
        The classifier's predicted labels for x_test_tf.
    """
    svm_settings = dict(C=1, gamma=0.95, kernel='rbf', max_iter=30000)
    classifier = SVC(**svm_settings)
    classifier.fit(x_train_tf, y_train)
    return classifier.predict(x_test_tf)

In [12]:
def ensemble_model(x_train, y_train, x_test):
    """
    Train three classifiers and combine their predictions by majority vote.

    The ensemble members are:
        (1) SVC with RBF kernel (fed TF-IDF-normalized features)
        (2) Random Forest classifier (fed the features as given)
        (3) Neural network (fed the features as given)

    Parameters:
        x_train: raw (un-normalized) training feature matrix.
        y_train: training labels.
        x_test: raw feature matrix to classify.

    Returns:
        pandas.DataFrame with one row per x_test sample and the columns
        'y_val_neural', 'y_val_rf', 'y_val_SVC' (each model's prediction)
        and 'combined' (the row-wise most frequent prediction).
    """
    # Fit TF-IDF on the training rows passed in, not on a notebook-level
    # transformer: this keeps the function self-contained and ensures the idf
    # weights never see the rows that are being predicted.
    tfidf = TfidfTransformer().fit(x_train)
    x_train_tf = tfidf.transform(x_train)
    x_test_tf = tfidf.transform(x_test)

    y_val_nt = neural_network_validation(x_train, y_train, x_test)
    print("Neural Network finished")

    y_val_rf = random_forest_validation(x_train, y_train, x_test)
    print("Random Forest finished")

    y_val_SVC = SVC_validation(x_train_tf, y_train, x_test_tf)
    print("SVC finished")

    # Collect the three votes side by side, one row per sample.
    vote_columns = ['y_val_neural', 'y_val_rf', 'y_val_SVC']
    df_results = pd.DataFrame(
        {
            'y_val_neural': y_val_nt,
            'y_val_rf': y_val_rf,
            'y_val_SVC': y_val_SVC,
        },
        columns=vote_columns,
    )

    # Majority vote: take the row-wise mode over the vote columns only.
    # mode() returns a DataFrame (extra columns appear if a row has tied
    # modes), so select column 0 to obtain a single Series; with three voters
    # and two classes a tie cannot occur.
    df_results["combined"] = df_results[vote_columns].mode(axis=1)[0]

    return df_results

In [14]:
# Train on the un-normalized training split, predict the validation split, and
# report the fraction of validation labels each model (and the vote) gets right.
df_final = ensemble_model(x_train_reg, y_train_reg, x_val_reg)

n_val = len(y_val_reg)
for label, column in [
    ("Neural Network", "y_val_neural"),
    ("Random Forest", "y_val_rf"),
    ("SVC", "y_val_SVC"),
    ("Ensemble", "combined"),
]:
    n_correct = (np.array(df_final[column]) == y_val_reg).sum()
    print(label + " accuracy", (n_correct / n_val))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Epoch 1/10
 - 1s - loss: 0.4553 - acc: 0.7961
Epoch 2/10
 - 1s - loss: 0.3340 - acc: 0.8620
Epoch 3/10
 - 1s - loss: 0.3071 - acc: 0.8756
Epoch 4/10
 - 1s - loss: 0.2837 - acc: 0.8856
Epoch 5/10
 - 1s - loss: 0.2650 - acc: 0.8979
Epoch 6/10
 - 1s - loss: 0.2464 - acc: 0.9077
Epoch 7/10
 - 1s - loss: 0.2266 - acc: 0.9158
Epoch 8/10
 - 1s - loss: 0.2124 - acc: 0.9242
Epoch 9/10
 - 1s - loss: 0.1974 - acc: 0.9312
Epoch 10/10
 - 1s - loss: 0.1846 - acc: 0.9358
Neural Network finished
Random Forest finished
SVC finished
Neural Network accuracy 0.84475
Random Forest accuracy 0.82725
SVC accuracy 0.85275
Ensemble accuracy 0.8575


In [15]:
# Final run: train the ensemble on the full labeled set and predict test_data.
# This rebinds df_final, replacing the validation-run frame from the earlier cell.
df_final = ensemble_model(unprocessed_x_train_original, y_train_original, test_data)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Epoch 1/10
 - 1s - loss: 0.4295 - acc: 0.8046
Epoch 2/10
 - 1s - loss: 0.3285 - acc: 0.8631
Epoch 3/10
 - 1s - loss: 0.3096 - acc: 0.8754
Epoch 4/10
 - 1s - loss: 0.2872 - acc: 0.8855
Epoch 5/10
 - 1s - loss: 0.2668 - acc: 0.8969
Epoch 6/10
 - 1s - loss: 0.2490 - acc: 0.9061
Epoch 7/10
 - 1s - loss: 0.2329 - acc: 0.9141
Epoch 8/10
 - 1s - loss: 0.2180 - acc: 0.9208
Epoch 9/10
 - 1s - loss: 0.2047 - acc: 0.9281
Epoch 10/10
 - 1s - loss: 0.1914 - acc: 0.9338
Neural Network finished
Random Forest finished
SVC finished


In [16]:
# Preview only the first rows rather than displaying the whole prediction frame.
df_final.head(10)

Unnamed: 0,y_val_neural,y_val_rf,y_val_SVC,combined
0,1,1.0,1.0,1.0
1,1,1.0,1.0,1.0
2,0,0.0,0.0,0.0
3,0,0.0,0.0,0.0
4,0,1.0,0.0,0.0
5,0,0.0,0.0,0.0
6,1,1.0,1.0,1.0
7,1,1.0,1.0,1.0
8,1,1.0,1.0,1.0
9,0,0.0,0.0,0.0


In [17]:
# Column-wise totals of the prediction frame. If the labels are 0/1 (as the
# displayed rows suggest), each total is the number of samples that column
# assigned to class 1.
df_sum = df_final.sum()
df_sum

y_val_neural    5076.0
y_val_rf        5190.0
y_val_SVC       5167.0
combined        5139.0
dtype: float64

In [18]:
# For each model, count the samples where its prediction differs from the
# majority-vote column, i.e. where it was out-voted.
for label, column in (
    ("Neural net", "y_val_neural"),
    ("Random forest", "y_val_rf"),
    ("SVM", "y_val_SVC"),
):
    print(label + " classification rejected: ", (df_final[column] != df_final.combined).sum())

Neural net classification rejected:  409
Random forest classification rejected:  935
SVM classification rejected:  236


In [21]:
"""
Takes in a numpy array consisting of outputs for our test set and
outputs a file in the correct submission format (as per the sample
submission file)
"""
def make_submission_file(arr, fname = "submission.txt"):
    f = open(fname, 'w')
    f.write("Id,Prediction\n")
    for i in range(len(arr)):
        f.write(str(i + 1) + "," + str(int(arr[i])) + '\n')
    f.close()

In [22]:
# Write the ensemble's majority-vote column as the submission file.
make_submission_file(np.array(df_final.combined), fname = "submission_ensemble_majority_vote.txt")