In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the training and test CSV files into DataFrames (training file first).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('input/train_data.csv', 'input/test_data.csv')
)

In [4]:
def seq_to_ascii(x):
    """Encode every character of every sequence as its offset from 'A'.

    Despite the function name, the values are not raw ASCII codes: each
    character ``c`` is mapped to ``ord(c) - ord('A')`` ('A' -> 0, 'B' -> 1, ...).

    Parameters
    ----------
    x : iterable of str
        The sequences to encode. Any iterable works (NumPy array, list,
        pandas Series, ...), not only objects exposing ``.shape``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from one list of integer codes per sequence; this
        is a 2-D integer array when all sequences have the same length.
    """
    base = ord('A')
    # Iterate over the sequences directly instead of indexing with
    # range(x.shape[0]) so that plain lists and Series with a non-default
    # index are handled as well.
    return np.array([[ord(c) - base for c in seq] for seq in x])

In [5]:
#importing library for oversampling
from imblearn.combine import SMOTETomek

In [6]:
# Raw training table as a NumPy array; column index 1 is the one that gets encoded.
tr = train_data.values
# Encode that column with seq_to_ascii and cast the result to float64 in one step.
x_train = seq_to_ascii(tr[:, 1]).astype(np.float64)
x_train.shape


(36891, 17)

In [7]:
# Target vector: the 'label' column of the training frame.
y_train = train_data.loc[:, 'label'].values
print(np.shape(y_train))

(36891,)


In [8]:
# Resample the training features/labels with SMOTETomek (fixed seed for repeatability).
# Only the training data is resampled here; the test data is not involved.
smk = SMOTETomek(random_state=41)
x_res, y_res = smk.fit_resample(X=x_train, y=y_train)

In [9]:
# Report the shapes of the resampled feature matrix and label vector.
print(*(arr.shape for arr in (x_res, y_res)))

(70174, 17) (70174,)


In [10]:
# Encode column index 1 of the test table with the same encoder used for the training data.
ts = test_data.to_numpy()
x_test = ts[:, 1]
test_x = seq_to_ascii(x_test)
print(test_x)

[[18  3 24 ... 10  3  6]
 [18 17 24 ... 10 13  2]
 [23 23 23 ... 16  5 21]
 ...
 [11  0  3 ... 23 23 23]
 [19 12  8 ...  4  8 19]
 [ 4  3  3 ...  6 17 21]]


# Support Vector Machine (Linear SVC) Model

In [11]:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Linear-kernel support vector classifier fitted on the resampled training set.
svm = SVC(C=0.01, kernel='linear', random_state=48)
svm.fit(X=x_res, y=y_res)

SVC(C=0.01, kernel='linear', random_state=48)

In [12]:
# Predict a label for every encoded test sequence, then show how many predictions were made.
preds = svm.predict(X=test_x)
np.shape(preds)

(9582,)

In [13]:
# Number of test samples predicted as class 1.
np.count_nonzero(preds == 1)

4851

In [14]:
# Load the sample submission file; this frame receives the predictions as `Label` before export.
sample_data = pd.read_csv('input/sample.csv')

In [15]:
# Store the current predictions in the 'Label' column and write the SVM submission file.
# Bracket assignment is used because attribute-style assignment only updates a
# column that already exists; otherwise pandas sets a plain attribute and the
# CSV would be written without the predictions.
sample_data['Label'] = preds
sample_data.to_csv('output/svm2_ascii.csv', index=False)

# Decision Tree Classifier

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [31]:
# Standardise the resampled training features.
# NOTE(review): x_transform is not referenced by any other cell visible here —
# the classifiers are fitted on x_res directly; confirm whether this step is still needed.
sc = StandardScaler().fit(x_res)
x_transform = sc.transform(x_res)

In [32]:
# Decision tree fitted on the resampled training set.
# random_state is pinned so the fitted tree (and the predictions exported from it)
# is reproducible across runs, in line with the seeded SVC and random-forest cells.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(x_res, y_res)

DecisionTreeClassifier()

In [36]:
# Predict test labels with the fitted classifier and report how many are class 1.
preds = clf.predict(test_x)
print(preds.tolist().count(1))
# Reload the sample submission so the export starts from the original template.
sample_data = pd.read_csv('input/sample.csv')
# Bracket assignment guarantees the predictions land in a real 'Label' column;
# attribute-style assignment would only set a plain attribute if the column
# did not already exist, leaving the CSV without predictions.
sample_data['Label'] = preds
sample_data.to_csv('output/DT2_output.csv', index=False)

785


# Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
# Random forest with tree depth capped at 2 and a fixed seed, fitted on the resampled training set.
clf = RandomForestClassifier(random_state=0, max_depth=2)
clf.fit(X=x_res, y=y_res)


RandomForestClassifier(max_depth=2, random_state=0)

In [43]:
# Predict test labels with the fitted classifier.
preds = clf.predict(test_x)
# Reload the sample submission so the export starts from the original template.
sample_data = pd.read_csv('input/sample.csv')
# Bracket assignment guarantees the predictions land in a real 'Label' column;
# attribute-style assignment would only set a plain attribute if the column
# did not already exist, leaving the CSV without predictions.
sample_data['Label'] = preds
sample_data.to_csv('output/RF2_output.csv', index=False)
# Report how many test samples were predicted as class 1.
print(preds.tolist().count(1))

5946


# K Nearest Neighbours

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
# k-nearest-neighbours classifier (k = 5) fitted on the resampled training set.
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X=x_res, y=y_res)

KNeighborsClassifier()

In [28]:
# Generate predictions from the encoded test matrix.
# test_x (the output of seq_to_ascii) is used here: x_test still holds the raw,
# un-encoded column of the test table, which does not match the numeric
# features the classifier was fitted on.
preds = clf.predict(test_x)


In [29]:
# Store the current predictions in the 'Label' column and write the KNN submission file.
# Bracket assignment is used because attribute-style assignment only updates a
# column that already exists; otherwise pandas sets a plain attribute and the
# CSV would be written without the predictions.
sample_data['Label'] = preds
sample_data.to_csv('output/KNN_5output.csv', index=False)

In [30]:
# Number of test samples predicted as class 1.
np.count_nonzero(preds == 1)

2859

# Artificial Neural Network [ANN]

In [31]:
# scaler to scale data
from sklearn.preprocessing import MinMaxScaler

In [32]:
# Fit a single MinMaxScaler on the resampled training features and reuse it for
# the test set, so both are mapped with the same per-feature min/max.
# The test input is the encoded matrix test_x; x_test holds the raw, un-encoded
# column of the test table and cannot be scaled numerically.
minmax_scaler = MinMaxScaler()
train_res_x = minmax_scaler.fit_transform(x_res)
test_res_x = minmax_scaler.transform(test_x)

In [33]:
#importing tensorflow for ANN model
import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.preprocessing import image

In [55]:
# Fully connected binary classifier: two 100-unit ReLU hidden layers followed
# by a single sigmoid output unit.
# The input width is read from the training matrix instead of being hard-coded
# (the previous fixed value of 20 did not match the 17 feature columns that
# x_res is reported to have after resampling).
model = tf.keras.Sequential([
    Dense(100, activation='relu', input_dim=x_res.shape[1]),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [56]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy as the reported metric.
# NOTE(review): the `decay` argument may be rejected by newer Keras optimizer
# implementations — confirm against the installed TensorFlow/Keras version.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [58]:
# Stop training once the training loss has failed to improve by at least
# min_delta for `patience` consecutive epochs, then restore the best weights.
# monitor is set to 'loss' explicitly: the callback's default is 'val_loss',
# which is never produced here because fit() receives no validation data, so
# with the default the callback could not trigger.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=10,
    restore_best_weights=True,
    min_delta=0.0005,
)
# batch_size equals the number of training rows, so every epoch is one
# full-batch gradient step.
history = model.fit(
    x_res, y_res,
    verbose= 2,
    batch_size= len(x_res),
    epochs= 100,
    callbacks=[early_stopping]
)

Epoch 1/100
1/1 - 0s - loss: 41.6703 - accuracy: 0.5000
Epoch 2/100
1/1 - 0s - loss: 37.9724 - accuracy: 0.5001
Epoch 3/100
1/1 - 0s - loss: 32.9573 - accuracy: 0.5001
Epoch 4/100
1/1 - 0s - loss: 27.0620 - accuracy: 0.5011
Epoch 5/100
1/1 - 0s - loss: 20.6759 - accuracy: 0.5168
Epoch 6/100
1/1 - 0s - loss: 14.4843 - accuracy: 0.5581
Epoch 7/100
1/1 - 0s - loss: 9.2817 - accuracy: 0.6067
Epoch 8/100
1/1 - 0s - loss: 6.0505 - accuracy: 0.6484
Epoch 9/100
1/1 - 0s - loss: 5.5491 - accuracy: 0.6649
Epoch 10/100
1/1 - 0s - loss: 7.1649 - accuracy: 0.6393
Epoch 11/100
1/1 - 0s - loss: 9.2241 - accuracy: 0.6055
Epoch 12/100
1/1 - 0s - loss: 10.6096 - accuracy: 0.5831
Epoch 13/100
1/1 - 0s - loss: 11.0500 - accuracy: 0.5733
Epoch 14/100
1/1 - 0s - loss: 10.6697 - accuracy: 0.5735
Epoch 15/100
1/1 - 0s - loss: 9.7065 - accuracy: 0.5814
Epoch 16/100
1/1 - 0s - loss: 8.4087 - accuracy: 0.5956
Epoch 17/100
1/1 - 0s - loss: 7.0043 - accuracy: 0.6152
Epoch 18/100
1/1 - 0s - loss: 5.6911 - accuracy:

In [75]:
# Predict on the encoded test matrix. test_x (the output of seq_to_ascii) is
# used because x_test holds the raw, un-encoded column of the test table and
# cannot be cast to float32.
preds = model.predict(test_x.astype(np.float32))

In [76]:
# Round the sigmoid outputs to the nearest integer value (still stored as floats).
preds = np.rint(preds)

In [77]:
# Convert the rounded predictions to 64-bit integers.
preds = preds.astype('int64')

In [79]:
# The network's predictions have shape (n_samples, 1); flatten them to 1-D
# before storing them as the 'Label' column. Bracket assignment is used because
# attribute-style assignment only updates a column that already exists.
sample_data['Label'] = np.ravel(preds)
sample_data.to_csv('output/seqNN_output.csv', index=False)

In [81]:
# Show the dimensions of the prediction array.
np.shape(preds)

(9582, 1)