In [1]:
import pandas as pd
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
import time

In [3]:
# Colab-only step: mount Google Drive at /content/drive, the root of the
# `maindir` path that the CSV files are later read from and written to.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Wall-clock start time; the last cell prints the seconds elapsed since here.
ts = time.time()

# Running count of absorbance columns removed by extract_feature().
# (The former module-level `global` statements were no-ops and were dropped:
# names assigned at notebook top level are already module globals.)
extracted_features = 0

# **Sample Extraction Method**

In [5]:
# We need to thin out the samples, because keeping 60 readings of the same
# sample led to over-fitting, as expected.

# The data is split into batches whose samples gave the same output labels.
# Each batch is then processed with its own epsilon value: starting from a
# pivot sample and moving outwards, samples are compared with each other
# feature by feature. If, for the two samples being compared, the change in
# at least 90 percent of the features is less than the epsilon value, the
# compared sample is removed because it is a duplicate.

def _zero_near_duplicates(reference_row, candidate_rows, absorbanceFeatures, data,
                          eps, similarity_percentage):
    """Zero every candidate row whose absorbance profile nearly equals the reference.

    A candidate counts as a duplicate when the number of features differing
    from the reference by less than ``eps`` reaches
    ``similarity_percentage * n_features``. Duplicates are overwritten with
    zeros in both ``data`` and ``absorbanceFeatures`` (in place).
    """
    n_features = absorbanceFeatures.shape[1]
    required_matches = similarity_percentage * n_features
    # Copy the reference so later in-place writes cannot alter it.
    reference = np.array(absorbanceFeatures[reference_row, :])

    for row in candidate_rows:
        if np.count_nonzero(data[row]) == 0:
            continue  # row was already cleared earlier
        if data[row, n_features] == 0:
            # Kept from the original logic: a zero in the first column after
            # the absorbance block exempts the row from removal.
            # NOTE(review): presumably a guard for cleared rows - confirm.
            continue
        n_close = np.count_nonzero(np.abs(absorbanceFeatures[row, :] - reference) < eps)
        # At least one real match is required, as in the original loop.
        if n_close > 0 and n_close >= required_matches:
            data[row] = np.zeros(data.shape[1])
            absorbanceFeatures[row] = np.zeros(n_features)


def extract_sample(hdl, ldl, hgl, absorbanceFeatures, data, sample_labels=None,
                   eps=0.0033, pivot=1 / 3, similarity_percentage=0.9):
    """Zero out near-duplicate samples inside one label batch.

    The batch consists of the rows whose label columns 0, 1 and 2 equal
    ``hdl``, ``ldl`` and ``hgl`` respectively (the match is purely positional).
    Starting at the pivot position of the batch, each surviving sample is used
    as a reference, first walking towards the start of the batch and then
    towards its end; samples further out that are near-duplicates of the
    reference are zeroed.

    Fixes relative to the previous version:
    * the backward pass now also examines the first sample of the batch
      (it used to stop at position 1, so that sample could never be removed);
    * row indices are range-checked before they are used for indexing;
    * the per-feature Python loop is replaced by one vectorised comparison.

    Parameters
    ----------
    hdl, ldl, hgl : values compared against label columns 0, 1 and 2.
    absorbanceFeatures : 2-D float array, one row per sample; modified in place.
    data : 2-D array with the same rows as ``absorbanceFeatures`` and at least
        one column after the absorbance block; modified in place.
    sample_labels : optional label matrix (rows x 3). When omitted, the
        module-level ``labels`` variable is used, as before.
    eps : maximum absolute per-feature difference counted as a match.
    pivot : fraction of the batch length at which the passes start.
    similarity_percentage : fraction of features that must match.

    Returns
    -------
    The same ``data`` array, with duplicate rows set to zero. Callers are
    expected to drop the all-zero rows afterwards.
    """
    if sample_labels is None:
        # Backward-compatible fallback to the notebook-level label matrix.
        sample_labels = labels

    # Only consider rows that exist in every array (check before indexing).
    n_rows = min(len(sample_labels), len(data), len(absorbanceFeatures))
    samples_with_same_labels = [
        row for row in range(n_rows)
        if sample_labels[row][0] == hdl
        and sample_labels[row][1] == ldl
        and sample_labels[row][2] == hgl
    ]
    if not samples_with_same_labels or absorbanceFeatures.shape[1] == 0:
        return data

    batch_size = len(samples_with_same_labels)
    pivot_position = max(0, min(int(batch_size * pivot), batch_size - 1))

    # Backward pass: references from the pivot towards the start of the batch.
    for position in range(pivot_position, 0, -1):
        reference_row = samples_with_same_labels[position]
        if np.count_nonzero(data[reference_row]) == 0:
            continue
        _zero_near_duplicates(reference_row,
                              reversed(samples_with_same_labels[:position]),
                              absorbanceFeatures, data, eps, similarity_percentage)

    # Forward pass: references from the pivot towards the end of the batch.
    for position in range(pivot_position, batch_size):
        reference_row = samples_with_same_labels[position]
        if np.count_nonzero(data[reference_row]) == 0:
            continue
        _zero_near_duplicates(reference_row,
                              samples_with_same_labels[position + 1:],
                              absorbanceFeatures, data, eps, similarity_percentage)

    return data


# **Feature Extraction Method**

In [6]:
def extract_feature(hdl, ldl, hgl, extracted_features, absorbanceFeatures, data, testdata,
                    sample_labels=None, eps=0.75, max_batch_size=60):
    """Drop absorbance columns that vary too much inside one label batch.

    The batch consists of the rows whose label columns 0, 1 and 2 equal
    ``hdl``, ``ldl`` and ``hgl`` respectively (the match is purely positional).
    For every absorbance column the spread (max - min) over the batch is
    computed; columns whose spread exceeds ``eps`` are removed from
    ``absorbanceFeatures``, ``data`` and ``testdata`` alike.

    Fixes relative to the previous version:
    * the spread is a true max - min (min/max used to start from 0, so for
      all-positive values the "difference" was just the maximum);
    * every offending column is removed (``i -= 1`` inside a ``for`` loop has
      no effect, so the column following a removed one used to be skipped);
    * the number of columns examined comes from ``absorbanceFeatures`` itself
      rather than from the ``absorbance_firstCount`` global.

    Parameters
    ----------
    hdl, ldl, hgl : values compared against label columns 0, 1 and 2.
    extracted_features : number of columns removed so far.
    absorbanceFeatures : 2-D float array, one row per training sample.
    data, testdata : 2-D arrays whose leading columns are assumed to be the
        same absorbance columns, in the same order, as ``absorbanceFeatures``;
        any trailing columns are kept untouched.
    sample_labels : optional label matrix (rows x 3). When omitted, the
        module-level ``labels`` variable is used, as before.
    eps : spread above which a column is removed.
    max_batch_size : batches with more samples than this are skipped.

    Returns
    -------
    ``(extracted_features, absorbanceFeatures, data, testdata)`` - the updated
    counter and the (possibly narrowed) arrays. The inputs are returned as-is
    when nothing is removed.
    """
    if sample_labels is None:
        # Backward-compatible fallback to the notebook-level label matrix.
        sample_labels = labels

    samples_with_same_labels = [
        row for row in range(len(sample_labels))
        if sample_labels[row][0] == hdl
        and sample_labels[row][1] == ldl
        and sample_labels[row][2] == hgl
    ]

    unchanged = (extracted_features, absorbanceFeatures, data, testdata)
    if not samples_with_same_labels or len(samples_with_same_labels) > max_batch_size:
        return unchanged

    batch = absorbanceFeatures[samples_with_same_labels]
    spread = batch.max(axis=0) - batch.min(axis=0)
    unstable = spread > eps
    n_unstable = int(np.count_nonzero(unstable))
    if n_unstable == 0:
        return unchanged

    n_features = absorbanceFeatures.shape[1]

    def _without_unstable(matrix):
        # Keep every trailing (non-absorbance) column; filter the leading ones.
        keep = np.ones(matrix.shape[1], dtype=bool)
        keep[:n_features] = ~unstable
        return matrix[:, keep]

    return (extracted_features + n_unstable,
            absorbanceFeatures[:, ~unstable],
            _without_unstable(data),
            _without_unstable(testdata))



In [7]:
# Folder on the mounted Drive that holds Train.csv and Test.csv.
maindir = "/content/drive/MyDrive/Colab Notebooks/" # Directory with your files
traincsv = f"{maindir}/Train.csv"
testcsv = f"{maindir}/Test.csv"

# Only the training file is read here; the test file is loaded in a later cell.
train = pd.read_csv(filepath_or_buffer=traincsv)

In [8]:
# The reading identifier is not a model input; drop the column.
train = train.drop(columns=["Reading_ID"])

In [9]:
# Whole training table as one array; the last three columns are the labels.
data = train.to_numpy()
# Everything except the three label columns, as floats.
features = data[:, :-3].astype(float)

In [10]:
# All columns except the last five, as floats.
absorbanceFeatures = data[:, :-5].astype(float)
# Remember how many absorbance columns existed before any feature extraction.
_, absorbance_firstCount = absorbanceFeatures.shape
test = pd.read_csv(filepath_or_buffer=testcsv)

In [11]:
# Keep the reading ids for the submission file before dropping the column.
idx = test["Reading_ID"].values
test = test.drop(columns=["Reading_ID"])
testdata = test.to_numpy()
testdatarows = len(testdata)

In [12]:
# Independent copy of the three label columns (the last three columns of data).
labels = data[:, -3:].copy()

# **Feature Extraction**

In [13]:
targets = ['low', 'ok', 'high']

# Run extract_feature once for each of the 27 label combinations, feeding the
# narrowed arrays and the running counter back in on every call.
for first_level in targets:
    for second_level in targets:
        for third_level in targets:
            extracted_features, absorbanceFeatures, data, testdata = extract_feature(
                first_level, second_level, third_level,
                extracted_features, absorbanceFeatures, data, testdata,
            )

# **Sample Extraction**

In [14]:
# Visit the 27 label combinations in reverse order ('high' first).
for first_level in reversed(targets):
    for second_level in reversed(targets):
        for third_level in reversed(targets):
            data = extract_sample(first_level, second_level, third_level, absorbanceFeatures, data)

            # Remove the rows extract_sample zeroed out, then rebuild the
            # arrays derived from data so the next call sees the reduced set.
            cleared_rows = np.all(data == 0, axis=1)
            data = data[~cleared_rows]
            absorbanceFeatures = data[:, :-5].astype(float)
            labels = data[:, -3:].copy()

In [15]:
# Replace the text labels with numeric class codes wherever they occur in data.
for level_name, level_code in (('low', 0), ('ok', 1), ('high', 2)):
    data[data == level_name] = level_code

In [16]:
# Convert everything to float and keep only distinct rows.
data = np.unique(data.astype(float), axis=0)

In [17]:
# Model inputs: every column except the three label columns.
features = data[:, :-3].copy()

In [18]:
# Model targets: the three label columns, now numeric.
labels = data[:, -3:].copy()

In [19]:
# Using a separate regularization parameter (C) for each support
# vector machine increased the accuracy of the predicted outputs
# substantially. With lower values of C the margin becomes larger,
# and we observed an increased proportion of misclassified samples.

# With these settings (C=1e3, C=1e5, C=1e5) we achieved the highest
# accuracy among the models that we trained.
# Using lower C values on the SVMs tends to cause misclassification on the second and third labels.

# random_state=42, because it's the answer to everything :)

# One-vs-rest SVM for the first label column (C=1e3).
model = OneVsRestClassifier(
    SVC(C=1e3, random_state=42),
    n_jobs=-1,
)

In [20]:
# Fit the model on the first label column and predict it for the test data.
# (The fit used to be issued twice in a row with identical arguments; the
# second call only retrained the same model, so it was removed.)
model.fit(features, labels[:, 0])
prediction_label1_zeros = np.zeros((testdatarows, 1), dtype=float)
prediction_label1 = model.predict(testdata)

In [21]:
# Store the first label's predictions as a single-column 2-D array.
prediction_label1_zeros[:, 0] = prediction_label1[:testdatarows]

In [22]:
# One-vs-rest SVM for the second label column (C=1e5).
model = OneVsRestClassifier(
    SVC(C=1e5, random_state=42),
    n_jobs=-1,
)

In [23]:
# Allocate the output column, then fit on the second label column and predict.
prediction_label2_zeros = np.zeros(shape=(testdatarows, 1), dtype=float)
model.fit(features, labels[:, 1])
prediction_label2 = model.predict(testdata)

In [24]:
# Store the second label's predictions as a single-column 2-D array.
prediction_label2_zeros[:, 0] = prediction_label2[:testdatarows]

In [25]:
# One-vs-rest SVM for the third label column (C=1e5).
model = OneVsRestClassifier(
    SVC(C=1e5, random_state=42),
    n_jobs=-1,
)

In [26]:
# Allocate the output column, then fit on the third label column and predict.
prediction_label3_zeros = np.zeros(shape=(testdatarows, 1), dtype=float)
model.fit(features, labels[:, 2])
prediction_label3 = model.predict(testdata)

In [27]:
# Store the third label's predictions as a single-column 2-D array.
prediction_label3_zeros[:, 0] = prediction_label3[:testdatarows]

In [28]:
# Put the three single-column prediction arrays side by side: one row per
# test sample, one column per label.
predictions = np.hstack([
    prediction_label1_zeros,
    prediction_label2_zeros,
    prediction_label3_zeros,
])

In [29]:
# Wide submission table: one row per reading, one column per predicted label.
ss = pd.DataFrame({
    "Reading_ID": idx,
    "hdl_cholesterol_human": predictions[:, 0],
    "hemoglobin(hgb)_human": predictions[:, 1],
    "cholesterol_ldl_human": predictions[:, 2],
})

In [30]:
def transform(value):
    """Translate a numeric class code back into its text label.

    0.0 becomes 'low', 1.0 becomes 'ok', and every other value becomes 'high'.
    """
    if value == 0.0:
        return 'low'
    return 'ok' if value == 1.0 else 'high'

In [31]:
# Turn the numeric predictions in each label column back into text labels.
for label_column in ("hdl_cholesterol_human", "hemoglobin(hgb)_human", "cholesterol_ldl_human"):
    ss[label_column] = ss[label_column].apply(transform)

In [32]:
def transform_c_hdl(row):
    """Build '<Reading_ID>_hdl_cholesterol_human-<label>' from one row of the table."""
    pieces = [str(row["Reading_ID"]), "_hdl_cholesterol_human", "-", row["hdl_cholesterol_human"]]
    return "".join(pieces)

In [33]:
# Apply transform_c_hdl to every row of ss (axis=1) and wrap the resulting
# strings in a DataFrame.
hdl_rows = pd.DataFrame(ss.apply(transform_c_hdl, axis=1))

In [34]:
def transform_hemo(row):
    """Build '<Reading_ID>_hemoglobin(hgb)_human-<label>' from one row of the table."""
    pieces = [str(row["Reading_ID"]), "_hemoglobin(hgb)_human", "-", row["hemoglobin(hgb)_human"]]
    return "".join(pieces)

In [35]:
# Apply transform_hemo to every row of ss (axis=1) and wrap the resulting
# strings in a DataFrame.
hemo_rows = pd.DataFrame(ss.apply(transform_hemo, axis=1))

In [36]:
def transform_c_ldl(row):
    """Build '<Reading_ID>_cholesterol_ldl_human-<label>' from one row of the table."""
    pieces = [str(row["Reading_ID"]), "_cholesterol_ldl_human", "-", row["cholesterol_ldl_human"]]
    return "".join(pieces)

In [37]:
# Apply transform_c_ldl to every row of ss (axis=1) and wrap the resulting
# strings in a DataFrame.
ldl_rows = pd.DataFrame(ss.apply(transform_c_ldl, axis=1))

In [38]:
# Stack the three per-label frames into one long frame with a fresh 0..n-1 index.
ss = pd.concat([hdl_rows, hemo_rows, ldl_rows], ignore_index=True)

In [39]:
# Column 0 holds "<Reading_ID>_<label column>-<label>". Split on the LAST
# hyphen only: the label written by transform() ('low' / 'ok' / 'high') never
# contains a hyphen, whereas the id part might, and a plain split("-") would
# then truncate the id and return the wrong piece as the target.
ss["target"] = ss[0].apply(lambda text: text.rsplit("-", 1)[1])
ss[0] = ss[0].apply(lambda text: text.rsplit("-", 1)[0])

In [40]:
# Give the unnamed column 0 the header used in the submission file.
ss = ss.rename({0: "Reading_ID"}, axis="columns")

In [41]:
# Write the submission next to the input files, without the row index.
ss.to_csv(f"{maindir}/submission.csv", index=False)

In [42]:
# Seconds elapsed since `ts` was recorded near the top of the notebook.
print(time.time() - ts)

173.49046921730042
