In [43]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from collections import defaultdict
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [44]:
# Studies A-D are stacked into one training table; Study E is loaded as the test table.
datasets = ["Study_A.csv", "Study_B.csv", "Study_C.csv", "Study_D.csv"]
train_frames = [pd.read_csv(csv_path) for csv_path in datasets]
# Later cells index rows positionally, so both tables are kept as plain arrays.
train = pd.concat(train_frames).to_numpy()
test = pd.read_csv("Study_E.csv").to_numpy()

In [45]:
def featureExtractor(inpTens, testSet=False):
    """Slice the feature part out of one raw row.

    The first seven entries are always discarded. When ``testSet`` is falsy the
    final entry is discarded as well (presumably a training-only column that the
    test file lacks -- confirm against the CSV schema).

    Returns:
        A 2-D array with a single row holding the kept entries.
    """
    kept = inpTens[7:] if testSet else inpTens[7:-1]
    return np.array([kept])


def horizontal(inp, test=0):
    """Group raw rows into one stacked feature array per patient.

    Rows are bucketed by their third entry (``row[2]``, the patient key); each
    row is reduced with ``featureExtractor(row, test)`` before being stored.

    Args:
        inp: Iterable of raw rows.
        test: Forwarded to ``featureExtractor`` as its ``testSet`` flag.

    Returns:
        ``(sequences, patients)``: two parallel lists in first-seen patient
        order, where ``sequences[i]`` stacks the extracted rows of
        ``patients[i]`` along axis 0.
    """
    rows_by_patient = {}
    for row in inp:
        rows_by_patient.setdefault(row[2], []).append(featureExtractor(row, test))

    # dicts keep insertion order, so patients come out in first-seen order.
    patients = list(rows_by_patient)
    sequences = [np.concatenate(rows_by_patient[pid], axis=0) for pid in patients]
    return sequences, patients


# Group both tables per patient; the patient keys are only kept for the test table.
preProcessed, _ = horizontal(train)
preTrained, patientLabels = horizontal(test, True)

# Sanity check: number of test sequences and number of test patient keys.
print(len(preTrained), len(patientLabels), sep="\n")

513
513


In [46]:
def uniqueDays(inp):
    """Keep one row per distinct integer value of the first column.

    For every 2-D array in ``inp`` the first column is truncated to ``int``;
    for each distinct value only the first row carrying it is kept. Because the
    row indices come from ``np.unique``, the kept rows are ordered by ascending
    first-column value.

    Returns:
        A list with one de-duplicated array per input array.
    """
    deduped = []
    for visits in inp:
        _, first_seen = np.unique(visits[:, 0].astype(int), return_index=True)
        deduped.append(visits[first_seen])
    return deduped

def Processed(inp):
    """Resample each sequence onto five fixed windows plus its final row.

    A sequence is considered only if its very first entry equals 0 and it has
    more than one row. It is then cast to a float array and scanned once,
    front to back: for each window centre (35, 77, 119, 161, 203; half-width
    15, inclusive) the next not-yet-consumed row whose first-column value lies
    inside the window is selected. Sequences that cannot fill all five windows
    are dropped.

    Returns:
        A list of arrays with six rows each: the five selected rows followed by
        the sequence's last row.
    """
    window_centres = [35, 77, 119, 161, 203]
    half_width = 15
    kept = []
    for visits in inp:
        if not (visits[0][0] == 0 and len(visits) > 1):
            continue
        # The whole array (not just the first column) is cast to float here.
        visits = np.array(visits, dtype=float)
        total = len(visits)
        picked = []
        cursor = 0  # only ever moves forward, so a row is used for one window at most
        for centre in window_centres:
            lower, upper = centre - half_width, centre + half_width
            # Skip rows outside the current window.
            while cursor < total and (visits[cursor][0] < lower or visits[cursor][0] > upper):
                cursor += 1
            if cursor >= total:
                break
            picked.append(visits[cursor])
            cursor += 1
        if len(picked) == len(window_centres):
            # NOTE(review): if the last row itself fell inside the final window it
            # is both the fifth selected row and the appended final row -- confirm
            # that this overlap is intended where the final row is used as target.
            kept.append(np.vstack(picked + [visits[-1]]))
    return kept



# De-duplicate both collections, then resample the training sequences.
preTrained, preProcessed = uniqueDays(preTrained), uniqueDays(preProcessed)
processed = Processed(preProcessed)

# Report how many sequences each stage holds.
for collection in (preTrained, preProcessed, processed):
    print(len(collection))

513
2434
804


In [47]:
# Split every resampled series into its input rows and a scalar target.
X, y = [], []
for series in processed:
    X.append(series[:-1])      # everything except the appended final row
    y.append(series[-1, -1])   # last column of the final row

def doubleExponentialMean(inp, a=0.5, gamma=0.8):
    """Double exponential (level + trend) smoothing of each series' last column.

    For every 2-D series in ``inp`` the last column is smoothed with

        level_t = a * value_t + (1 - a) * (level_{t-1} + trend_{t-1})
        trend_t = gamma * (level_t - level_{t-1}) + (1 - gamma) * trend_{t-1}

    starting from ``level = value_0`` and ``trend = value_1 - value_0``.

    Args:
        inp: Iterable of 2-D array-likes (rows are observations); only the last
            column of each is used.
        a: Smoothing weight for the level.
        gamma: Smoothing weight for the trend.

    Returns:
        A list with the final smoothed level of every series. Note this is the
        level at the last observation, not a level-plus-trend forecast. A
        series with a single observation has no trend to estimate, so that
        observation is returned unchanged (previously this raised IndexError).

    Raises:
        IndexError: If a series has no rows.
    """
    predictions = []
    for series in inp:
        # asarray avoids a per-series copy (nothing is mutated) and accepts lists.
        values = np.asarray(series)[:, -1]
        level = values[0]
        if len(values) < 2:
            predictions.append(level)
            continue
        trend = values[1] - values[0]
        for value in values[1:]:
            previous = level
            level = a * value + (1 - a) * (level + trend)
            trend = gamma * (level - previous) + (1 - gamma) * trend
        predictions.append(level)
    return predictions

def inputs(inp):
    """Return the ``inp`` fractions ``0/inp, 1/inp, ..., (inp - 1)/inp`` as a list."""
    fractions = []
    for step in range(inp):
        fractions.append(step / inp)
    return fractions
# Exhaustive search over a 10 x 10 grid of smoothing weights (0.0 ... 0.9 each),
# keeping the pair with the lowest RMSE against y.
# NOTE(review): the grid built by inputs(10) never includes 1.0 -- confirm that is intended.
aVals = inputs(10)
gammaVals = inputs(10)
besta, bestgamma, bestVal = 0, 0, 1000000

for a in aVals:
    for gamma in gammaVals:
        yhat = doubleExponentialMean(X, a, gamma)
        mse = mean_squared_error(y, yhat)
        mae = mean_absolute_error(y, yhat)  # computed but not used for selection
        rmse = np.sqrt(mse)
        if not rmse < bestVal:
            continue
        besta, bestgamma, bestVal = a, gamma, rmse

for caption, value in (("Best alpha:", besta), ("Best gamma:", bestgamma), ("Best RMSE:", bestVal)):
    print(caption, value)


Best alpha: 0.9
Best gamma: 0.7
Best RMSE: 7.802027486192427


In [48]:
# Sequences with two or three rows, kept for inspection.
viewPretrained = [seq for seq in preTrained if 1 < len(seq) < 4]
ShortInds, LongInds, GoingtoModelInds = [], [], []
modelData, finalPANS = [], []

# Route every test sequence: fewer than five rows -> "short" (its last value is
# recorded as-is); otherwise "long", and additionally sent to the smoother when
# its last two rows have first-column values of at least 150 and 110.
for x, visits in enumerate(preTrained):
    if len(visits) < 5:
        ShortInds.append(x)
        finalPANS.append(visits[-1, -1])
        continue
    LongInds.append(x)
    if visits[-1, 0] >= 150 and visits[-2, 0] >= 110:
        GoingtoModelInds.append(x)
        modelData.append(visits)

# One smoothed value per entry of GoingtoModelInds, in the same order.
predictions = doubleExponentialMean(modelData, besta, bestgamma)

In [49]:
# Start from empty target / feature containers for this cell.
X, y = [], []
# Drop 0-d entries and sequences with fewer than two rows.
preProcessed = [seq for seq in preProcessed if not (seq.ndim == 0 or seq.shape[0] < 2)]

def generateInputs(inps):
    """Build one feature row per sequence from its first and last rows.

    Each sequence contributes the concatenation of its first row and its last
    row, both without their final column. A 1-D sequence is treated as a
    single-row sequence, so its one row is used twice.

    Returns:
        A 2-D array with one row per sequence.
    """
    feature_rows = []
    for visits in inps:
        if visits.ndim == 1:
            visits = visits[np.newaxis, :]
        # For a one-row sequence visits[-1] is visits[0], so no special case is needed.
        first, last = visits[0, :-1], visits[-1, :-1]
        feature_rows.append(np.hstack((first, last)))
    return np.vstack(feature_rows)

# Target: last column of the last row of every training sequence, as a column vector.
y = np.vstack([seq[-1, -1] for seq in preProcessed])

X = generateInputs(preProcessed)
Xtest = generateInputs(preTrained)
patientsLabels = np.vstack(patientLabels)

# NOTE(review): the regression prediction below is overwritten by the base model
# in the very next statement, so it does not reach the submission.
reg = LinearRegression().fit(X, y)
labels = reg.predict(Xtest)

# Base Model: row-wise sum of Xtest columns 32 onward (these come from the
# last-row half of each feature row built by generateInputs).
labels = np.sum(Xtest[:, 32:], axis=1)[:, np.newaxis]

# modifying base model: sequences routed to the smoother get its value instead.
labels = labels.squeeze().astype(float)
oldlabels = labels.copy()
# predictions is parallel to GoingtoModelInds (one value per routed sequence).
for routed_idx, smoothed in zip(GoingtoModelInds, predictions):
    labels[routed_idx] = smoothed
labels = labels[:, np.newaxis]

outputs = np.hstack((patientsLabels, labels))
out = pd.DataFrame(outputs, columns=["PatientID", "PANSS_Total"])
file2 = pd.read_csv('sample_submission_PANSS.csv')
out["PatientID"] = out["PatientID"].astype(int)

# Only patients listed in the sample submission are written out.
keys = list(file2['PatientID'])
selected_rows = out[out['PatientID'].isin(keys)]
selected_rows.to_csv('prototyp3e.csv', index=False)
