In [1]:
import pandas as pd
import numpy as np

# Data Descriptions

In [3]:
names = pd.read_csv('../features_survey.csv')
names.head()

Unnamed: 0,Heart Rate Variability,CSF Tau Concentrations,Unconsciousness Duration Upon Injury,Head Injury Type,Time Since Injury,Arm Injury Severity,Acne Severity,Day to Day Happiness and Satisfaction,Disability Rating Score,Obesity Rating (How Obese is the Person)
0,variance in time between heartbeats. measured ...,Concentration of the Tau protein in cerebrospi...,How long a person is in an unresponsive state ...,"a head injury can either be open, where a vict...",time that has elapsed since the impact to the ...,arm pain that ranges from manageable to severe...,can be classified by number of eruptions on th...,how happy and satisfied one is with their life...,range of how much the disability affects quali...,how body fat is distributed and how much of so...
1,The amount of time between heart beats measure...,Amount of Tau detected in CSF measured by pico...,how long a person is in an unresponsive state ...,can range from external skull injury to intern...,how long it has beensince the initial head imp...,combination of how impacted or deformed the ar...,"combination of type of acne (whiteheads, black...",how happy or satisfied someone is with their a...,how impactful the disability is on quality of ...,"If your BMI is 18.5 to <25, it falls within th..."
2,The standard deviation of all NN intervals wit...,The concentration of Tau Filament in the cereb...,How long the patient is unconscious upon impac...,The location where the head injury occurred. E...,The number of days that have elapsed since rec...,A measure of how severe an arm injury is range...,A measure of the severity of the facial acne [...,A survey response for 10 questions on the pati...,A survey of the disability rating from the car...,An obesity rating based on the BMI range of th...
3,"The mean variance of the heart rate space, mea...",Tau Filament is measured by taking a sample of...,Length of time that the patient was unconsciou...,The classification of the head injury based on...,"How long, in hours, that have passed since the...",A measurement of the arm injury severity. Scal...,How severe the patient's acne appears. 1-10 fo...,A patient's indication 1-10 of their day to da...,Indicator for prior disabilities that the pati...,An indicator for prior history in the medical ...
4,HRV is a measure of the variability in time be...,A bio marker for neurological disorder such as...,Loss of consciousness for a given amount of ti...,Level of brain trauma or injury. Measured by d...,The amount of time needed to recover from a co...,Type of arm injury. Listed condition and severity,Acne Measured by grade 1-4. Count types of les...,Self reported on a scale 1-10. Questions about...,DRS is a percentage of your disability. 0% is ...,Body mass index (BMI) is a person's weight in ...


In [4]:
# Randomly shuffle each column of names
names = names.sample(frac=1).reset_index(drop=True)

# train on first 3 rows of patients
# test on the last row 
names_train = names.iloc[:3, :]
names_test = names.iloc[3:, :]

names_train.head()

Unnamed: 0,Heart Rate Variability,CSF Tau Concentrations,Unconsciousness Duration Upon Injury,Head Injury Type,Time Since Injury,Arm Injury Severity,Acne Severity,Day to Day Happiness and Satisfaction,Disability Rating Score,Obesity Rating (How Obese is the Person)
0,variance in time between heartbeats. measured ...,Concentration of the Tau protein in cerebrospi...,How long a person is in an unresponsive state ...,"a head injury can either be open, where a vict...",time that has elapsed since the impact to the ...,arm pain that ranges from manageable to severe...,can be classified by number of eruptions on th...,how happy and satisfied one is with their life...,range of how much the disability affects quali...,how body fat is distributed and how much of so...
1,"The mean variance of the heart rate space, mea...",Tau Filament is measured by taking a sample of...,Length of time that the patient was unconsciou...,The classification of the head injury based on...,"How long, in hours, that have passed since the...",A measurement of the arm injury severity. Scal...,How severe the patient's acne appears. 1-10 fo...,A patient's indication 1-10 of their day to da...,Indicator for prior disabilities that the pati...,An indicator for prior history in the medical ...
2,The standard deviation of all NN intervals wit...,The concentration of Tau Filament in the cereb...,How long the patient is unconscious upon impac...,The location where the head injury occurred. E...,The number of days that have elapsed since rec...,A measure of how severe an arm injury is range...,A measure of the severity of the facial acne [...,A survey response for 10 questions on the pati...,A survey of the disability rating from the car...,An obesity rating based on the BMI range of th...


# Data Sampling and Label Generation

In [5]:
# simulate data by sampling 1000 patients 
n_train = 10000
n_test = 1000

# Each patient has 10 features
# with feature values ranging from 0 to 1
# and a feature description pulled from the survey

# create a covariance matrix for 10 features
# with 0.5 correlation between random features
cov = np.random.rand(10, 10)
# make sure it's symmetric
cov = (cov + cov.T) / 2
# make sure it's positive definite
cov[np.diag_indices_from(cov)] = 1
# make sure it's not too correlated
cov = 0.5 * cov

# create a dataframe with 1000 rows and 10 columns
# with values sampled from a multivariate normal distribution
# with mean 0 and covariance matrix cov
X_train = pd.DataFrame(np.random.multivariate_normal(np.zeros(10), cov, n_train))
X_test = pd.DataFrame(np.random.multivariate_normal(np.zeros(10), cov, n_test))

  X_train = pd.DataFrame(np.random.multivariate_normal(np.zeros(10), cov, n_train))
  X_test = pd.DataFrame(np.random.multivariate_normal(np.zeros(10), cov, n_test))


In [7]:
# add a label column
# with values calculated from the first 3 features 
# by the equation y = 0.5 * x1 + 0.3 * x2 + 0.2 * x3
# and adding random noise
y_train = 0.5 * X_train.iloc[:, 0] + 0.3 * X_train.iloc[:, 1] + 0.2 * X_train.iloc[:, 2] + np.random.normal(0, 0.1, n_train)
y_test = 0.5 * X_test.iloc[:, 0] + 0.3 * X_test.iloc[:, 1] + 0.2 * X_test.iloc[:, 2] + np.random.normal(0, 0.1, n_test)

# use a threshold at the 80th percentile to create a binary label
y_train = (y_train > np.percentile(y_train, 50)).astype(int)
y_test = (y_test > np.percentile(y_test, 50)).astype(int)

# Testing Linear Regression to Learn Distribution (It Works Great!!)

In [8]:
# test linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

# train model
model = LinearRegression()

# fit model on training data
model.fit(X_train, y_train)

# predict on test data
y_pred = model.predict(X_test)

# convert predictions to binary labels
y_pred = (y_pred > np.percentile(y_pred, 50)).astype(int)

# calculate f1 score
results = accuracy_score(y_test, y_pred)
print(f"Accuracy of linear regression is {results}")

# test training model on X_train with random shuffle of values
# as baseline comparison

# shuffle values in each column
X_train_shuffled = X_train.apply(np.random.permutation)

# train model
model = LinearRegression()

# fit model on training data
model.fit(X_train_shuffled, y_train)

# predict on test data
y_pred = model.predict(X_test)

# convert predictions to binary labels
y_pred = (y_pred > np.percentile(y_pred, 80)).astype(int)

# calculate accuracy
results = accuracy_score(y_test, y_pred)
print(f"Accuracy on random shuffle is {results}")

Accuracy of linear regression is 0.948
Accuracy on random shuffle is 0.642


# Feature Concatenation

In [9]:
# add feature descriptions for each patient by:
# 1. sampling 10 random feature descriptions from the survey in names_train
# 2. replace the values in X_train with the sampled feature descriptions 
#       concatenated with the values in X_train rounded to 3 decimal places
for i in range(len(names_train.columns)):
    # apply function to each element individually
    # using feature name np.random.choice(names_train.iloc[:,i], 1)
    X_train.iloc[:, i] = X_train.iloc[:, i].apply(lambda x: "Feature is " + np.random.choice(names_train.iloc[:,i], 1)[0] + '. VAL is ' + str(round(x, 3)) + ". ")
    
# do the same for X_test
for i in range(len(names_test.columns)):
    X_test.iloc[:, i] = X_test.iloc[:, i].apply(lambda x: "Feature is " + np.random.choice(names_test.iloc[:,i], 1)[0] + '. VAL is ' + str(round(x, 3)) + ". ")

X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Feature is The standard deviation of all NN in...,Feature is Tau Filament is measured by taking ...,Feature is Length of time that the patient was...,Feature is The classification of the head inju...,Feature is The number of days that have elapse...,Feature is A measurement of the arm injury sev...,Feature is can be classified by number of erup...,Feature is A survey response for 10 questions ...,Feature is range of how much the disability af...,Feature is An obesity rating based on the BMI ...
1,Feature is variance in time between heartbeats...,Feature is The concentration of Tau Filament i...,Feature is How long the patient is unconscious...,Feature is The classification of the head inju...,"Feature is How long, in hours, that have passe...",Feature is A measurement of the arm injury sev...,Feature is How severe the patient's acne appea...,Feature is A survey response for 10 questions ...,Feature is A survey of the disability rating f...,Feature is An indicator for prior history in t...
2,Feature is The standard deviation of all NN in...,Feature is Concentration of the Tau protein in...,Feature is How long the patient is unconscious...,Feature is The location where the head injury ...,"Feature is How long, in hours, that have passe...",Feature is A measurement of the arm injury sev...,Feature is How severe the patient's acne appea...,Feature is how happy and satisfied one is with...,Feature is Indicator for prior disabilities th...,Feature is how body fat is distributed and how...
3,Feature is variance in time between heartbeats...,Feature is Tau Filament is measured by taking ...,Feature is How long the patient is unconscious...,Feature is The classification of the head inju...,"Feature is How long, in hours, that have passe...",Feature is A measure of how severe an arm inju...,Feature is can be classified by number of erup...,Feature is how happy and satisfied one is with...,Feature is Indicator for prior disabilities th...,Feature is An obesity rating based on the BMI ...
4,Feature is The mean variance of the heart rate...,Feature is Tau Filament is measured by taking ...,Feature is Length of time that the patient was...,Feature is The classification of the head inju...,Feature is The number of days that have elapse...,Feature is A measure of how severe an arm inju...,Feature is A measure of the severity of the fa...,Feature is how happy and satisfied one is with...,Feature is Indicator for prior disabilities th...,Feature is how body fat is distributed and how...


# RNN Dataset Creation

In [10]:
# for each patient, concatenate all the features in a random order 
# to create a single string
X_train = X_train.apply(lambda x: ' '.join(x.sample(frac=1)), axis=1)
X_test = X_test.apply(lambda x: ' '.join(x.sample(frac=1)), axis=1)

X_train.head()

0    Feature is A survey response for 10 questions ...
1    Feature is variance in time between heartbeats...
2    Feature is how happy and satisfied one is with...
3    Feature is Tau Filament is measured by taking ...
4    Feature is Length of time that the patient was...
dtype: object

In [11]:
X_train.iloc[0]

"Feature is A survey response for 10 questions on the patient's overall happiness and satisfaction in work and personal life. Responses were recorded between 1-6, with 1 being little satisfaction and extreme unhappiness, and 6 being well satisfied and happy.. VAL is -0.079.  Feature is Tau Filament is measured by taking a sample of the CSF within 24 hours of admitting using a BioGen Tau Assay.. VAL is 0.367.  Feature is An obesity rating based on the BMI range of the patient. Also weights in past prescriptions for obesity and progress current management strategies. . VAL is 0.223.  Feature is A measurement of the arm injury severity. Scaling with the expected time of recovery for the arm injury.. VAL is -0.279.  Feature is The standard deviation of all NN intervals within a 5 minute period taken using an electrocardiogram (EKG). Units in ms.. VAL is 0.808.  Feature is The classification of the head injury based on the location. 0 for hard skull, 1 for soft skull, 2 for critical areas (

In [12]:
# find the maximum length of any string in names_train
# of names_train columns
max_length = max([len(x) for x in names_train.iloc[:, 0]])

In [13]:
max_length

120

In [14]:
# combine X's and y's into a single dataframes
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
# save data to csv as test_even.csv and train_even.csv
train.to_csv('train_even.csv', index=False)
test.to_csv('test_even.csv', index=False)