In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import scipy.stats as ss

In [2]:
# Show up to 50 columns when a DataFrame is rendered, so wide frames are not elided.
pd.set_option('display.max_columns', 50)

In [3]:
# Load the two input tables: the summary table and the event log for day 1.
summary, day1 = map(pd.read_csv, ('data/summary.csv', 'data/day 1.csv'))

In [4]:
# Peek at the first 20 rows of the day-1 event log.
day1.iloc[:20]

Unnamed: 0.1,Unnamed: 0,assessment,day,event,patient,time
0,0,,1,arrived,1,280
1,1,,1,arrived,2,288
2,2,,1,assessment initiated,1,308
3,3,,1,arrived,3,430
4,4,urgent|36.599999999999994|no pain,1,assessment concluded,1,741
5,5,,1,assessment initiated,2,764
6,6,,1,consultation_initiated,1,773
7,7,normal|36.699999999999996|severe pain,1,assessment concluded,2,851
8,8,,1,assessment initiated,3,878
9,9,,1,consultation_initiated,2,905


In [5]:
# Pearson correlation of each numeric column with the consultation duration.
# Numeric columns are selected explicitly: recent pandas releases no longer drop
# non-numeric columns (such as the string-valued 'priority' and 'pain') silently
# inside corr() and raise instead, while this form works on old and new versions.
summary.select_dtypes(include='number').corr()['duration']

Unnamed: 0                -0.043085
arrival_time              -0.034497
assessment_end_time       -0.066131
assessment_start_time     -0.065776
consultation_end_time      0.038430
consultation_start_time   -0.066377
day                       -0.044200
duration                   1.000000
patient                   -0.044809
temperature                0.298949
Name: duration, dtype: float64

In [None]:
# Pairwise scatter/histogram grid over the numeric columns of the summary table.
sns.pairplot(data=summary);

In [None]:
# Same pairwise grid for the day-1 event log.
sns.pairplot(data=day1);

In [None]:
# Compare the row count with the number of distinct values in the two id-like
# columns, to see whether either of them identifies a row uniquely.
print(summary.shape)
for id_col in ('Unnamed: 0', 'patient'):
    # dropna=False mirrors len(Series.unique()), which counts NaN as a value.
    print(summary[id_col].nunique(dropna=False))

In [None]:
# Peek at the first 20 rows of the summary table.
summary.iloc[:20]

In [None]:
# Patient number vs. arrival time, coloured by day.
# x and y are passed by keyword: positional data arguments are deprecated in
# seaborn 0.11 and rejected by 0.12+, while the keyword form works on every version.
sns.scatterplot(x='patient', y='arrival_time', hue='day', data=summary);

In [None]:
# For a single day (day 50), measure how strongly arrival time correlates with
# the patient number.
day_50 = summary.loc[summary['day'] == 50]
day_50['arrival_time'].corr(day_50['patient'])

In [None]:
# Working copy ordered chronologically: by day first, then by arrival time within the day.
df = summary.sort_values(by=['day', 'arrival_time'])

In [None]:
# Distribution of the raw consultation duration.
# NOTE(review): distplot is deprecated in newer seaborn releases; revisit when upgrading.
sns.distplot(a=df['duration']);

In [None]:
# Same distribution on a log scale, to see how the log transform reshapes it.
# NOTE(review): distplot is deprecated in newer seaborn releases; revisit when upgrading.
sns.distplot(df['duration'].pipe(np.log));

In [None]:
# Modelling target: natural log of the consultation duration.
df['log_consult_duration'] = df['duration'].pipe(np.log)

In [None]:
# Inspect column dtypes and non-null counts of the working frame.
df.info()

In [None]:
# Length of the assessment: end time minus start time.
df['assessment_duration'] = df['assessment_end_time'].sub(df['assessment_start_time'])

In [None]:
# Summary statistics of the recorded temperature, used to judge the bin edges for bucketing.
df.loc[:, 'temperature'].describe()

In [None]:
# Bucket temperature into three labelled bands. pd.cut uses right-closed
# intervals by default, so 36.5 falls in 'hypothermia' and 37.5 in 'normal'.
temp_edges = [-np.inf, 36.5, 37.5, np.inf]
temp_labels = ['hypothermia', 'normal', 'fever']
df['temp_cat'] = pd.cut(df['temperature'], bins=temp_edges, labels=temp_labels)

In [None]:
# One-hot encode the three categorical columns; each becomes a set of
# '<column>_<level>' indicator columns and the originals are removed.
categorical_cols = ['priority', 'pain', 'temp_cat']
df = pd.get_dummies(df, columns=categorical_cols)

In [None]:
# Display the encoded frame to check the new indicator columns.
df

In [None]:
# Build one indicator per (pain, temperature band, priority) combination by
# multiplying the three matching one-hot columns, then discard the individual
# one-hot columns so only the three-way interaction terms remain.
#
# The loops replace 18 hand-copied assignments. The nesting order
# (priority -> temperature band -> pain) reproduces the original column order exactly.
pain_levels = ('no pain', 'moderate pain', 'severe pain')
temp_levels = ('hypothermia', 'normal', 'fever')
priority_levels = ('normal', 'urgent')

for priority in priority_levels:
    for temp in temp_levels:
        for pain in pain_levels:
            interaction = f'pain_{pain}:temp_cat_{temp}:priority_{priority}'
            df[interaction] = (df[f'pain_{pain}']
                               * df[f'temp_cat_{temp}']
                               * df[f'priority_{priority}'])

# Drop the single-factor one-hot columns now that the interactions encode them.
one_hot_cols = ([f'pain_{pain}' for pain in pain_levels]
                + [f'temp_cat_{temp}' for temp in temp_levels]
                + [f'priority_{priority}' for priority in priority_levels])
df = df.drop(columns=one_hot_cols)

In [None]:
# Remove columns that must not be model inputs: the row/patient identifiers,
# the raw target ('duration', replaced by its log) and the two end-time columns.
cols_to_drop = [
    'Unnamed: 0',
    'assessment_end_time',
    'consultation_end_time',
    'duration',
    'patient',
]
df = df.drop(columns=cols_to_drop)

In [None]:
# Confirm the remaining columns and their dtypes before building the train/test split.
df.info()

In [None]:
# Chronological hold-out: days up to and including 45 are used for training,
# later days for testing. 'day' is only used for splitting and is not a feature.
train_rows = df['day'] <= 45
test_rows = df['day'] > 45

features = df.drop(columns=['day', 'log_consult_duration'])
target = df['log_consult_duration']

X_train, X_test = features[train_rows], features[test_rows]
y_train, y_test = target[train_rows], target[test_rows]

In [None]:
# Randomized search over the Ridge penalty, validated with forward-chaining
# time-series folds (training rows are ordered by day and arrival time).
tscv = TimeSeriesSplit(n_splits=5)

# `normalize=False` was dropped: it was the default and the parameter no longer
# exists in scikit-learn >= 1.2, where passing it raises a TypeError.
# Features are standardized explicitly below instead.
ridge = Ridge(random_state=42)

# alpha is sampled uniformly from [0, 1).
ridge_params = {'alpha': ss.uniform(0, 1)}

rscv = RandomizedSearchCV(ridge,
                          ridge_params,
                          n_iter=1000,
                          scoring='neg_root_mean_squared_error',
                          n_jobs=-1,
                          cv=tscv,
                          verbose=1,
                          random_state=42,
                          return_train_score=True)

# Standardize using statistics from the training rows only, then apply the
# same transform to the test rows.
# NOTE(review): the scaler is fitted on all training rows before cross-validation,
# so each validation fold influences the scaling of its own training fold.
# Wrapping scaler + Ridge in a Pipeline would remove that leak, but downstream
# cells predict on X_test_scaled and would need to change with it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rscv.fit(X_train_scaled, y_train)

In [None]:
# Keep the best estimator found by the randomized search and display it
# to show the selected hyperparameters.
lm = rscv.best_estimator_
lm

In [None]:
# Report train / validation / test RMSE for the selected model. The target is
# log consultation duration, so all three errors are in log units.
cv_results = pd.DataFrame(rscv.cv_results_)

# The search was scored with 'neg_root_mean_squared_error', so negate the mean
# fold scores of the best candidate to get positive RMSE values.
train_rmse = -cv_results.loc[rscv.best_index_, 'mean_train_score']
val_rmse = -cv_results.loc[rscv.best_index_, 'mean_test_score']

# mean_squared_error returns the MSE; take the square root so the held-out
# figure is a true RMSE and comparable with the two values above.
test_rmse = np.sqrt(mean_squared_error(y_test, lm.predict(X_test_scaled)))

print(f'train rmse: {train_rmse}')
print(f'val rmse: {val_rmse}')
print(f'test rmse: {test_rmse}')