In [7]:
import pandas as pd
import numpy as np
import time

In [8]:
df = pd.read_csv('df_sml_1000_ids.csv')
df = df.drop('Unnamed: 0',axis=1)
df['Date'] = pd.to_datetime(df['Date'])
og_df = df
#df

In [9]:
# Get the start Dates of the treatment.
min_dates = df.groupby('Patient-Uid').agg({'Patient-Uid': 'first', 'Date': 'min'})
min_dates = min_dates.reset_index(drop=True)
min_dates = min_dates.rename(columns={'Date':'Start'})

# Get the dates on which the first dose of "TARGET DRUG" was given.
df_target_vals = df[df['Incident']=='TARGET DRUG']
df_target_vals = df_target_vals.groupby('Patient-Uid')['Date'].min()
df_target_vals = pd.DataFrame(df_target_vals).reset_index()
df_target_vals = df_target_vals.rename(columns={'Date':'End'})

# Merge the start days and first day of target drug.
df_start_end = min_dates.merge(df_target_vals, how='left', on='Patient-Uid')

# Get the average time between the start of the treatment and first dosage of target drug.
avg_time = (df_start_end['End']-df_start_end['Start']).mean()
print(f'Average Time: {avg_time}')

# Add the average time to the records which did not receive "TARGET DRUG"
df_start_end['End'] = df_start_end['End'].fillna(df_start_end['Start']+avg_time)

# Subtract 1 month from the date of "TARGET DRUG"
df_start_end['End'] = df_start_end['End'] - pd.DateOffset(months=1)
#df_start_end

#************************************

# All records before they received the "TARGET DRUG"
df = df.merge(df_start_end, how='left', on='Patient-Uid')
df = df[df['Date']<df['End']]
#df

Average Time: 1340 days 02:38:06.053412464


In [10]:
# Calculate the average time between the incidents.

start_time = time.time()

# Assuming you have a DataFrame named 'df' with columns 'Patient-Uid', 'Incident', and other columns

# Group the DataFrame by 'Patient-Uid' and 'Incident'
grouped = df.groupby(['Patient-Uid', 'Incident'])

# Initialize an empty dictionary to store the results
result_dict = {}

# Iterate over the groups
for group, group_df in grouped:
    uid, incident = group
    #print(group)
    #print(group_df)
    value = group_df['Date'].diff().mean()
    if uid in result_dict:
        result_dict[uid][incident] = value
    else:
        result_dict[uid] = {incident: value}

# Print the resulting dictionary
#print(result_dict)

end_time = time.time()
print(f'Time Elapsed: {end_time - start_time}')

df_avg_time = pd.DataFrame(result_dict).T.reset_index()

# Rename the columns, add "_avg_time" suffix.
col_names = {}
for i in df_avg_time.columns:
    if i == 'index':
        col_names[i] = 'Patient-Uid'
    else:
        col_names[i] = str(i)+'_avg_time'

df_avg_time = df_avg_time.rename(columns=col_names)

#df_avg_time

Time Elapsed: 3.1351683139801025


In [11]:
# Calculate the count of each incident
df_incident_count = pd.pivot_table(df,columns = df['Incident'], values='Incident', index='Patient-Uid', aggfunc='count').reset_index()

col_names = {}
for i in df_incident_count.columns:
    if i == 'Patient-Uid':
        continue
    col_names[i] = str(i)+'_count'

df_incident_count = df_incident_count.rename(columns=col_names)

#df_incident_count

In [12]:
df_combined = df_incident_count.merge(df_avg_time, how='left', on='Patient-Uid')
#df_combined

In [13]:
# Calculate which patients are eligible
df = og_df
tmp = pd.pivot_table(df,columns = df['Incident'], values='Incident', index='Patient-Uid', aggfunc='count').reset_index()
target_count = pd.DataFrame()
target_count['Patient-Uid'] = tmp['Patient-Uid']
target_count['TARGET DRUG'] = tmp['TARGET DRUG']
target_count['IS_ELIGIBLE'] = np.where(target_count['TARGET DRUG'] > 0, 1, 0)
target_count = target_count.drop('TARGET DRUG', axis = 1)
#target_count

In [14]:
df_final = df_combined.merge(target_count, how='left', on='Patient-Uid')
df_final

Unnamed: 0,Patient-Uid,DRUG_TYPE_0_count,DRUG_TYPE_1_count,DRUG_TYPE_10_count,DRUG_TYPE_11_count,DRUG_TYPE_12_count,DRUG_TYPE_13_count,DRUG_TYPE_14_count,DRUG_TYPE_15_count,DRUG_TYPE_16_count,...,DRUG_TYPE_14_avg_time,DRUG_TYPE_10_avg_time,SYMPTOM_TYPE_21_avg_time,DRUG_TYPE_16_avg_time,SYMPTOM_TYPE_23_avg_time,TEST_TYPE_5_avg_time,SYMPTOM_TYPE_25_avg_time,SYMPTOM_TYPE_26_avg_time,SYMPTOM_TYPE_22_avg_time,IS_ELIGIBLE
0,a0dc9543-1c7c-11ec-bb63-16262ee38c7f,1.0,37.0,,22.0,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,0
1,a0dcab04-1c7c-11ec-ac35-16262ee38c7f,13.0,12.0,,,,,,1.0,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,0
2,a0dcb11b-1c7c-11ec-b68c-16262ee38c7f,18.0,37.0,,,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,0
3,a0dcb20c-1c7c-11ec-bc7e-16262ee38c7f,12.0,66.0,,,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,0
4,a0dcb57a-1c7c-11ec-bcb7-16262ee38c7f,9.0,17.0,,,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,a0f0af14-1c7c-11ec-ac48-16262ee38c7f,,2.0,,,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,1
996,a0f0b365-1c7c-11ec-a59e-16262ee38c7f,15.0,10.0,,,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,1
997,a0f0bab2-1c7c-11ec-9f18-16262ee38c7f,12.0,5.0,,1.0,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,1
998,a0f0bf76-1c7c-11ec-ae56-16262ee38c7f,7.0,11.0,1.0,1.0,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,1


In [266]:
#df_final.to_pickle('df_final.pkl')

In [68]:
import pandas as pd
import numpy as np

avg_time_cols = ['Patient-Uid', 'DRUG_TYPE_0_avg_time', 'DRUG_TYPE_1_avg_time',
       'DRUG_TYPE_11_avg_time', 'DRUG_TYPE_2_avg_time', 'DRUG_TYPE_3_avg_time',
       'DRUG_TYPE_6_avg_time', 'DRUG_TYPE_7_avg_time',
       'PRIMARY_DIAGNOSIS_avg_time', 'SYMPTOM_TYPE_0_avg_time',
       'SYMPTOM_TYPE_15_avg_time', 'SYMPTOM_TYPE_6_avg_time',
       'SYMPTOM_TYPE_7_avg_time', 'SYMPTOM_TYPE_8_avg_time',
       'TEST_TYPE_0_avg_time', 'TEST_TYPE_1_avg_time', 'TEST_TYPE_2_avg_time',
       'DRUG_TYPE_15_avg_time', 'DRUG_TYPE_4_avg_time', 'DRUG_TYPE_5_avg_time',
       'DRUG_TYPE_8_avg_time', 'DRUG_TYPE_9_avg_time',
       'SYMPTOM_TYPE_4_avg_time', 'SYMPTOM_TYPE_1_avg_time',
       'SYMPTOM_TYPE_12_avg_time', 'SYMPTOM_TYPE_2_avg_time',
       'SYMPTOM_TYPE_3_avg_time', 'TEST_TYPE_3_avg_time',
       'SYMPTOM_TYPE_5_avg_time', 'SYMPTOM_TYPE_18_avg_time',
       'SYMPTOM_TYPE_14_avg_time', 'SYMPTOM_TYPE_29_avg_time',
       'SYMPTOM_TYPE_10_avg_time', 'DRUG_TYPE_13_avg_time',
       'SYMPTOM_TYPE_16_avg_time', 'SYMPTOM_TYPE_19_avg_time',
       'TEST_TYPE_4_avg_time', 'SYMPTOM_TYPE_11_avg_time',
       'SYMPTOM_TYPE_13_avg_time', 'DRUG_TYPE_12_avg_time',
       'SYMPTOM_TYPE_17_avg_time', 'SYMPTOM_TYPE_9_avg_time',
       'DRUG_TYPE_14_avg_time', 'DRUG_TYPE_10_avg_time',
       'SYMPTOM_TYPE_21_avg_time', 'DRUG_TYPE_16_avg_time',
       'SYMPTOM_TYPE_23_avg_time', 'TEST_TYPE_5_avg_time',
       'SYMPTOM_TYPE_25_avg_time', 'SYMPTOM_TYPE_26_avg_time',
       'SYMPTOM_TYPE_22_avg_time']

df_final = pd.read_pickle('df_final.pkl')

for i in avg_time_cols:
    if i == 'Patient-Uid':
        continue
    else:
        df_final[i] = df_final[i].dt.days.astype('float64')

In [78]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

data = df_final.fillna(-10)

X = data.drop(['Patient-Uid','IS_ELIGIBLE'], axis=1)  # Features (all columns except the target)
y = data['IS_ELIGIBLE']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()

# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)




rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
f1 = f1_score(y_test,y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("F1:", f1)

Accuracy: 0.745
F1: 0.4631578947368421


In [348]:
from sklearn.linear_model import LogisticRegression

# Assuming you have your features in X and the corresponding labels in y
# Initialize the logistic regression model
logreg = LogisticRegression()

# Fit the model to the training data
logreg.fit(X_train, y_train)

# Make predictions on new data
y_pred = logreg.predict(X_test)

f1 = f1_score(y_test,y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("F1:", f1)

Accuracy: 0.755
F1: 0.4948453608247424


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [85]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

f1 = f1_score(y_test,y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("F1:", f1)

Accuracy: 0.67
F1: 0.47619047619047616
