In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split

In [2]:
# Load the three prepared tables (diagnoses, drugs, demographics) in one pass.
# NOTE(review): "dignoses.csv" looks like a misspelling of "diagnoses"; the path is
# kept as-is because the file on disk presumably carries this name — confirm.
diagnosis, drug, demographics = map(
    pd.read_csv,
    (
        "./../prepared datasets/dignoses.csv",
        "./../prepared datasets/drug.csv",
        "./../prepared datasets/demographics.csv",
    ),
)

In [3]:
# Discard the columns that are not carried into the merged dataset.
# Demographics: the onset / birth-date / first-visit columns are removed.
demographics = demographics.drop(
    columns=[
        "onset_time",
        "year_of_birth",
        "month_of_birth",
        "day_of_birth",
        "first_visit_date",
    ]
)

# `demographics` also has a "diabetes" column (see its preview); dropping the
# copy in `drug` keeps a single target column after the merge.
drug = drug.drop(columns=["diabetes"])
# Remove the column labelled "0" from the diagnosis table.
diagnosis = diagnosis.drop(columns=["0"])

In [4]:
# Preview the first five rows of the diagnosis table.
diagnosis.head(n=5)

Unnamed: 0,44783618,198683,260135,40482859,79916,133169,200774,200779,440408,440409,...,40490929,4179911,444382,436200,313324,77812,321526,444406,434170,person_id
0,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


In [5]:
# Preview the first five rows of the demographics table.
demographics.head(n=5)

Unnamed: 0,person_id,diabetes,gender,race,ethnicity
0,1,0,MALE,White,Not Hispanic or Latino
1,2,1,MALE,White,Not Hispanic or Latino
2,3,1,FEMALE,White,Not Hispanic or Latino
3,4,1,MALE,No matching concept,Hispanic or Latino
4,5,1,MALE,White,Not Hispanic or Latino


In [6]:
# Preview the first five rows of the drug table.
drug.head(n=5)

Unnamed: 0,person_id,drug_1,drug_10,drug_11,drug_12,drug_13,drug_14,drug_15,drug_16,drug_17,...,drug_4,drug_40,drug_41,drug_42,drug_43,drug_5,drug_6,drug_7,drug_8,drug_9
0,1,1,1,0,1,1,0,1,0,1,...,0,1,0,0,1,1,0,1,0,0
1,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,1,1,0,0,1,0,1,0,1,...,0,1,0,0,0,0,0,1,0,0
3,4,1,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Combine the three tables into one frame keyed on `person_id`.
# Inner joins keep only the people present in all three tables.
final_data = (
    demographics
    .merge(drug, on="person_id", how="inner")
    .merge(diagnosis, on="person_id", how="inner")
)

In [8]:
# Display the merged frame; the rendering elides the middle rows and columns.
final_data

Unnamed: 0,person_id,diabetes,gender,race,ethnicity,drug_1,drug_10,drug_11,drug_12,drug_13,...,141232,40490929,4179911,444382,436200,313324,77812,321526,444406,434170
0,1,0,MALE,White,Not Hispanic or Latino,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,1,MALE,White,Not Hispanic or Latino,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,FEMALE,White,Not Hispanic or Latino,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4,1,MALE,No matching concept,Hispanic or Latino,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,1,MALE,White,Not Hispanic or Latino,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,1099,1,FEMALE,Black or African American,Not Hispanic or Latino,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
915,1100,1,FEMALE,White,Not Hispanic or Latino,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
916,1101,1,FEMALE,White,Not Hispanic or Latino,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
917,1102,1,MALE,White,Not Hispanic or Latino,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Sorted distinct values of the gender column.
np.unique(final_data["gender"])

array(['FEMALE', 'MALE'], dtype=object)

In [10]:
# Sorted distinct values of the race column.
np.unique(final_data["race"])

array(['Black or African American', 'No matching concept', 'White'],
      dtype=object)

In [11]:
# Sorted distinct values of the ethnicity column.
np.unique(final_data["ethnicity"])

array(['Hispanic or Latino', 'Not Hispanic or Latino'], dtype=object)

In [12]:
# Trim leading/trailing whitespace from the ethnicity labels.
# The vectorised `.str.strip()` accessor replaces `.apply(lambda x: x.strip())`:
# it leaves missing values as NaN, whereas the lambda raises AttributeError
# as soon as it meets a non-string (e.g. NaN) entry.
final_data["ethnicity"] = final_data["ethnicity"].str.strip()

In [13]:
# (rows, columns) of the subset whose race is "No matching concept".
final_data[final_data["race"] == "No matching concept"].shape

(51, 375)

In [14]:
# (rows, columns) of the subset whose race is "Black or African American".
final_data[final_data["race"] == "Black or African American"].shape

(88, 375)

In [15]:
# (rows, columns) of the subset whose race is "White".
final_data[final_data["race"] == "White"].shape

(780, 375)

In [16]:
# One-hot encode `race` into one indicator column per category.
race_ohe = OneHotEncoder()
race_data = race_ohe.fit_transform(np.asarray(final_data["race"]).reshape(-1, 1))

# Derive the column labels from the categories the encoder actually learned
# (`categories_[0]` is in the same order as the output columns) instead of
# hard-coding them: a hand-written list silently mislabels the indicators if the
# set or order of race values in the data ever changes. For the current data
# this yields race_Black_or_African_American, race_No_matching_concept, race_White.
race_columns = ["race_" + str(category).replace(" ", "_") for category in race_ohe.categories_[0]]

# Reuse `final_data`'s index so a later column-wise concat aligns row-for-row
# even if `final_data` does not carry a default 0..n-1 index.
race_data = pd.DataFrame(race_data.toarray(), columns=race_columns, index=final_data.index)

In [17]:
# Swap the raw `race` column for its one-hot indicator columns
# (the indicators are appended as the right-most columns).
final_data = pd.concat(
    [final_data.drop(columns=["race"]), race_data],
    axis=1,
)

In [18]:
# Encode gender as a single 0/1 column (FEMALE -> 0, MALE -> 1 in the preview below).
# The binarizer's output is flattened to 1-D before being stored back in the column.
final_data["gender"] = LabelBinarizer().fit_transform(final_data["gender"]).ravel()

In [19]:
# Encode ethnicity as a single 0/1 column
# ("Hispanic or Latino" -> 0, "Not Hispanic or Latino" -> 1 in the preview below).
# The binarizer's output is flattened to 1-D before being stored back in the column.
final_data["ethnicity"] = LabelBinarizer().fit_transform(final_data["ethnicity"]).ravel()

In [20]:
# Preview the first five rows of the fully encoded dataset.
final_data.head(n=5)

Unnamed: 0,person_id,diabetes,gender,ethnicity,drug_1,drug_10,drug_11,drug_12,drug_13,drug_14,...,444382,436200,313324,77812,321526,444406,434170,race_Black_or_African_American,race_No_matching_concept,race_White
0,1,0,1,1,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0.0,0.0,1.0
1,2,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,1.0
2,3,1,0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0.0,0.0,1.0
3,4,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0.0,1.0,0.0
4,5,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,1.0


In [21]:
# Permutation and then split: separate the target from the features, then
# split into train/test partitions (33% held out for testing).
# `stratify=y` splits in a stratified fashion using the target as class labels;
# the fixed `random_state` makes the split reproducible.
y = final_data["diabetes"]
X = final_data.drop(columns=["diabetes"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=123,
    stratify=y,
)

In [28]:
# Re-attach the target to each partition as the last column.
# `y_train` / `y_test` are Series that share their index with `X_train` / `X_test`,
# so they can be concatenated directly. Re-wrapping them in
# `pd.DataFrame(..., columns=["diabetes"])` was redundant and fragile: it selects
# by the Series name, so a differently named Series would silently produce an
# all-NaN "diabetes" column. `rename` pins the column label explicitly instead.
train_data = pd.concat([X_train, y_train.rename("diabetes")], axis=1)
test_data = pd.concat([X_test, y_test.rename("diabetes")], axis=1)

In [30]:
# Fraction of positive (diabetes == 1) rows in the training set.
train_data["diabetes"].sum() / len(train_data)

0.8341463414634146

In [31]:
# Fraction of positive (diabetes == 1) rows in the test set.
test_data["diabetes"].sum() / len(test_data)

0.8355263157894737

In [32]:
# The diabetes ratio is nearly identical in the training (~0.834) and test (~0.836) sets, as expected from the stratified split.

In [33]:
# Persist both partitions next to the other prepared datasets.
# `index=False` keeps the row index out of the files.
train_data.to_csv(path_or_buf="./../prepared datasets/train.csv", index=False)
test_data.to_csv(path_or_buf="./../prepared datasets/test.csv", index=False)