In [41]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

In [42]:
df = pd.read_csv('datasets/cov-severity.csv')
df.head()

Unnamed: 0,Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing,Sore-Throat,None_Sympton,Pains,Nasal-Congestion,Runny-Nose,Diarrhea,...,Gender_Male,Gender_Transgender,Severity_Mild,Severity_Moderate,Severity_None,Severity_Severe,Contact_Dont-Know,Contact_No,Contact_Yes,Country
0,1,1,1,1,1,0,1,1,1,1,...,1,0,1,0,0,0,0,0,1,China
1,1,1,1,1,1,0,1,1,1,1,...,1,0,1,0,0,0,0,1,0,China
2,1,1,1,1,1,0,1,1,1,1,...,1,0,1,0,0,0,1,0,0,China
3,1,1,1,1,1,0,1,1,1,1,...,1,0,0,1,0,0,0,0,1,China
4,1,1,1,1,1,0,1,1,1,1,...,1,0,0,1,0,0,0,1,0,China


Preprocess plan:
1. Make the column names uniform.
2. Merge "Age" columns to one column "age".
3. Merge "Severity" columns to one column "severity".
4. Merge "Contact" columns to one column "contact" (or just dump contact_no, we'll see).
5. Remove transgender column. Why? Because health needs honesty.
6. Dump "Country" column, it's subjective..

Notes on columns label:
1. AGE COLUMN:
Age_0-9 = 1;
Age_10-19 = 2;
Age_20-24 = 3;
Age_25-59 = 4;
Age_60+ = 5

2. GENDER:
Male = 1;
Female = 2;

3. CONTACT:
No = 0;
Yes = 1;
Dont know = 2

4. SEVERITY:
No = 0;
Mild = 1;
Moderate = 2;
Severe = 3

In [43]:
# Does not mean to discriminate, but one should be honest about their health
# Male and female reactions differ, if one claim he/she is trans, we don't know the truth
# i.e. the truth is mixed!

df = df[df.Gender_Transgender == 0] 
df = df.drop(['Gender_Transgender', 'Country'], axis=1)
df.columns

Index(['Fever', 'Tiredness', 'Dry-Cough', 'Difficulty-in-Breathing',
       'Sore-Throat', 'None_Sympton', 'Pains', 'Nasal-Congestion',
       'Runny-Nose', 'Diarrhea', 'None_Experiencing', 'Age_0-9', 'Age_10-19',
       'Age_20-24', 'Age_25-59', 'Age_60+', 'Gender_Female', 'Gender_Male',
       'Severity_Mild', 'Severity_Moderate', 'Severity_None',
       'Severity_Severe', 'Contact_Dont-Know', 'Contact_No', 'Contact_Yes'],
      dtype='object')

In [44]:
old_cols = df.columns
new_cols = ['fever', 'tired', 'dry_cough', 'difficult_breathing', 
            'sore_throat', 'symptoms', 'pain', 'nc', 'rn', 
            'diarrhea', 'experiencing', '0_9', '10_19', '20_24', '25_59', '60+',
            'female', 'male', 'sev_mild', 'sev_moderate', 'sev_none', 'sev_severe',
            'contact_dont_know', 'contact_no', 'contact_yes']

col_dict = {}
for i in range(len(old_cols)):
    col_dict[old_cols[i]] = new_cols[i]

df = df.rename(columns=col_dict)

In [45]:
def convert_age(row):
    if (row['0_9'] == 1):
        return 1
    elif (row['10_19'] == 1):
        return 2
    elif (row['20_24'] == 1):
        return 3
    elif (row['25_59'] == 1):
        return 4
    else:
        return 5
    
def convert_gender(row):
    if (row['female'] == 1):
        return 2
    else:
        return 1

def convert_contact(row):
    if (row['contact_no'] == 1):
        return 0
    elif (row['contact_yes'] == 1):
        return 1
    else:
        return 2

# def convert_severity(row):
#     if (row['sev_none'] == 1):
#         return 0
#     elif (row['sev_mild'] == 1):
#         return 1
#     elif (row['sev_moderate'] == 1):
#         return 2
#     elif (row['sev_severe'] == 1):
#         return 3

def convert_severity(row):
    if (row['sev_none'] == 1):
        return 0
    else:
        return 1

In [46]:
df['age'] = df.apply(lambda x: convert_age(x), axis=1)
df['gender'] = df.apply(lambda x: convert_gender(x), axis=1)
df['contact'] = df.apply(lambda x: convert_contact(x), axis=1)
df['severity'] = df.apply(lambda x: convert_severity(x), axis=1)

In [59]:
cols = new_cols[:10]
# ext_cols = ['age', 'gender', 'contact', 'severity']
ext_cols = ['contact', 'severity']
cols.extend(ext_cols)
df1 = df[cols].copy()

In [60]:
df1.head()

Unnamed: 0,fever,tired,dry_cough,difficult_breathing,sore_throat,symptoms,pain,nc,rn,diarrhea,contact,severity
0,1,1,1,1,1,0,1,1,1,1,1,1
1,1,1,1,1,1,0,1,1,1,1,0,1
2,1,1,1,1,1,0,1,1,1,1,2,1
3,1,1,1,1,1,0,1,1,1,1,1,1
4,1,1,1,1,1,0,1,1,1,1,0,1


In [61]:
# Target should be integer
# Features can be integer/float/object. In this case all are objects
df1[df1.columns[:-1]] = df1[df1.columns[:-1]].astype('O')

In [62]:
X = df1[df1.columns[:-1]]
y = df1[df1.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
from sklearn.tree import DecisionTreeClassifier

In [63]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier()

In [64]:
print(classification_report(np.array(y_test), dtc.predict(X_test), zero_division=True))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00     10632
           1       0.75      1.00      0.86     31608

    accuracy                           0.75     42240
   macro avg       0.87      0.50      0.43     42240
weighted avg       0.81      0.75      0.64     42240

