# Data Preprocessing
## Chase Ivancic - 2788040

__Dataset:__ train.csv

__Columns:__ Internet Use, FitnessGram, Physical Activity Questionnaire (PAQ)

### Step 1: Import libraries, import data, and narrow data to necessary columns

Print head of final dataframe to confirm the selection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
pd.set_option('display.max_columns', None)

In [3]:
data_filepath = r"../data/train.csv"
data = pd.read_csv(data_filepath)
mydata = data[['PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday', 'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 
                          'Fitness_Endurance-Time_Sec', 'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 
                         'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 
                         'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_C-PAQ_C_Total', 'sii']]
display(mydata)

Unnamed: 0,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,sii
0,Fall,3.0,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,,,,,2.0
1,Summer,0.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,,,Fall,2.340,0.0
2,Summer,2.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,Summer,2.170,0.0
3,Winter,0.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,,,Winter,2.451,1.0
4,,,,,,,,,,,,,,,,,,,,,,Summer,1.04,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,Fall,1.0,,,,,Fall,16.0,0.0,18.0,1.0,19.9,2.0,10.0,1.0,8.0,1.0,9.0,1.0,12.0,1.0,,,Winter,3.260,1.0
3956,Winter,0.0,,,,,Spring,0.0,0.0,,,,,4.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,,,Winter,2.340,
3957,Fall,0.0,,,,,Winter,15.0,1.0,18.5,2.0,15.8,2.0,0.0,0.0,10.0,1.0,10.0,1.0,14.0,1.0,,,Winter,2.729,1.0
3958,Spring,1.0,,,,,Spring,,,,,,,,,,,,,,,,,Spring,3.300,0.0


### Step 2: Drop columns with too many null values

In [4]:
print(mydata.shape)
print("As shown above, there are a total of 3960 rows")
print(mydata.isnull().sum())

(3960, 26)
As shown above, there are a total of 3960 rows
PreInt_EduHx-Season                        420
PreInt_EduHx-computerinternet_hoursday     659
Fitness_Endurance-Season                  2652
Fitness_Endurance-Max_Stage               3217
Fitness_Endurance-Time_Mins               3220
Fitness_Endurance-Time_Sec                3220
FGC-Season                                 614
FGC-FGC_CU                                1638
FGC-FGC_CU_Zone                           1678
FGC-FGC_GSND                              2886
FGC-FGC_GSND_Zone                         2898
FGC-FGC_GSD                               2886
FGC-FGC_GSD_Zone                          2897
FGC-FGC_PU                                1650
FGC-FGC_PU_Zone                           1689
FGC-FGC_SRL                               1655
FGC-FGC_SRL_Zone                          1693
FGC-FGC_SRR                               1653
FGC-FGC_SRR_Zone                          1691
FGC-FGC_TL                                1636
FG

- The "Fitness_Endurance" columns are on average 81.3% null (Excluding "Season" column).
- The Grip Strength columns from "FGC" are on average 73.0% null.
- The Adolescent columns for "PAQ" are on average 88.0% null.

These columns will be dropped from the DataFrame because they will not provide us enough data to make an accurate prediction.

In [5]:
dropdata = mydata.drop(columns=['Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 
                     'Fitness_Endurance-Time_Sec', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 
                     'FGC-FGC_GSD_Zone', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total'])
display(dropdata)

Unnamed: 0,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,PAQ_C-Season,PAQ_C-PAQ_C_Total,sii
0,Fall,3.0,Fall,0.0,0.0,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,,,2.0
1,Summer,0.0,Fall,3.0,0.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Fall,2.340,0.0
2,Summer,2.0,Fall,20.0,1.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,Summer,2.170,0.0
3,Winter,0.0,Summer,18.0,1.0,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Winter,2.451,1.0
4,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,Fall,1.0,Fall,16.0,0.0,10.0,1.0,8.0,1.0,9.0,1.0,12.0,1.0,Winter,3.260,1.0
3956,Winter,0.0,Spring,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,Winter,2.340,
3957,Fall,0.0,Winter,15.0,1.0,0.0,0.0,10.0,1.0,10.0,1.0,14.0,1.0,Winter,2.729,1.0
3958,Spring,1.0,Spring,,,,,,,,,,,Spring,3.300,0.0


### Step 3: One-Hot encoding on categorical data
Categorical Columns:
- PreInt_EduHx-Season
- FGC-Season
- PAQ_C-Season

In [6]:
encoded_data = pd.get_dummies(dropdata, columns=['PreInt_EduHx-Season', 'FGC-Season', 'PAQ_C-Season'])
display(encoded_data)

Unnamed: 0,PreInt_EduHx-computerinternet_hoursday,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,PAQ_C-PAQ_C_Total,sii,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,FGC-Season_Fall,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter
0,3.0,0.0,0.0,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,,2.0,True,False,False,False,True,False,False,False,False,False,False,False
1,0.0,3.0,0.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.340,0.0,False,False,True,False,True,False,False,False,True,False,False,False
2,2.0,20.0,1.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2.170,0.0,False,False,True,False,True,False,False,False,False,False,True,False
3,0.0,18.0,1.0,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,2.451,1.0,False,False,False,True,False,False,True,False,False,False,False,True
4,,,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,1.0,16.0,0.0,10.0,1.0,8.0,1.0,9.0,1.0,12.0,1.0,3.260,1.0,True,False,False,False,True,False,False,False,False,False,False,True
3956,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,2.340,,False,False,False,True,False,True,False,False,False,False,False,True
3957,0.0,15.0,1.0,0.0,0.0,10.0,1.0,10.0,1.0,14.0,1.0,2.729,1.0,True,False,False,False,False,False,False,True,False,False,False,True
3958,1.0,,,,,,,,,,,3.300,0.0,False,True,False,False,False,True,False,False,False,True,False,False


### Step 4: Replacing NULL Values

In [7]:
replace_mean = ['PAQ_C-PAQ_C_Total']
replace_median = ['PreInt_EduHx-computerinternet_hoursday', 'FGC-FGC_CU', 'FGC-FGC_PU', 'FGC-FGC_SRL', 'FGC-FGC_SRR', 'FGC-FGC_TL', 'sii']
replace_mode = ['FGC-FGC_CU_Zone', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL_Zone', 
               'PreInt_EduHx-Season_Fall', 'PreInt_EduHx-Season_Spring', 'PreInt_EduHx-Season_Summer', 'PreInt_EduHx-Season_Winter', 
               'FGC-Season_Fall', 'FGC-Season_Spring', 'FGC-Season_Summer', 'FGC-Season_Winter', 
               'PAQ_C-Season_Fall', 'PAQ_C-Season_Spring', 'PAQ_C-Season_Summer', 'PAQ_C-Season_Winter']

encodedcopy = encoded_data.copy()
encodedcopy[replace_mean] = encoded_data[replace_mean].fillna(encoded_data[replace_mean].mean().iloc[0])
encodedcopy[replace_median] = encoded_data[replace_median].fillna(encoded_data[replace_median].median().iloc[0])
encodedcopy[replace_mode] = encoded_data[replace_mode].fillna(encoded_data[replace_mode].mode().iloc[0])
display(encodedcopy)
print(encodedcopy.isnull().sum())

Unnamed: 0,PreInt_EduHx-computerinternet_hoursday,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,PAQ_C-PAQ_C_Total,sii,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,FGC-Season_Fall,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter
0,3.0,0.0,0.0,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.58955,2.0,True,False,False,False,True,False,False,False,False,False,False,False
1,0.0,3.0,0.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.34000,0.0,False,False,True,False,True,False,False,False,True,False,False,False
2,2.0,20.0,1.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2.17000,0.0,False,False,True,False,True,False,False,False,False,False,True,False
3,0.0,18.0,1.0,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,2.45100,1.0,False,False,False,True,False,False,True,False,False,False,False,True
4,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,2.58955,1.0,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,1.0,16.0,0.0,10.0,1.0,8.0,1.0,9.0,1.0,12.0,1.0,3.26000,1.0,True,False,False,False,True,False,False,False,False,False,False,True
3956,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,2.34000,1.0,False,False,False,True,False,True,False,False,False,False,False,True
3957,0.0,15.0,1.0,0.0,0.0,10.0,1.0,10.0,1.0,14.0,1.0,2.72900,1.0,True,False,False,False,False,False,False,True,False,False,False,True
3958,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,3.30000,0.0,False,True,False,False,False,True,False,False,False,True,False,False


PreInt_EduHx-computerinternet_hoursday    0
FGC-FGC_CU                                0
FGC-FGC_CU_Zone                           0
FGC-FGC_PU                                0
FGC-FGC_PU_Zone                           0
FGC-FGC_SRL                               0
FGC-FGC_SRL_Zone                          0
FGC-FGC_SRR                               0
FGC-FGC_SRR_Zone                          0
FGC-FGC_TL                                0
FGC-FGC_TL_Zone                           0
PAQ_C-PAQ_C_Total                         0
sii                                       0
PreInt_EduHx-Season_Fall                  0
PreInt_EduHx-Season_Spring                0
PreInt_EduHx-Season_Summer                0
PreInt_EduHx-Season_Winter                0
FGC-Season_Fall                           0
FGC-Season_Spring                         0
FGC-Season_Summer                         0
FGC-Season_Winter                         0
PAQ_C-Season_Fall                         0
PAQ_C-Season_Spring             

### Step 5: Random Forest Classifier

In [8]:
data

X = encodedcopy.drop(columns=['sii'])
y = encodedcopy['sii']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5984848484848485

Classification Report:
               precision    recall  f1-score   support

         0.0       0.59      0.69      0.64       311
         1.0       0.65      0.64      0.64       397
         2.0       0.13      0.05      0.07        76
         3.0       0.00      0.00      0.00         8

    accuracy                           0.60       792
   macro avg       0.34      0.35      0.34       792
weighted avg       0.57      0.60      0.58       792



In [9]:
from sklearn.metrics import cohen_kappa_score

def sii(y):
    """
    0-30=None; 31-49=Mild; 50-79=Moderate; 80-100=Severe
    """
    # y = y[y_cols]
    return np.digitize(y.sum(axis=1), bins=[30, 50, 80], right=True)

def compare_sii(y1, y2):
    return cohen_kappa_score(sii(y1), sii(y2), weights='quadratic')

In [12]:
cohen_kappa_score(y_test, y_pred, weights='quadratic')

np.float64(0.28177163948089057)

In [11]:
y_test

149     0.0
1025    0.0
1846    0.0
720     1.0
325     0.0
       ... 
1226    0.0
736     0.0
3292    1.0
927     0.0
3778    1.0
Name: sii, Length: 792, dtype: float64