# Import thư viện cần thiết

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

Đọc dữ liệu

In [2]:
train = pd.read_csv('../data/train_preprocessed.csv')
train.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target,rez_esc-missing
0,ID_279628684,190000.0,0,3,0,1,1,0,0.0,0,...,1849,1,100,0,1.0,0.0,100.0,1849,4,False
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,4489,1,144,0,1.0,64.0,144.0,4489,4,False
2,ID_68de51c94,0.0,0,8,0,1,1,0,0.0,0,...,8464,1,0,0,0.25,64.0,121.0,8464,4,False
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,289,16,121,4,1.777778,1.0,121.0,289,4,False
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,1369,16,121,4,1.777778,1.0,121.0,1369,4,False


In [3]:
test = pd.read_csv('../data/test_preprocessed.csv')
test.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,rez_esc-missing
0,ID_2f6873615,0.0,0,5,0,1,1,0,0.0,1,...,0,16,9,0,1,2.25,0.25,272.25,16,False
1,ID_1c78846d2,0.0,0,5,0,1,1,0,0.0,1,...,256,1681,9,0,1,2.25,0.25,272.25,1681,False
2,ID_e5442cf6a,0.0,0,5,0,1,1,0,0.0,1,...,289,1681,9,0,1,2.25,0.25,272.25,1681,False
3,ID_a8db26a79,0.0,0,14,0,1,1,1,1.0,0,...,256,3481,1,256,0,1.0,0.0,256.0,3481,False
4,ID_a62966799,175000.0,0,4,0,1,1,1,1.0,0,...,121,324,1,0,1,0.25,64.0,,324,True


In [4]:
ntrain = train.shape[0]
ntest = test.shape[0]

all_data = pd.concat((train, test)).reset_index(drop=True)

# Xây dựng mô hình học máy

## Loại bỏ những thuộc tính dư thừa

### Tạo biến ordinal từ dữ liệu đã được one-hot encode

Các thuộc tính như `epared`, `etecho`, `eviv` và `instlevel` có thể được chuyển về dạng dữ liệu ordinal với quy ước **(bad, regular, good) -> (0, 1, 2)**

In [5]:
def get_numeric(data, status_name):
    status_cols = [s for s in data.columns.tolist() if status_name in s]
    status_df = data[status_cols]
    status_df.columns = list(range(status_df.shape[1]))
    status_numeric = status_df.idxmax(1)
    status_numeric.name = status_name
    data = pd.concat([data, status_numeric], axis=1)
    return data

In [6]:
status_name_list = ['epared', 'etecho', 'eviv', 'instlevel']
for status_name in status_name_list:
    all_data = get_numeric(all_data, status_name)

### Xóa những thuộc tính không cần thiết

Nhóm nhận thấy có những thuộc tính có thể được xác định bằng những thuộc tính khác trong dữ liệu.

* Nhóm thuộc tính sau có thể được tạo ra bằng sự kết hợp từ `r4h` và `r4m`:
    ```
    r4t1, persons younger than 12 years of age
    r4t2, persons 12 years of age and older
    r4t3, Total persons in the household
    ```

* Các thuộc tính sau mang cùng ý nghĩa với `hogar_total`:
    ```
    tamhog, size of the household
    tamviv, number of persons living in the household
    hhsize, household size
    r4t3, Total persons in the household
    ```

* `v18q` có thể được tạo ra từ `v18q1`.
* `mobilephone` có thể được tạo ra từ `qmobilephone`.
* `epared1~3`, `etecho1~3`, `eviv1~3`, `instlevel1~9` do đã được chuyển đổi thành dữ liệu ordinal nên sẽ không dùng đến nữa.

In [7]:
redundant_features = ['r4t1', 'r4t2', 'r4t3', 
                      'tamhog', 'tamviv', 'hhsize', 'r4t3', 
                      'v18q', 'mobilephone',
                      'epared1', 'epared2', 'epared3', 
                      'etecho1', 'etecho2', 'etecho3',
                      'eviv1', 'eviv2', 'eviv3', 
                      'instlevel1', 'instlevel2', 'instlevel3', 'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9']
all_data.drop(columns=redundant_features, inplace=True)

## Trích lọc đặc trưng bằng thông số thống kê

Để kết hợp dữ liệu của từng cá nhân vào dữ liệu của cả hộ gia đình, ta cần tổng hợp dữ liệu đó cho từng hộ gia đình. Cách đơn giản nhất để thực hiện việc này là nhóm dữ liệu theo `idhogar` rồi tổng hợp dữ liệu. Tuy nhiên, các dữ liệu **boolean** có thể giống nhau, và sẽ tạo ra nhiều cột dư thừa mà sau đó chúng ta sẽ cần phải loại bỏ sau khi triển khai.

In [8]:
ind_bool = ['dis', 'male', 'female', 
            'estadocivil1', 'estadocivil2', 'estadocivil3', 'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 
            'parentesco1', 'parentesco2',  'parentesco3', 'parentesco4', 'parentesco5', 'parentesco6', 
            'parentesco7', 'parentesco8',  'parentesco9', 'parentesco10', 'parentesco11', 'parentesco12', 'instlevel']

ind_ordered = ['escolari', 'age']

In [9]:
range = lambda x: x.max() - x.min()
range.__name__ = 'range_'
ind_agg = all_data.groupby('idhogar')[ind_ordered + ind_bool].agg(['mean', 'max', 'min', 'sum', 'count', 'std', range])

new_cols = []
for col in ind_agg.columns.levels[0]:
    for stat in ind_agg.columns.levels[1]:
        new_cols.append(f'{col}-{stat}')

ind_agg.columns = new_cols
ind_agg.head()

Unnamed: 0_level_0,escolari-mean,escolari-max,escolari-min,escolari-sum,escolari-count,escolari-std,escolari-range_,age-mean,age-max,age-min,...,parentesco12-count,parentesco12-std,parentesco12-range_,instlevel-mean,instlevel-max,instlevel-min,instlevel-sum,instlevel-count,instlevel-std,instlevel-range_
idhogar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000a08204,8.666667,14,0,26,3,7.571878,14,20.666667,30,4,...,3,0.0,0,4.666667,7,0,14,3,4.041452,7
000bce7c4,2.5,5,0,5,2,3.535534,5,61.5,63,60,...,2,0.0,0,0.5,1,0,1,2,0.707107,1
001845fb0,10.25,14,6,41,4,3.304038,8,35.5,52,19,...,4,0.0,0,4.0,7,2,16,4,2.160247,5
001ff74ca,8.0,16,0,16,2,11.313708,16,19.0,38,0,...,2,0.0,0,3.5,7,0,7,2,4.949747,7
003514e22,7.25,14,0,29,4,5.85235,14,24.0,39,6,...,4,0.0,0,3.0,7,0,12,4,2.94392,7


In [10]:
corr_matrix = ind_agg.corr()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.95)]
print(f'There are {len(to_drop)} correlated columns to remove.')

There are 82 correlated columns to remove.


In [11]:
all_data = all_data.merge(ind_agg, on = 'idhogar', how = 'left')
all_data.drop(columns=ind_bool+ind_ordered+to_drop, inplace=True)
print('Number of features after dropping the individual level features', all_data.shape[1])

Number of features after dropping the individual level features 190


## Xây dựng model

Chia dữ liệu

In [12]:
train = all_data[:ntrain][:]
test = all_data[ntrain:][:]

In [13]:
from sklearn.impute import SimpleImputer

In [14]:
train.set_index(['Id', 'idhogar'], inplace=True)
test.set_index(['Id', 'idhogar'], inplace=True)
test

Unnamed: 0_level_0,Unnamed: 1_level_0,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q1,r4h1,r4h2,r4h3,...,parentesco11-max,parentesco11-min,parentesco11-sum,parentesco12-mean,parentesco12-max,parentesco12-min,parentesco12-sum,instlevel-max,instlevel-std,instlevel-range_
Id,idhogar,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ID_2f6873615,72958b30c,0.0,0,5,0,1,1,0.0,1,1,2,...,0,0,0,0.0,0,0,0,8,4.358899,8
ID_1c78846d2,72958b30c,0.0,0,5,0,1,1,0.0,1,1,2,...,0,0,0,0.0,0,0,0,8,4.358899,8
ID_e5442cf6a,72958b30c,0.0,0,5,0,1,1,0.0,1,1,2,...,0,0,0,0.0,0,0,0,8,4.358899,8
ID_a8db26a79,5b598fbc9,0.0,0,14,0,1,1,1.0,0,1,1,...,0,0,0,0.0,0,0,0,7,,0
ID_a62966799,1e2fc704e,175000.0,0,4,0,1,1,1.0,0,0,0,...,0,0,0,0.0,0,0,0,4,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ID_a065a7cad,3aa78c56b,0.0,1,2,1,1,1,0.0,0,2,2,...,0,0,0,0.0,0,0,0,3,0.983192,3
ID_1a7c6953b,d237404b6,0.0,0,3,0,1,1,0.0,0,1,1,...,0,0,0,0.0,0,0,0,2,0.577350,1
ID_07dbb4be2,d237404b6,0.0,0,3,0,1,1,0.0,0,1,1,...,0,0,0,0.0,0,0,0,2,0.577350,1
ID_34d2ed046,d237404b6,0.0,0,3,0,1,1,0.0,0,1,1,...,0,0,0,0.0,0,0,0,2,0.577350,1


In [15]:
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
# imputer = KNNImputer()
X = imputer.fit_transform(train.iloc[:, :-1])
new_test_df = imputer.transform(test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- instlevel-range_
