# Import thư viện cần thiết

In [136]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
import seaborn as sns

Đọc dữ liệu

In [137]:
train = pd.read_csv('../data/train_preprocessed.csv')
train.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target,rez_esc-missing
0,ID_279628684,190000.0,0,3,0,1,1,0,0.0,0,...,1849,1,100,0,1.0,0.0,100.0,1849,4,False
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,4489,1,144,0,1.0,64.0,144.0,4489,4,False
2,ID_68de51c94,0.0,0,8,0,1,1,0,0.0,0,...,8464,1,0,0,0.25,64.0,121.0,8464,4,False
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,289,16,121,4,1.777778,1.0,121.0,289,4,False
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,1369,16,121,4,1.777778,1.0,121.0,1369,4,False


In [138]:
test = pd.read_csv('../data/test_preprocessed.csv')
test.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,rez_esc-missing
0,ID_2f6873615,0.0,0,5,0,1,1,0,0.0,1,...,0,16,9,0,1,2.25,0.25,272.25,16,False
1,ID_1c78846d2,0.0,0,5,0,1,1,0,0.0,1,...,256,1681,9,0,1,2.25,0.25,272.25,1681,False
2,ID_e5442cf6a,0.0,0,5,0,1,1,0,0.0,1,...,289,1681,9,0,1,2.25,0.25,272.25,1681,False
3,ID_a8db26a79,0.0,0,14,0,1,1,1,1.0,0,...,256,3481,1,256,0,1.0,0.0,256.0,3481,False
4,ID_a62966799,175000.0,0,4,0,1,1,1,1.0,0,...,121,324,1,0,1,0.25,64.0,,324,True


In [139]:
ntrain = train.shape[0]
ntest = test.shape[0]

all_data = pd.concat((train, test)).reset_index(drop=True)

# Xây dựng mô hình học máy

## Loại bỏ những thuộc tính dư thừa

### Tạo biến ordinal từ dữ liệu đã được one-hot encode

Các thuộc tính như `epared`, `etecho`, `eviv` và `instlevel` có thể được chuyển về dạng dữ liệu ordinal với quy ước **(bad, regular, good) -> (0, 1, 2)**

In [140]:
def get_numeric(data, status_name):
    status_cols = [s for s in data.columns.tolist() if status_name in s]
    status_df = data[status_cols]
    status_df.columns = list(range(status_df.shape[1]))
    status_numeric = status_df.idxmax(1)
    status_numeric.name = status_name
    data = pd.concat([data, status_numeric], axis=1)
    return data

In [141]:
status_name_list = ['epared', 'etecho', 'eviv', 'instlevel']
for status_name in status_name_list:
    all_data = get_numeric(all_data, status_name)

### Xóa những thuộc tính không cần thiết

Nhóm nhận thấy có những thuộc tính có thể được xác định bằng những thuộc tính khác trong dữ liệu.

* Nhóm thuộc tính sau có thể được tạo ra bằng sự kết hợp từ `r4h` và `r4m`:
    ```
    r4t1, persons younger than 12 years of age
    r4t2, persons 12 years of age and older
    r4t3, Total persons in the household
    ```

* Các thuộc tính sau mang cùng ý nghĩa với `hogar_total`:
    ```
    tamhog, size of the household
    tamviv, number of persons living in the household
    hhsize, household size
    r4t3, Total persons in the household
    ```

* `v18q` có thể được tạo ra từ `v18q1`.
* `mobilephone` có thể được tạo ra từ `qmobilephone`.
* `epared1~3`, `etecho1~3`, `eviv1~3`, `instlevel1~9` do đã được chuyển đổi thành dữ liệu ordinal nên sẽ không dùng đến nữa.

In [142]:
redundant_features = [
                      'epared1', 'epared2', 'epared3', 
                      'etecho1', 'etecho2', 'etecho3',
                      'eviv1', 'eviv2', 'eviv3', 
                      'instlevel1', 'instlevel2', 'instlevel3', 'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9']
all_data.drop(columns=redundant_features, inplace=True)

In [143]:
tar = all_data['Target'].copy()

In [144]:
tar

0        4.0
1        4.0
2        4.0
3        4.0
4        4.0
        ... 
29989    NaN
29990    NaN
29991    NaN
29992    NaN
29993    NaN
Name: Target, Length: 29994, dtype: float64

In [145]:
all_data.drop(columns='Target', inplace=True)

In [146]:
all_data['rez_esc'].fillna(0, inplace =True)

In [147]:
all_data['num_over_18'] = 0
# Tạo một Series mới để lưu số lượng người trên 18 tuổi theo từng hộ gia đình
num_over_18_series = all_data[all_data['age'] >= 18].groupby('idhogar').size()

# Gán số lượng người trên 18 tuổi cho mỗi hộ gia đình trong DataFrame chính
all_data['num_over_18'] = all_data['idhogar'].map(num_over_18_series).fillna(0)

# all_data['num_over_18'] = all_data[all_data.age >= 18].groupby('idhogar').transform("count") //
# all_data['num_over_18'] = all_data.groupby("idhogar")["num_over_18"].transform("max")
# all_data['num_over_18'] = all_data['num_over_18'].fillna(0)
# # add some extra features, these were taken from another kernel
# def extract_features(df):
#     # add the number of people over 18 in each household
#     df['bedrooms_to_rooms'] = df['bedrooms']/df['rooms']
#     df['rent_to_rooms'] = df['v2a1']/df['rooms']
#     df['tamhog_to_rooms'] = df['tamhog']/df['rooms'] # tamhog - size of the household
#     df['r4t3_to_tamhog'] = df['r4t3']/df['tamhog'] # r4t3 - Total persons in the household
#     df['r4t3_to_rooms'] = df['r4t3']/df['rooms'] # r4t3 - Total persons in the household
#     df['v2a1_to_r4t3'] = df['v2a1']/df['r4t3'] # rent to people in household
#     df['v2a1_to_r4t3'] = df['v2a1']/(df['r4t3'] - df['r4t1']) # rent to people under age 12
#     df['hhsize_to_rooms'] = df['hhsize']/df['rooms'] # rooms per person
#     df['rent_to_hhsize'] = df['v2a1']/df['hhsize'] # rent to household size
#     df['rent_to_over_18'] = df['v2a1']/df['num_over_18']
#     # some households have no one over 18, use the total rent for those
#     df.loc[df.num_over_18 == 0, "rent_to_over_18"] = df[df.num_over_18 == 0].v2a1
    
# extract_features(all_data)       

In [148]:
all_data['num_over_18'] = all_data.groupby("idhogar")["num_over_18"].max()
all_data['num_over_18'] = all_data['num_over_18'].fillna(0)
# 


In [149]:
def extract_features(df):
    # add the number of people over 18 in each household
    df['bedrooms_to_rooms'] = df['bedrooms']/df['rooms']
    df['rent_to_rooms'] = df['v2a1']/df['rooms']
    df['tamhog_to_rooms'] = df['tamhog']/df['rooms'] # tamhog - size of the household
    df['r4t3_to_tamhog'] = df['r4t3']/df['tamhog'] # r4t3 - Total persons in the household
    df['r4t3_to_rooms'] = df['r4t3']/df['rooms'] # r4t3 - Total persons in the household
    df['v2a1_to_r4t3'] = df['v2a1']/df['r4t3'] # rent to people in household
    df['v2a1_to_r4t3'] = df['v2a1']/(df['r4t3'] - df['r4t1']) # rent to people under age 12
    df['hhsize_to_rooms'] = df['hhsize']/df['rooms'] # rooms per person
    df['rent_to_hhsize'] = df['v2a1']/df['hhsize'] # rent to household size
    df['rent_to_over_18'] = df['v2a1']/df['num_over_18']
    # some households have no one over 18, use the total rent for those
    df.loc[df.num_over_18 == 0, "rent_to_over_18"] = df[df.num_over_18 == 0].v2a1
    
extract_features(all_data)       

In [150]:
all_data

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,num_over_18,bedrooms_to_rooms,rent_to_rooms,tamhog_to_rooms,r4t3_to_tamhog,r4t3_to_rooms,v2a1_to_r4t3,hhsize_to_rooms,rent_to_hhsize,rent_to_over_18
0,ID_279628684,190000.0,0,3,0,1,1,0,0.0,0,...,0.0,0.333333,63333.333333,0.333333,1.0,0.333333,190000.0,0.333333,190000.0,190000.0
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,0.0,0.250000,33750.000000,0.250000,1.0,0.250000,135000.0,0.250000,135000.0,135000.0
2,ID_68de51c94,0.0,0,8,0,1,1,0,0.0,0,...,0.0,0.250000,0.000000,0.125000,1.0,0.125000,0.0,0.125000,0.0,0.0
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,0.0,0.600000,36000.000000,0.800000,1.0,0.800000,60000.0,0.800000,45000.0,180000.0
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,0.0,0.600000,36000.000000,0.800000,1.0,0.800000,60000.0,0.800000,45000.0,180000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29989,ID_a065a7cad,0.0,1,2,1,1,1,0,0.0,0,...,0.0,0.500000,0.000000,3.000000,1.0,3.000000,0.0,3.000000,0.0,0.0
29990,ID_1a7c6953b,0.0,0,3,0,1,1,0,0.0,0,...,0.0,0.666667,0.000000,1.333333,1.0,1.333333,0.0,1.333333,0.0,0.0
29991,ID_07dbb4be2,0.0,0,3,0,1,1,0,0.0,0,...,0.0,0.666667,0.000000,1.333333,1.0,1.333333,0.0,1.333333,0.0,0.0
29992,ID_34d2ed046,0.0,0,3,0,1,1,0,0.0,0,...,0.0,0.666667,0.000000,1.333333,1.0,1.333333,0.0,1.333333,0.0,0.0


In [151]:
needless_cols = ['r4t3', 'tamhog', 'tamviv', 'hhsize', 'v18q', 'v14a', 'agesq',
                 'mobilephone', 'female', ]

#instlevel_cols = [s for s in train.columns.tolist() if 'instlevel' in s]

#needless_cols.extend(instlevel_cols)

all_data.drop(needless_cols, axis=1, inplace=True)
all_data.shape

(29994, 130)

In [152]:
all_data = pd.concat([all_data, tar],axis=1)
all_data.shape

(29994, 131)

In [153]:
all_data

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,refrig,v18q1,r4h1,r4h2,r4h3,...,bedrooms_to_rooms,rent_to_rooms,tamhog_to_rooms,r4t3_to_tamhog,r4t3_to_rooms,v2a1_to_r4t3,hhsize_to_rooms,rent_to_hhsize,rent_to_over_18,Target
0,ID_279628684,190000.0,0,3,0,1,0.0,0,1,1,...,0.333333,63333.333333,0.333333,1.0,0.333333,190000.0,0.333333,190000.0,190000.0,4.0
1,ID_f29eb3ddd,135000.0,0,4,0,1,1.0,0,1,1,...,0.250000,33750.000000,0.250000,1.0,0.250000,135000.0,0.250000,135000.0,135000.0,4.0
2,ID_68de51c94,0.0,0,8,0,1,0.0,0,0,0,...,0.250000,0.000000,0.125000,1.0,0.125000,0.0,0.125000,0.0,0.0,4.0
3,ID_d671db89c,180000.0,0,5,0,1,1.0,0,2,2,...,0.600000,36000.000000,0.800000,1.0,0.800000,60000.0,0.800000,45000.0,180000.0,4.0
4,ID_d56d6f5f5,180000.0,0,5,0,1,1.0,0,2,2,...,0.600000,36000.000000,0.800000,1.0,0.800000,60000.0,0.800000,45000.0,180000.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29989,ID_a065a7cad,0.0,1,2,1,1,0.0,0,2,2,...,0.500000,0.000000,3.000000,1.0,3.000000,0.0,3.000000,0.0,0.0,
29990,ID_1a7c6953b,0.0,0,3,0,1,0.0,0,1,1,...,0.666667,0.000000,1.333333,1.0,1.333333,0.0,1.333333,0.0,0.0,
29991,ID_07dbb4be2,0.0,0,3,0,1,0.0,0,1,1,...,0.666667,0.000000,1.333333,1.0,1.333333,0.0,1.333333,0.0,0.0,
29992,ID_34d2ed046,0.0,0,3,0,1,0.0,0,1,1,...,0.666667,0.000000,1.333333,1.0,1.333333,0.0,1.333333,0.0,0.0,


In [154]:
train = all_data.loc[all_data["Target"].notnull()]
# train_final = train_final.fillna(0)
test = all_data.loc[all_data["Target"].isnull()]

In [155]:
train

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,refrig,v18q1,r4h1,r4h2,r4h3,...,bedrooms_to_rooms,rent_to_rooms,tamhog_to_rooms,r4t3_to_tamhog,r4t3_to_rooms,v2a1_to_r4t3,hhsize_to_rooms,rent_to_hhsize,rent_to_over_18,Target
0,ID_279628684,190000.0,0,3,0,1,0.0,0,1,1,...,0.333333,63333.333333,0.333333,1.0,0.333333,190000.0,0.333333,190000.0,190000.0,4.0
1,ID_f29eb3ddd,135000.0,0,4,0,1,1.0,0,1,1,...,0.250000,33750.000000,0.250000,1.0,0.250000,135000.0,0.250000,135000.0,135000.0,4.0
2,ID_68de51c94,0.0,0,8,0,1,0.0,0,0,0,...,0.250000,0.000000,0.125000,1.0,0.125000,0.0,0.125000,0.0,0.0,4.0
3,ID_d671db89c,180000.0,0,5,0,1,1.0,0,2,2,...,0.600000,36000.000000,0.800000,1.0,0.800000,60000.0,0.800000,45000.0,180000.0,4.0
4,ID_d56d6f5f5,180000.0,0,5,0,1,1.0,0,2,2,...,0.600000,36000.000000,0.800000,1.0,0.800000,60000.0,0.800000,45000.0,180000.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8603,ID_d45ae367d,80000.0,0,6,0,1,0.0,0,2,2,...,0.666667,13333.333333,0.833333,1.0,0.833333,20000.0,0.833333,16000.0,80000.0,2.0
8604,ID_c94744e07,80000.0,0,6,0,1,0.0,0,2,2,...,0.666667,13333.333333,0.833333,1.0,0.833333,20000.0,0.833333,16000.0,80000.0,2.0
8605,ID_85fc658f8,80000.0,0,6,0,1,0.0,0,2,2,...,0.666667,13333.333333,0.833333,1.0,0.833333,20000.0,0.833333,16000.0,80000.0,2.0
8606,ID_ced540c61,80000.0,0,6,0,1,0.0,0,2,2,...,0.666667,13333.333333,0.833333,1.0,0.833333,20000.0,0.833333,16000.0,80000.0,2.0


In [156]:
n = train.columns[train.isnull().any(axis=0)]

# In ra các cột chứa dữ liệu null
print("Các cột chứa dữ liệu null:")
print(n)

Các cột chứa dữ liệu null:
Index([], dtype='object')


Chia dữ liệu

In [176]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

In [177]:
train.shape

(8608, 131)

In [178]:
X= train.drop( ['Id', 'idhogar', 'Target'], axis= 1)
y= train['Target']
X.shape


(8608, 128)

In [179]:
# Our subjects of interest are heads of household
# Hence, for training, we will only be using these subjects

# But for test data, we have to predict for both heads and non-heads,
# except that results for non-heads are not graded.
# Hence, we treat all the test data like heads of household for prediction purposes.
# X = X[X.parentesco1 == 1].copy()
# y = y - 1
# y = y.astype('int')
# X.drop(["parentesco1"], axis=1, inplace = True)

# test.drop(columns = ["parentesco1"], inplace = True)
# X.shape

In [180]:
y_weights = class_weight.compute_sample_weight('balanced', y, indices=None)

print(pd.DataFrame(dict(Target = y, Weight = y_weights)).drop_duplicates().sort_values(by = ["Target"]).reset_index(drop = True))

   Target    Weight
0     1.0  3.448718
1     2.0  1.625378
2     3.0  2.007463
3     4.0  0.385111


In [181]:
y_weights.shape

(8608,)

In [182]:
X_train,X_test,y_train,y_test, weights_train,weights_test  =train_test_split(X,y,y_weights,test_size=0.2, random_state=1)
X_train.shape

(6886, 128)

In [183]:
from sklearn.ensemble import GradientBoostingClassifier


In [187]:
gradient = GradientBoostingClassifier(n_estimators=50, learning_rate=0.5, max_depth=5)
gradient.fit(X_train, y_train, sample_weight=weights_train)
y_pred = gradient.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro') # Để sử dụng macro-average F1 score

print(f'f1 : {f1}')

f1 : 0.8811446744369928
