In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
set_config(display='diagram') 
import numpy as np

In [2]:
# Load the air-quality table; index_col=0 uses the first CSV column as the row index.
data_df = pd.read_csv('total_data.csv', index_col=0)
# Display the loaded frame.
data_df

Unnamed: 0,City,Level,PM2.5,PM10,O3,NO2,SO2,CO
0,Dhaka,Very Unhealthy,227.5,,,,,
1,Delhi,Very Unhealthy,137.7,201.2,9.8,35.0,12.6,1181.7
2,Bishkek,Very Unhealthy,157.4,2.2,,,,
3,Kathmandu,Unhealthy,112.2,198.5,14.8,,,
4,Lahore,Unhealthy,104.5,,,,,
...,...,...,...,...,...,...,...,...
766,"Ourense, Galicia",Unhealthy For Sensitive Groups,44.2,119.8,1.0,,1.2,
767,"Valdemoro, Madrid",Unhealthy For Sensitive Groups,44.0,,10.0,1.0,,
768,"Villa del Prado, Madrid",Unhealthy For Sensitive Groups,44.0,0.0,1.0,44.0,1.0,0.0
769,"Fiq, Quneitra",Moderate,14.0,,,,,


In [3]:
# Count duplicated index labels (0 means every row label is unique).
data_df.index.duplicated().sum()

0

In [4]:
# Inspect the dtype of the target column 'Level'.
data_df['Level'].dtype

dtype('O')

In [5]:
# Count the rows whose 'Level' value is missing.
data_df['Level'].isna().sum()

4

In [6]:
# Drop the rows that have no 'Level' value, then confirm that none remain.
data_df = data_df.dropna(subset=['Level'])
data_df['Level'].isna().sum()

0

In [7]:
# Percentage share of each air-quality level among the remaining rows.
temp = data_df['Level'].value_counts(normalize=True).mul(100)
temp

Moderate                          38.722295
Good                              25.814863
Unhealthy For Sensitive Groups    15.254237
Unhealthy                         13.559322
Very Unhealthy                     3.389831
Hazardous                          3.259452
Name: Level, dtype: float64

In [8]:
# Separate the feature columns (X) from the target column (y).
X_df = data_df.drop(columns="Level")
y_sr = data_df["Level"]


In [9]:
# Display the target series.
y_sr

0                      Very Unhealthy
1                      Very Unhealthy
2                      Very Unhealthy
3                           Unhealthy
4                           Unhealthy
                    ...              
766    Unhealthy For Sensitive Groups
767    Unhealthy For Sensitive Groups
768    Unhealthy For Sensitive Groups
769                          Moderate
770                              Good
Name: Level, Length: 767, dtype: object

In [10]:
# Display the feature frame (all columns except 'Level').
X_df

Unnamed: 0,City,PM2.5,PM10,O3,NO2,SO2,CO
0,Dhaka,227.5,,,,,
1,Delhi,137.7,201.2,9.8,35.0,12.6,1181.7
2,Bishkek,157.4,2.2,,,,
3,Kathmandu,112.2,198.5,14.8,,,
4,Lahore,104.5,,,,,
...,...,...,...,...,...,...,...
766,"Ourense, Galicia",44.2,119.8,1.0,,1.2,
767,"Valdemoro, Madrid",44.0,,10.0,1.0,,
768,"Villa del Prado, Madrid",44.0,0.0,1.0,44.0,1.0,0.0
769,"Fiq, Quneitra",14.0,,,,,


In [11]:
# Split into train / validation / test sets, stratified on the level label.
# First hold out 30% as the test set, then hold out 30% of the remainder as
# the validation set.
# A fixed random_state makes the partition -- and every statistic, fitted
# scaler and score derived from it -- identical on each "Restart & Run All".
trainval_X_df, test_X_df, trainval_y_sr, test_y_sr = train_test_split(
    X_df, y_sr, test_size=0.3, random_state=0, stratify=y_sr)
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(
    trainval_X_df, trainval_y_sr, test_size=0.3, random_state=0, stratify=trainval_y_sr)

In [12]:
# Shape (rows, columns) of the training features.
train_X_df.shape

(375, 7)

In [13]:
# Shape of the training targets.
train_y_sr.shape

(375,)

In [14]:
# Shape (rows, columns) of the validation features.
val_X_df.shape

(161, 7)

In [15]:
# Shape of the validation targets.
val_y_sr.shape

(161,)

In [16]:
# Row labels that ended up in the training split.
train_X_df.index

Int64Index([ 31, 512, 168, 475, 265, 438, 228, 669, 585, 175,
            ...
            105, 270, 700, 203,  78, 274, 588, 651, 634, 155],
           dtype='int64', length=375)

In [17]:
# Column dtypes of the training features.
train_X_df.dtypes

City      object
PM2.5    float64
PM10     float64
O3       float64
NO2      float64
SO2      float64
CO       float64
dtype: object

In [18]:
# Keep only the columns whose dtype is not object.
train_X_df.dtypes[train_X_df.dtypes != object]

PM2.5    float64
PM10     float64
O3       float64
NO2      float64
SO2      float64
CO       float64
dtype: object

In [19]:
# Inspect the distribution of the numeric columns.
num_cols = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
df = train_X_df.loc[:, num_cols]
def missing_ratio(df):
    """Return the percentage of missing entries in `df`, rounded to one decimal."""
    pct_missing = df.isna().mean() * 100
    return pct_missing.round(1)
def lower_quartile(df):
    """Return the 25th percentile of `df`, rounded to one decimal."""
    q1 = df.quantile(0.25)
    return q1.round(1)
def median(df):
    """Return the 50th percentile (median) of `df`, rounded to one decimal."""
    q2 = df.quantile(0.5)
    return q2.round(1)
def upper_quartile(df):
    """Return the 75th percentile of `df`, rounded to one decimal."""
    q3 = df.quantile(0.75)
    return q3.round(1)
# Summarise every numeric column: % missing, min, quartiles and max.
# The custom functions' names become the row labels of the result.
df.agg([missing_ratio, 'min', lower_quartile, median, upper_quartile, 'max'])



Unnamed: 0,PM2.5,PM10,O3,NO2,SO2,CO
missing_ratio,1.6,42.1,66.9,54.1,67.5,79.5
min,0.0,0.0,0.0,0.0,0.0,0.0
lower_quartile,11.6,16.5,8.3,11.9,2.6,9.0
median,25.4,30.0,20.7,24.2,7.0,510.0
upper_quartile,48.2,61.0,36.0,41.8,15.4,1019.9
max,328.0,1159.0,100.0,310.0,215.0,2920.0


In [20]:
# Preprocessing function
def fill_missing(X_df):
    """Return a cleaned copy of `X_df` ready for the model.

    Missing values in the six pollutant columns are replaced by 0 and the
    'City' column is removed (it is an identifier, not a model feature).
    The input frame is left unmodified.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Frame holding a 'City' column and the pollutant columns
        'PM2.5', 'PM10', 'O3', 'NO2', 'SO2' and 'CO'.

    Returns
    -------
    pandas.DataFrame
        Copy of `X_df` without 'City' and with no NaN in the pollutant columns.
    """
    pollutant_cols = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
    cleaned = X_df.copy()
    # Replace NaN with 0.0 in the pollutant measurements.
    cleaned[pollutant_cols] = cleaned[pollutant_cols].fillna(0)
    # 'City' only identifies the row, so it is dropped before modelling.
    return cleaned.drop(columns=['City'])

In [21]:
# Wrap fill_missing in a FunctionTransformer so it can act as a transformer on the dataset.
# NOTE(review): `transform` is not referenced by any later visible cell (the
# preprocessing pipeline builds its own FunctionTransformer) -- confirm it is still needed.
transform = FunctionTransformer(fill_missing)


In [22]:
# Build the preprocessing pipeline: fill missing values / drop 'City', then standardise.
# The step names are the ones make_pipeline would generate automatically.
preprocess_pipeline = Pipeline(steps=[
    ('functiontransformer', FunctionTransformer(fill_missing)),
    ('standardscaler', StandardScaler()),
])


In [23]:
# Fit the preprocessing pipeline on the training set and transform it.
preprocess_train_X = preprocess_pipeline.fit_transform(train_X_df)
# Display the transformed training array.
preprocess_train_X

array([[-0.16303094,  0.1441983 ,  1.72518815,  1.0627351 ,  0.46775718,
         1.27665953],
       [-0.63899564, -0.28446176, -0.50686809, -0.15629385, -0.10123815,
        -0.31776386],
       [-0.67629138, -0.30273501, -0.50686809, -0.38575813, -0.28894796,
        -0.31776386],
       ...,
       [-0.52178045, -0.19918658, -0.37932201,  1.22407716, -0.28894796,
        -0.31776386],
       [-0.52178045, -0.1611173 , -0.50686809, -0.10251316, -0.28894796,
        -0.31776386],
       [-0.72246706, -0.34384983, -0.50686809, -0.53275868, -0.28894796,
        -0.31776386]])

In [24]:
# Transform the validation set with the already-fitted pipeline (transform only, no refit).
preprocessed_val_X = preprocess_pipeline.transform(val_X_df)
# Check the resulting shape.
preprocessed_val_X.shape

(161, 6)

In [25]:
# Number of training samples per air-quality level.
train_y_sr.value_counts()

Moderate                          145
Good                               97
Unhealthy For Sensitive Groups     57
Unhealthy                          51
Very Unhealthy                     13
Hazardous                          12
Name: Level, dtype: int64

In [26]:
# Convert the output labels into ordinal integer levels.
def transform_y(train_y_sr):
    """Encode air-quality level names as integer codes.

    Mapping: 'Good' -> 0, 'Moderate' -> 1, 'Unhealthy For Sensitive Groups' -> 2,
    'Unhealthy' -> 3, 'Very Unhealthy' -> 4. Every other value (including
    'Hazardous') is encoded as 5.

    Parameters
    ----------
    train_y_sr : pandas.Series
        Series of level names.

    Returns
    -------
    pandas.Series
        Integer codes in the same order as the input, on a fresh 0..n-1 index
        (the input's index is not carried over).
    """
    level_codes = {
        'Good': 0,
        'Moderate': 1,
        'Unhealthy For Sensitive Groups': 2,
        'Unhealthy': 3,
        'Very Unhealthy': 4,
    }
    # Anything not listed above falls through to the highest level, 5.
    codes = [level_codes.get(label, 5) for label in train_y_sr]
    return pd.Series(codes)

In [27]:
# Encode the targets of all three splits with the same label-to-level mapping.
new_train_y_sr, new_val_y_sr, new_test_y_sr = (
    transform_y(split_y) for split_y in (train_y_sr, val_y_sr, test_y_sr)
)

In [28]:
# One-hidden-layer MLP classifier; alpha and the hidden-layer size are tuned later.
neural_net = MLPClassifier(
    hidden_layer_sizes=(20,),
    activation='tanh',
    solver='lbfgs',
    max_iter=2500,
    random_state=0,
)
# Chain preprocessing and the classifier; the step names are the ones
# make_pipeline would generate automatically.
full_pipeline = Pipeline(steps=[
    ('pipeline', preprocess_pipeline),
    ('mlpclassifier', neural_net),
])

In [None]:
# Try different values of the hyper-parameters (alpha and hidden-layer size)
# and pick the combination with the lowest validation error.
alphas = [0.00001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
li_hidden_layer_sizes = [5, 10, 15, 20, 25, 30]

# Parallel lists: entry i of each list belongs to the i-th tried combination.
train_errs, val_errs, li = [], [], []

# Placeholders, overwritten once the search has finished.
best_val_err = float('inf')
best_alpha = None
best_hidden_layer_size = None

for alpha in alphas:
    for x in li_hidden_layer_sizes:
        full_pipeline.set_params(mlpclassifier__alpha=alpha, mlpclassifier__hidden_layer_sizes=x)
        full_pipeline.fit(train_X_df, new_train_y_sr)

        # Error rate in percent = (1 - accuracy) * 100.
        train_err = (1 - full_pipeline.score(train_X_df, new_train_y_sr)) * 100
        val_err = (1 - full_pipeline.score(val_X_df, new_val_y_sr)) * 100

        train_errs.append(train_err)
        val_errs.append(val_err)
        li.append([alpha, x])

# The first combination reaching the minimum validation error wins.
best_val_err = min(val_errs)
best_alpha, best_hidden_layer_size = li[val_errs.index(best_val_err)]
'Finish!'

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
# Alpha value selected by the search.
best_alpha

In [None]:
# Hidden-layer size selected by the search.
best_hidden_layer_size

In [None]:
# Configure the model with the best alpha and hidden-layer size.
full_pipeline.set_params(mlpclassifier__alpha=best_alpha, mlpclassifier__hidden_layer_sizes=best_hidden_layer_size)

In [None]:
# Refit the full pipeline (preprocessing + MLP) on the training set.
full_pipeline.fit(train_X_df, new_train_y_sr)

In [None]:
# Score of the fitted pipeline on the held-out test set.
full_pipeline.score(test_X_df, new_test_y_sr)

In [None]:
# Test error rate in percent: (1 - score) * 100.
(1 - full_pipeline.score(test_X_df, new_test_y_sr))*100