In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
set_config(display='diagram') 

In [2]:
data_df = pd.read_csv('data.csv',index_col = 0)
data_df

Unnamed: 0,City,Level,PM2.5,PM10,O3,NO2,SO2,CO
0,"Dhaka, Dhaka",Very Unhealthy,182.1,,,,,
1,"Manikganj, Dhaka",Very Unhealthy,171.0,,,,,
2,"Savar, Dhaka",Unhealthy,119.1,,,,,
3,"Sreepur, Dhaka",Unhealthy,81.6,,,,,
4,"Narayanganj, Dhaka",Good,8.0,,,,,
...,...,...,...,...,...,...,...,...
126,"Swan, Western Australia",Moderate,13.5,46.2,,67.7,,
127,"Busselton, Western Australia",Moderate,6.1,57.0,,,,
128,"Albany, Western Australia",Good,,55.0,,,,
129,"Geraldton, Western Australia",Good,7.0,46.2,,,,


In [3]:
data_df_1 = pd.read_csv('data_1.csv',index_col = 0)
data_df_1

Unnamed: 0,City,Level,PM2.5,PM10,O3,NO2,SO2,CO
0,"Ube, Yamaguchi",Moderate,30.0,8.0,,9.4,7.9,
1,"Hiraidemachi, Tochigi",Moderate,23.0,8.0,,39.5,,
2,"Minato, Wakayama",Moderate,20.0,24.0,,,23.6,
3,"Mizusawaku Higashiodori, Iwate",Moderate,19.0,4.0,,26.3,2.6,
4,"Futago, Kanagawa",Moderate,15.0,11.0,,73.3,,
...,...,...,...,...,...,...,...,...
201,"Puertollano, Castilla-La Mancha",Moderate,23.0,15.0,34.0,14.0,5.0,
202,"Beasain, Basque Country",Moderate,22.0,,,,2.0,0.0
203,"Ciutat Meridiana, Catalunya",Moderate,22.0,32.0,,,,
204,"Arwad, Tartus",Moderate,28.7,46.3,,18.4,7.9,


In [4]:
data_df_city = pd.read_csv('data_cityranking.csv',index_col = 0)
data_df_city

Unnamed: 0,City,Level,PM2.5,PM10,O3,NO2,SO2,CO
0,Dhaka,Very Unhealthy,227.5,,,,,
1,Delhi,Very Unhealthy,137.7,201.2,9.8,35.0,12.6,1181.7
2,Bishkek,Very Unhealthy,157.4,2.2,,,,
3,Kathmandu,Unhealthy,112.2,198.5,14.8,,,
4,Lahore,Unhealthy,104.5,,,,,
...,...,...,...,...,...,...,...,...
91,Tokyo,Good,6.1,6.4,,44.2,2.6,381.7
92,Los Angeles,Good,17.1,41.2,3.0,31.8,2.4,
93,Sydney,Good,0.0,7.2,31.5,1.1,0.0,2.3
94,Skopje,Good,6.8,4.9,20.3,11.5,0.0,


In [5]:
data_df_final = pd.concat([data_df_city,data_df,data_df_1], ignore_index=True)
data_df_final

Unnamed: 0,City,Level,PM2.5,PM10,O3,NO2,SO2,CO
0,Dhaka,Very Unhealthy,227.5,,,,,
1,Delhi,Very Unhealthy,137.7,201.2,9.8,35.0,12.6,1181.7
2,Bishkek,Very Unhealthy,157.4,2.2,,,,
3,Kathmandu,Unhealthy,112.2,198.5,14.8,,,
4,Lahore,Unhealthy,104.5,,,,,
...,...,...,...,...,...,...,...,...
428,"Puertollano, Castilla-La Mancha",Moderate,23.0,15.0,34.0,14.0,5.0,
429,"Beasain, Basque Country",Moderate,22.0,,,,2.0,0.0
430,"Ciutat Meridiana, Catalunya",Moderate,22.0,32.0,,,,
431,"Arwad, Tartus",Moderate,28.7,46.3,,18.4,7.9,


In [6]:
data_df_final.to_csv('data_final.csv')

In [7]:
data_df_final['Level'].isna().sum()

2

In [8]:
#lấy index của 2 dòng thiếu giá trị Level
missing_level = data_df_final[data_df_final['Level'].isnull()].index.tolist()
missing_level

[119, 176]

In [9]:
#xóa 2 dòng đó khỏi dataframe
for i in missing_level:
    data_df_final = data_df_final.drop(index = i)
data_df_final

Unnamed: 0,City,Level,PM2.5,PM10,O3,NO2,SO2,CO
0,Dhaka,Very Unhealthy,227.5,,,,,
1,Delhi,Very Unhealthy,137.7,201.2,9.8,35.0,12.6,1181.7
2,Bishkek,Very Unhealthy,157.4,2.2,,,,
3,Kathmandu,Unhealthy,112.2,198.5,14.8,,,
4,Lahore,Unhealthy,104.5,,,,,
...,...,...,...,...,...,...,...,...
428,"Puertollano, Castilla-La Mancha",Moderate,23.0,15.0,34.0,14.0,5.0,
429,"Beasain, Basque Country",Moderate,22.0,,,,2.0,0.0
430,"Ciutat Meridiana, Catalunya",Moderate,22.0,32.0,,,,
431,"Arwad, Tartus",Moderate,28.7,46.3,,18.4,7.9,


In [10]:
data_df_final['Level'].isna().sum() #check lại xem cột Level còn missing value không

0

In [11]:
data_df.index.duplicated().sum()

0

In [12]:
data_df['Level'].dtype

dtype('O')

In [13]:
temp = data_df['Level'].value_counts(normalize=True) * 100
temp

Good                              39.534884
Moderate                          23.255814
Unhealthy                         12.403101
Unhealthy For Sensitive Groups    11.627907
Very Unhealthy                     9.302326
Hazardous                          3.875969
Name: Level, dtype: float64

In [14]:
#chuyển string thành số (số tương ứng mức AQI cao nhất của từng loại đánh giá, đối với Hazardous là 300+ nên lấy 301)
data_df_final = data_df_final.replace({'Good' : 50, 'Moderate' : 100, 'Unhealthy For Sensitive Groups' : 150, 'Unhealthy' : 200, 'Very Unhealthy' : 300, 'Hazardous' : 301})
data_df_final

Unnamed: 0,City,Level,PM2.5,PM10,O3,NO2,SO2,CO
0,Dhaka,300,227.5,,,,,
1,Delhi,300,137.7,201.2,9.8,35.0,12.6,1181.7
2,Bishkek,300,157.4,2.2,,,,
3,Kathmandu,200,112.2,198.5,14.8,,,
4,Lahore,200,104.5,,,,,
...,...,...,...,...,...,...,...,...
428,"Puertollano, Castilla-La Mancha",100,23.0,15.0,34.0,14.0,5.0,
429,"Beasain, Basque Country",100,22.0,,,,2.0,0.0
430,"Ciutat Meridiana, Catalunya",100,22.0,32.0,,,,
431,"Arwad, Tartus",100,28.7,46.3,,18.4,7.9,


In [15]:
data_df_final['Level'] = pd.to_numeric(data_df_final['Level'])
data_df_final['Level'].dtype

dtype('int64')

In [16]:
# Tách X và y
y_sr = data_df_final["Level"]
X_df = data_df_final.drop("Level", axis=1)


In [17]:
#Tách tập train và validation
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(X_df, y_sr, test_size=0.3, random_state=0)

In [18]:
train_X_df.shape

(301, 7)

In [19]:
train_y_sr.shape

(301,)

In [20]:
val_X_df.shape

(130, 7)

In [21]:
val_y_sr.shape

(130,)

In [22]:
train_X_df.index

Int64Index([365, 106, 274,  63, 300, 371, 380, 238,  89, 252,
            ...
            213,   9, 361, 197, 253, 325, 194, 117,  47, 173],
           dtype='int64', length=301)

In [23]:
train_X_df.dtypes

City      object
PM2.5    float64
PM10     float64
O3       float64
NO2      float64
SO2      float64
CO       float64
dtype: object

In [24]:
train_X_df.dtypes[train_X_df.dtypes != object]

PM2.5    float64
PM10     float64
O3       float64
NO2      float64
SO2      float64
CO       float64
dtype: object

In [25]:
#quan sát phân bố của các cột số
num_cols = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
df = train_X_df[num_cols]
def missing_ratio(df):
    return (df.isna().mean() * 100).round(1)
def lower_quartile(df):
    return df.quantile(0.25).round(1)
def median(df):
    return df.quantile(0.5).round(1)
def upper_quartile(df):
    return df.quantile(0.75).round(1)
df.agg([missing_ratio, 'min', lower_quartile, median, upper_quartile, 'max'])



Unnamed: 0,PM2.5,PM10,O3,NO2,SO2,CO
missing_ratio,1.7,43.2,64.1,51.5,62.8,76.1
min,0.0,2.1,0.0,1.1,0.0,0.0
lower_quartile,11.0,12.6,6.0,13.9,2.6,14.8
median,26.5,28.8,18.0,29.0,4.7,425.7
upper_quartile,42.8,53.1,34.8,49.1,9.2,900.0
max,770.0,504.0,84.0,470.0,101.9,2920.0


In [26]:
#hàm tiền xử lý
def fill_missing(X_df):
    num_cols = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
    df = X_df
    #thay 'nan' bằng 0.0
    df[num_cols] = X_df[num_cols].fillna(0)
    #xóa cột City vì là cột định danh không cần thiết cho mô hình
    df = df.drop(columns=['City'])
    return df

In [27]:
#sử dụng FunctionTransformer để transform tập dữ liệu
transform = FunctionTransformer(fill_missing)


In [28]:
#tạo pipeline cho tiền XL
preprocess_pipeline = make_pipeline(FunctionTransformer(fill_missing), StandardScaler())

In [29]:
#preprocess_pipeline

In [30]:
#preprocess tập train
preprocess_train_X = preprocess_pipeline.fit_transform(train_X_df)
preprocess_train_X.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


(301, 6)

In [31]:
#preprocess tập validation
preprocessed_val_X = preprocess_pipeline.transform(val_X_df)
preprocessed_val_X.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


(130, 6)