In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
set_config(display='diagram') 

In [2]:
data_df = pd.read_csv('data_final.csv')
data_df

Unnamed: 0,City,AQI,PM2.5,PM10,O3,NO2,SO2,CO
0,Bishkek,353,356.8,2.2,,,,
1,Kolkata,218,163.9,286.3,17.7,66.1,26.2,1384.0
2,Kabul,196,283.0,,,,,
3,Dhaka,196,178.8,,,,,
4,Kathmandu,193,116.3,217.7,17.5,,,
...,...,...,...,...,...,...,...,...
424,"Swan, Western Australia",44,10.7,40.7,,5.6,,
425,"Wynnum West, Queensland",44,10.6,14.2,,,2.6,
426,"Moe, Victoria",40,9.8,,,,,
427,"Wyong, New South Wales",40,1.6,44.4,,3.8,2.6,10.4


In [3]:
data_df.index.duplicated().sum()

0

In [4]:
data_df['AQI'].dtype

dtype('int64')

In [5]:
data_df['AQI'].isna().sum()

0

In [6]:
temp = data_df['AQI'].value_counts(normalize=True) * 100
temp

70     2.331002
33     2.097902
68     1.864802
21     1.631702
84     1.631702
         ...   
88     0.233100
152    0.233100
157    0.233100
159    0.233100
382    0.233100
Name: AQI, Length: 163, dtype: float64

In [7]:
# Tách X và y
y_sr = data_df["AQI"]
X_df = data_df.drop("AQI", axis=1)


In [8]:
#Tách tập train và validation
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(X_df, y_sr, test_size=0.3, random_state=0)

In [9]:
train_X_df.shape

(300, 7)

In [10]:
train_y_sr.shape

(300,)

In [11]:
val_X_df.shape

(129, 7)

In [12]:
val_y_sr.shape

(129,)

In [13]:
train_X_df.index

Int64Index([240, 284, 106, 418,  63, 296, 367, 376, 351,  89,
            ...
            211,   9, 359, 195, 251, 323, 192, 117,  47, 172],
           dtype='int64', length=300)

In [14]:
train_X_df.dtypes

City      object
PM2.5    float64
PM10     float64
O3       float64
NO2      float64
SO2      float64
CO       float64
dtype: object

In [15]:
train_X_df.dtypes[train_X_df.dtypes != object]

PM2.5    float64
PM10     float64
O3       float64
NO2      float64
SO2      float64
CO       float64
dtype: object

In [16]:
#quan sát phân bố của các cột số
num_cols = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
df = train_X_df[num_cols]
def missing_ratio(df):
    return (df.isna().mean() * 100).round(1)
def lower_quartile(df):
    return df.quantile(0.25).round(1)
def median(df):
    return df.quantile(0.5).round(1)
def upper_quartile(df):
    return df.quantile(0.75).round(1)
df.agg([missing_ratio, 'min', lower_quartile, median, upper_quartile, 'max'])



Unnamed: 0,PM2.5,PM10,O3,NO2,SO2,CO
missing_ratio,1.0,43.3,66.7,55.0,67.0,75.3
min,0.0,2.2,0.0,0.0,0.0,0.0
lower_quartile,10.4,14.0,5.2,11.0,2.0,0.2
median,23.0,27.1,17.8,21.2,4.7,314.6
upper_quartile,43.2,56.9,36.4,34.2,10.0,734.2
max,356.8,347.0,125.0,271.9,141.4,7900.0


In [17]:
#hàm tiền xử lý
def fill_missing(X_df):
    num_cols = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
    df = X_df
    #thay 'nan' bằng 0.0
    df[num_cols] = X_df[num_cols].fillna(0)
    #xóa cột City vì là cột định danh không cần thiết cho mô hình
    df = df.drop(columns=['City'])
    return df

In [18]:
#sử dụng FunctionTransformer để transform tập dữ liệu
transform = FunctionTransformer(fill_missing)


In [19]:
#tạo pipeline cho tiền XL
preprocess_pipeline = make_pipeline(FunctionTransformer(fill_missing), StandardScaler())

In [20]:
#preprocess_pipeline

In [21]:
#preprocess tập train
preprocess_train_X = preprocess_pipeline.fit_transform(train_X_df)
preprocess_train_X.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


(300, 6)

In [22]:
#preprocess tập validation
preprocessed_val_X = preprocess_pipeline.transform(val_X_df)
preprocessed_val_X.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


(129, 6)