In [104]:
#Mengimport semua library yang dibutuhkan 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
import pickle

In [105]:
air_quality_dataset = pd.read_csv('air_quality_health_impact_data.csv', on_bad_lines="skip")
print(air_quality_dataset.head())

   RecordID         AQI        PM10       PM2_5         NO2        SO2  \
0         1  187.270059  295.853039   13.038560    6.639263  66.161150   
1         2  475.357153  246.254703    9.984497   16.318326  90.499523   
2         3  365.996971   84.443191   23.111340   96.317811  17.875850   
3         4  299.329242   21.020609   14.273403   81.234403  48.323616   
4         5   78.009320   16.987667  152.111623  121.235461  90.866167   

           O3  Temperature   Humidity  WindSpeed  RespiratoryCases  \
0   54.624280     5.150335  84.424344   6.137755                 7   
1  169.621728     1.543378  46.851415   4.521422                10   
2    9.006794     1.169483  17.806977  11.157384                13   
3   93.161033    21.925276  99.473373  15.302500                 8   
4  241.795138     9.217517  24.906837  14.534733                 9   

   CardiovascularCases  HospitalAdmissions  HealthImpactScore  \
0                    5                   1          97.244041   
1   

In [106]:
air_quality_dataset.isna().sum()

RecordID               0
AQI                    0
PM10                   0
PM2_5                  0
NO2                    0
SO2                    0
O3                     0
Temperature            0
Humidity               0
WindSpeed              0
RespiratoryCases       0
CardiovascularCases    0
HospitalAdmissions     0
HealthImpactScore      0
HealthImpactClass      0
dtype: int64

In [107]:
# Convert target feature to categorical
air_quality_dataset['HealthImpactScore'] = air_quality_dataset['HealthImpactScore'].astype('category')
air_quality_dataset['HealthImpactClass'] = air_quality_dataset['HealthImpactClass'].astype('category')

In [108]:
def count_outliers (air_quality_dataset):
    col_numerik = air_quality_dataset.select_dtypes(include=[np.number]).columns
    outliers = {}
    for col in col_numerik :
        limit_atas = air_quality_dataset[col].quantile(0.99)
        outliers[col] = (air_quality_dataset[col] > limit_atas).sum()
        return outliers
    
outlier_air_quality_dataset = count_outliers(air_quality_dataset.drop(columns = ['RecordID']))

def remove_outliers(air_quality_dataset):
    col_numerik = air_quality_dataset.select_dtypes(include=[np.number]).columns
    for col in col_numerik:
        limit_atas = air_quality_dataset[col].quantile(0.99)
        air_quality_dataset = air_quality_dataset[air_quality_dataset[col] <= limit_atas]
    return air_quality_dataset

cleaned_air_dataset = remove_outliers(air_quality_dataset) 

print("\nData Train sebelum di hapus outliers",air_quality_dataset.shape)
print("\nData Train setelah di hapus outliers",cleaned_air_dataset.shape)


Data Train sebelum di hapus outliers (5811, 15)

Data Train setelah di hapus outliers (5165, 15)


In [109]:
x = cleaned_air_dataset.drop(['RecordID','HealthImpactScore','HealthImpactClass'], axis=1)
x

Unnamed: 0,AQI,PM10,PM2_5,NO2,SO2,O3,Temperature,Humidity,WindSpeed,RespiratoryCases,CardiovascularCases,HospitalAdmissions
0,187.270059,295.853039,13.038560,6.639263,66.161150,54.624280,5.150335,84.424344,6.137755,7,5,1
1,475.357153,246.254703,9.984497,16.318326,90.499523,169.621728,1.543378,46.851415,4.521422,10,2,0
2,365.996971,84.443191,23.111340,96.317811,17.875850,9.006794,1.169483,17.806977,11.157384,13,3,0
4,78.009320,16.987667,152.111623,121.235461,90.866167,241.795138,9.217517,24.906837,14.534733,9,0,1
5,77.997260,36.113445,97.113240,87.769562,32.261206,136.999714,-1.441781,32.635904,4.675127,13,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...
5746,337.146787,246.493979,130.210223,92.594733,58.776887,200.775019,11.579942,91.232370,6.218481,8,8,2
5747,489.977724,36.686616,43.492069,119.508875,96.194297,112.458508,1.254843,44.262755,6.106693,9,2,1
5748,492.800367,254.576297,40.763899,83.167996,10.357855,187.026421,11.022767,56.677998,13.994129,5,4,2
5750,121.807396,76.109049,1.426004,35.605906,70.175899,296.061498,16.900741,14.280989,0.599042,9,6,1


In [110]:
scaler = StandardScaler()
x_standard = scaler.fit_transform(x)
x_standard

array([[-0.40632395,  1.74576749, -1.49179641, ..., -0.96659077,
         0.01567071, -0.72161761],
       [ 1.61027993,  1.16160929, -1.54489372, ...,  0.02299126,
        -1.37185864, -1.46511551],
       [ 0.84476085, -0.7441708 , -1.31667316, ...,  1.01257328,
        -0.90934886, -1.46511551],
       ...,
       [ 1.73238208,  1.25961918, -1.00976944, ..., -1.62631211,
        -0.44683907,  0.02188029],
       [-0.86456129, -0.84232847, -1.6936899 , ..., -0.30686942,
         0.47818049, -0.72161761],
       [ 1.07772682,  1.3344584 ,  0.65189147, ..., -0.63673009,
         0.94069027, -0.72161761]])

In [111]:
y = cleaned_air_dataset['HealthImpactClass']
y

0       0.0
1       0.0
2       0.0
4       0.0
5       1.0
       ... 
5746    1.0
5747    3.0
5748    1.0
5750    4.0
5751    4.0
Name: HealthImpactClass, Length: 5165, dtype: category
Categories (5, float64): [0.0, 1.0, 2.0, 3.0, 4.0]

In [112]:
x_train,x_test,y_train,y_test=train_test_split(x_standard, y, test_size=0.2, random_state=42)

In [113]:
model_classifier_1 = SVC()
model_classifier_1.fit(x_train,y_train)

y_pred = model_classifier_1.predict(x_test)
accuracy_test = accuracy_score(y_test,y_pred)
print(accuracy_test)

0.9002904162633107


In [114]:
model_classifier_2 = RandomForestClassifier()
model_classifier_2.fit(x_train,y_train)

y_pred = model_classifier_2.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred)
print(accuracy_test)

0.8983543078412392


In [115]:
model_classifier_3 = DecisionTreeClassifier()
model_classifier_3.fit(x_train,y_train)

y_pred = model_classifier_3.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred)
print(accuracy_test)

0.8383349467570184


In [116]:
input_data = (10.292247147901223, 133.76598595794235, 143.13529602451538 ,29.374375166641475, 7.374416514116067, 146.350041824353,-7.1508773244773804,99.6814012977356,5.37810152663782,9,7,4)
input_data_as_numpy_array = np.array(input_data)
input_data_reshape = input_data_as_numpy_array.reshape(1,-1)

std_data = scaler.transform(input_data_reshape)
print(std_data)

prediction = model_classifier_2.predict(std_data)
print(prediction)

if (prediction[0] == 0) :
    print('Kualitas Udara berdampak Sangat Buruk untuk kesehatan anda')
elif (prediction[0] == 1) :
    print('Kualitas Udara berdampak Buruk untuk kesehatan anda')
elif  (prediction[0] == 2) :
    print('Kualitas Udara berdampak Sedang untuk kesehatan anda')
elif  (prediction[0] == 3) :
    print('Kualitas Udara berdampak Rendah untuk kesehatan anda')
else :          
    print('Kualitas Udara berdampak Sangat Rendah untuk kesehatan anda')

[[-1.64516509 -0.16325786  0.77003839 -1.24974505 -1.47230106 -0.02790972
  -1.53106001  1.76624759 -0.78884498 -0.30686942  0.94069027  1.50887608]]
[1.]
Kualitas Udara berdampak Buruk untuk kesehatan anda




In [117]:
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model_classifier_1, model_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)