In [12]:
# Mengimport library yang digunakan
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [13]:
# Mengimport dataset yang digunakan
dataset = pd.read_csv("cuaca.csv")

# Menampilkan dataset 
dataset.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [14]:
dataset.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


In [15]:
dataset.isnull().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

In [16]:
dataset.duplicated().sum() 

0

In [17]:
dataset['weather'].unique()

array(['drizzle', 'rain', 'sun', 'snow', 'fog'], dtype=object)

In [19]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

dataset['weather_label'] = le.fit_transform(dataset['weather'])
dataset.head(10)

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,weather_label
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,0
1,2012-01-02,10.9,10.6,2.8,4.5,rain,2
2,2012-01-03,0.8,11.7,7.2,2.3,rain,2
3,2012-01-04,20.3,12.2,5.6,4.7,rain,2
4,2012-01-05,1.3,8.9,2.8,6.1,rain,2
5,2012-01-06,2.5,4.4,2.2,2.2,rain,2
6,2012-01-07,0.0,7.2,2.8,2.3,rain,2
7,2012-01-08,0.0,10.0,2.8,2.0,sun,4
8,2012-01-09,4.3,9.4,5.0,3.4,rain,2
9,2012-01-10,1.0,6.1,0.6,3.4,rain,2


In [20]:
#Store labels and name for final use

weather_dict = dict(zip(dataset['weather_label'], dataset['weather']))
weather_dict

{0: 'drizzle', 2: 'rain', 4: 'sun', 3: 'snow', 1: 'fog'}

In [21]:
#Preprocess Datetime Columns

def date_time(dataset) :
    
    dataset['date'] = pd.to_datetime(dataset['date'])
    dataset['year'] = dataset['date'].dt.year #Generate Year Column
    dataset['month'] = dataset['date'].dt.month #Generate Month Column
    dataset['day'] = dataset['date'].dt.day
    
    return dataset


df_final = date_time(dataset)
df_final

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,weather_label,year,month,day
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,0,2012,1,1
1,2012-01-02,10.9,10.6,2.8,4.5,rain,2,2012,1,2
2,2012-01-03,0.8,11.7,7.2,2.3,rain,2,2012,1,3
3,2012-01-04,20.3,12.2,5.6,4.7,rain,2,2012,1,4
4,2012-01-05,1.3,8.9,2.8,6.1,rain,2,2012,1,5
...,...,...,...,...,...,...,...,...,...,...
1456,2015-12-27,8.6,4.4,1.7,2.9,rain,2,2015,12,27
1457,2015-12-28,1.5,5.0,1.7,1.3,rain,2,2015,12,28
1458,2015-12-29,0.0,7.2,0.6,2.6,fog,1,2015,12,29
1459,2015-12-30,0.0,5.6,-1.0,3.4,sun,4,2015,12,30


In [22]:
df_final = df_final.drop(['weather'],axis=1).set_index('date')
df_final

Unnamed: 0_level_0,precipitation,temp_max,temp_min,wind,weather_label,year,month,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-01-01,0.0,12.8,5.0,4.7,0,2012,1,1
2012-01-02,10.9,10.6,2.8,4.5,2,2012,1,2
2012-01-03,0.8,11.7,7.2,2.3,2,2012,1,3
2012-01-04,20.3,12.2,5.6,4.7,2,2012,1,4
2012-01-05,1.3,8.9,2.8,6.1,2,2012,1,5
...,...,...,...,...,...,...,...,...
2015-12-27,8.6,4.4,1.7,2.9,2,2015,12,27
2015-12-28,1.5,5.0,1.7,1.3,2,2015,12,28
2015-12-29,0.0,7.2,0.6,2.6,1,2015,12,29
2015-12-30,0.0,5.6,-1.0,3.4,4,2015,12,30


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline

#Preprocess Data for Machine Learning Development
X = df_final.drop(['weather_label'], axis = 1)
y = df_final['weather_label']


over_strategy = {0 : 1000, 1 : 1000, 2 : 1000, 3 : 1000, 4 : 2000}
under_strategy = {0 : 1000, 1 : 1000, 2 : 1000, 3 : 1000, 4 : 2000}

oversample = SMOTE(sampling_strategy = over_strategy)
undersample = RandomUnderSampler(sampling_strategy = under_strategy)

X_final,y = oversample.fit_resample(X,y)
X_final,y = undersample.fit_resample(X_final, y)


X_train,X_test,y_train,y_test = train_test_split(X_final,y,random_state = 10, test_size = 0.2)

In [25]:
from sklearn.metrics import accuracy_score, classification_report
model = GaussianNB()

np.random.seed(10)
model.fit(X_train,y_train)

pred_all = model.predict(X_final)
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

print(f'Accuracy Score on All Data : {round(accuracy_score(y, pred_all),2)*100}')
print(f'Accuracy Score on Train Data : {round(accuracy_score(y_train, pred_train),2)*100}')
print(f'Accuracy Score on Test Data : {round(accuracy_score(y_test, pred_test),2)*100}')

Accuracy Score on All Data : 74.0
Accuracy Score on Train Data : 74.0
Accuracy Score on Test Data : 74.0


In [26]:
print(classification_report(y,pred_all))

              precision    recall  f1-score   support

           0       0.62      0.62      0.62      1000
           1       0.65      0.53      0.58      1000
           2       0.97      0.88      0.92      1000
           3       0.93      0.97      0.95      1000
           4       0.64      0.72      0.68      2000

    accuracy                           0.74      6000
   macro avg       0.76      0.74      0.75      6000
weighted avg       0.74      0.74      0.74      6000

