In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Data Split

In [61]:
dataset_split = pd.read_csv("climate_data.csv")

In [62]:
dataset_split.head()

Unnamed: 0,date,Tn,Tx,Tavg,RH_avg,RR,ss,ff_x,ddd_x,ff_avg,ddd_car,station_id
0,01-01-2010,21.4,30.2,27.1,82.0,9.0,0.5,7.0,90.0,5.0,E,96001
1,02-01-2010,21.0,29.6,25.7,95.0,24.0,0.2,6.0,90.0,4.0,E,96001
2,03-01-2010,20.2,26.8,24.5,98.0,63.0,0.0,5.0,90.0,4.0,E,96001
3,04-01-2010,21.0,29.2,25.8,90.0,0.0,0.1,4.0,225.0,3.0,SW,96001
4,05-01-2010,21.2,30.0,26.7,90.0,2.0,0.4,,,,,96001


Tn = min temperature
Tx = max temperature
Tavg = avg temperature
RH_avg = avg humidity (%)
RR = rainfall (mm)
ss = duration of sunshine (hour)
ff_x = max wind speed (m/s)
ddd_x = wind direction at maximum speed
ff_avg = avg wind speed (m/s)
ddd_car = most wind direction
station_id = station id which record the data

In [63]:
dataset_split.shape

(589265, 12)

In [64]:
X = dataset_split.iloc[:, :-1]
y = dataset_split.iloc[:, -1]

## Proporsi testing set 0.3 atau 30%

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [66]:
print("Dimensi X_train : ", X_train.shape)
print("Dimensi X_test : ", X_test.shape)
print("Dimensi y_train : ", y_train.shape)
print("Dimensi y_test : ", y_test.shape)

Dimensi X_train :  (412485, 11)
Dimensi X_test :  (176780, 11)
Dimensi y_train :  (412485,)
Dimensi y_test :  (176780,)


# Normalisasi Data

In [67]:
dataset_split.to_csv("climate_data_copy2.csv")

In [68]:
from sklearn.preprocessing import MinMaxScaler

In [69]:
min_max_scaler = MinMaxScaler()

In [70]:
x_norm = min_max_scaler.fit_transform(
    dataset_split[['Tn']]
)

In [71]:
data_normalisasi = pd.DataFrame(x_norm)

In [72]:
data_normalisasi.head()

Unnamed: 0,0
0,0.086992
1,0.085366
2,0.082114
3,0.085366
4,0.086179


# Standarisasi

In [83]:
dataset_split.to_csv("climate_data_copy3.csv")

In [133]:
print("Nilai Standar Deviasi Sebelum Distandarisasi: ")
np.std(dataset_split)

Nilai Standar Deviasi Sebelum Distandarisasi: 


Tn              2.280685
Tx              2.311656
Tavg            1.939655
RH_avg         14.337656
RR             17.928732
ss              3.261583
ff_x            2.612283
ddd_x         107.657358
ff_avg          1.803356
station_id    542.418701
dtype: float64

In [128]:
from sklearn.preprocessing import StandardScaler

In [129]:
standard_scaler = StandardScaler()

In [130]:
x_standard = standard_scaler.fit_transform(
    dataset_split[['Tn', 'Tx', 'Tavg', 'RH_avg']]
)

In [131]:
print("Nilai Sesudah Distandarisasi: ")
np.std(x_standard)

Nilai Sesudah Distandarisasi: 


nan

In [137]:
np.mean(x_standard)

nan

In [135]:
data_standarisasi = pd.DataFrame(x_standard)

In [136]:
data_standarisasi.head()

Unnamed: 0,0,1,2,3
0,-0.838393,-0.574893,0.126066,-0.034131
1,-1.013779,-0.834447,-0.595712,0.872572
2,-1.364551,-2.0457,-1.214378,1.081811
3,-1.013779,-1.007483,-0.544156,0.52384
4,-0.926086,-0.661411,-0.080156,0.52384


# DATA CLEANING

In [138]:
df = pd.read_csv("climate_data.csv")

In [140]:
print("Jumlah Nilai Null")
df.isna().sum()

Jumlah Nilai Null


date               0
Tn             23383
Tx             37736
Tavg           45105
RH_avg         48182
RR            125384
ss             43721
ff_x           10214
ddd_x          13128
ff_avg         10127
ddd_car        13739
station_id         0
dtype: int64

In [141]:
df[df.isnull().any(axis=1)]

Unnamed: 0,date,Tn,Tx,Tavg,RH_avg,RR,ss,ff_x,ddd_x,ff_avg,ddd_car,station_id
4,05-01-2010,21.2,30.0,26.7,90.0,2.0,0.4,,,,,96001
5,06-01-2010,21.2,30.0,26.1,93.0,11.0,0.3,,,,,96001
6,07-01-2010,21.4,29.0,25.4,96.0,2.0,0.1,,,,,96001
20,21-01-2010,20.2,30.0,25.5,92.0,0.0,0.4,,,,,96001
34,04-02-2010,21.0,30.6,26.6,88.0,0.0,0.6,,,,,96001
...,...,...,...,...,...,...,...,...,...,...,...,...
589246,13-12-2020,24.0,31.8,28.5,74.0,,4.2,4.0,250.0,1.0,C,97980
589252,19-12-2020,24.0,,28.3,80.0,4.5,7.5,5.0,320.0,2.0,C,97980
589254,21-12-2020,25.9,,28.5,79.0,0.2,7.5,7.0,250.0,2.0,C,97980
589261,28-12-2020,25.3,31.6,28.1,78.0,,3.0,12.0,260.0,2.0,C,97980


In [145]:
df[['Tn', 'Tx', 'Tavg', 'RH_avg', 'RR', 'ss', 'ff_x', 'ddd_x', 'ff_avg']] = df[['Tn', 'Tx', 'Tavg', 'RH_avg', 'RR', 'ss', 'ff_x', 'ddd_x', 'ff_avg']].fillna(df[['Tn', 'Tx', 'Tavg', 'RH_avg', 'RR', 'ss', 'ff_x', 'ddd_x', 'ff_avg']].mean())

In [146]:
df.isna().sum()

date              0
Tn                0
Tx                0
Tavg              0
RH_avg            0
RR                0
ss                0
ff_x              0
ddd_x             0
ff_avg            0
ddd_car       13739
station_id        0
dtype: int64

# Mengganti Tipe Data

## Cek tipe data

In [154]:
df.dtypes

date           object
Tn            float64
Tx            float64
Tavg          float64
RH_avg        float64
RR            float64
ss            float64
ff_x          float64
ddd_x         float64
ff_avg        float64
ddd_car        object
station_id      int64
dtype: object

In [156]:
df = df.astype({'Tn':int})

## Tipe Data 'Tn' Berubah

In [157]:
df.dtypes

date           object
Tn              int32
Tx            float64
Tavg          float64
RH_avg        float64
RR            float64
ss            float64
ff_x          float64
ddd_x         float64
ff_avg        float64
ddd_car        object
station_id      int64
dtype: object

# One-Hot Encoding

In [158]:
from sklearn.preprocessing import OneHotEncoder

In [159]:
onehot_encoder = OneHotEncoder(sparse=False)

In [160]:
onehot = onehot_encoder.fit_transform(df[['ddd_car']])

In [161]:
df_onehot = pd.DataFrame(onehot)

In [162]:
df_onehot.head(10)

Unnamed: 0,0
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


In [163]:
df = df.join(df_onehot)

In [164]:
df.head(10)

Unnamed: 0,date,Tn,Tx,Tavg,RH_avg,RR,ss,ff_x,ddd_x,ff_avg,ddd_car,station_id,0
0,01-01-2010,21,30.2,27.1,82.0,9.0,0.5,7.0,90.0,5.0,,96001,1.0
1,02-01-2010,21,29.6,25.7,95.0,24.0,0.2,6.0,90.0,4.0,,96001,1.0
2,03-01-2010,20,26.8,24.5,98.0,63.0,0.0,5.0,90.0,4.0,,96001,1.0
3,04-01-2010,21,29.2,25.8,90.0,0.0,0.1,4.0,225.0,3.0,,96001,1.0
4,05-01-2010,21,30.0,26.7,90.0,2.0,0.4,4.709601,188.488325,1.95668,,96001,1.0
5,06-01-2010,21,30.0,26.1,93.0,11.0,0.3,4.709601,188.488325,1.95668,,96001,1.0
6,07-01-2010,21,29.0,25.4,96.0,2.0,0.1,4.709601,188.488325,1.95668,,96001,1.0
7,08-01-2010,21,29.8,26.8,91.0,3.0,0.6,5.0,90.0,4.0,,96001,1.0
8,09-01-2010,21,30.2,26.7,92.0,3.0,0.7,4.0,90.0,3.0,,96001,1.0
9,10-01-2010,20,30.2,27.1,88.0,28.0,0.6,5.0,90.0,4.0,,96001,1.0
