# Import Packages

In [1]:
import pandas as pd
import numpy as np

# package for visualization
import seaborn as sns
import matplotlib.pyplot as plt

# package for data science
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Import Dataset

In [2]:
df_weather = pd.read_parquet('../data/interim/weather_data_final.parquet')

df_weather.sample(5)

Unnamed: 0,id,main,description,icon,dt,main.temp,main.feels_like,main.humidity,main.temp_min,main.temp_max,wind.speed,wind.deg,wind.gust,clouds.all,rain.1h
1886,803,Clouds,broken clouds,04d,2022-09-17 03:00:00+00:00,304.16,309.06,64,304.16,304.16,1.42,343,1.08,83,
233,804,Clouds,overcast clouds,04n,2022-07-10 16:00:00+00:00,296.93,297.5,82,296.93,296.93,0.91,178,0.97,90,
1217,803,Clouds,broken clouds,04d,2022-08-20 10:00:00+00:00,303.05,305.6,59,303.05,304.52,0.73,317,1.83,72,
1050,802,Clouds,scattered clouds,03n,2022-08-13 12:00:00+00:00,298.07,298.89,87,298.07,298.07,1.41,46,1.65,44,
2138,804,Clouds,overcast clouds,04n,2022-09-27 14:00:00+00:00,296.93,297.61,86,296.93,300.07,0.75,152,1.07,100,


In [3]:
df = pd.read_parquet('../data/interim/interim_data.parquet')

df.sample(5)

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,median_length,median_delay,median_speed_kmh,total_records,id,date,median_level,geometry,day_name,hour,weekday,isweekend,month,pub_holiday
88372,2022-08-29 07:00:00,32.71,KOTA BOGOR,N9 Otto Iskandardinata,3,702.0,127.0,13.15,8,36845873,2022-08-29,3.0,"MULTILINESTRING ((106.805156 -6.601706, 106.80...",Monday,7,0,0,8,0
12600,2022-07-14 17:00:00,32.71,KOTA BOGOR,Binamarga,1,619.0,61.0,15.13,11,33955763,2022-07-14,1.0,"MULTILINESTRING ((106.813635 -6.60058, 106.811...",Thursday,17,3,0,7,0
70123,2022-08-17 17:00:00,32.71,KOTA BOGOR,Pahlawan,3,635.0,88.0,15.705,14,36143447,2022-08-17,3.0,"MULTILINESTRING ((106.803668 -6.614671, 106.80...",Wednesday,17,2,0,8,1
64137,2022-08-13 18:00:00,32.71,KOTA BOGOR,N9 Jalan Raya Pajajaran,3,2090.0,210.0,15.75,88,35902996,2022-08-13,3.0,"MULTILINESTRING ((106.804723 -6.592909, 106.80...",Saturday,18,5,1,8,0
49293,2022-08-05 10:00:00,32.71,KOTA BOGOR,Kedunghalang Raya,2,950.0,78.0,19.19,4,35391006,2022-08-05,2.0,"MULTILINESTRING ((106.805846 -6.545258, 106.80...",Friday,10,4,0,8,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100529 entries, 0 to 100528
Data columns (total 19 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   time                       100529 non-null  datetime64[ns]
 1   kemendagri_kabupaten_kode  100529 non-null  float64       
 2   kemendagri_kabupaten_nama  100529 non-null  object        
 3   street                     100529 non-null  object        
 4   level                      100529 non-null  int64         
 5   median_length              100529 non-null  float64       
 6   median_delay               100529 non-null  float64       
 7   median_speed_kmh           100529 non-null  float64       
 8   total_records              100529 non-null  int64         
 9   id                         100529 non-null  int64         
 10  date                       100529 non-null  object        
 11  median_level               100529 non-null  float64 

In [5]:
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2366 entries, 0 to 2365
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   id               2366 non-null   int64              
 1   main             2366 non-null   object             
 2   description      2366 non-null   object             
 3   icon             2366 non-null   object             
 4   dt               2366 non-null   datetime64[ns, UTC]
 5   main.temp        2366 non-null   float64            
 6   main.feels_like  2366 non-null   float64            
 7   main.humidity    2366 non-null   int64              
 8   main.temp_min    2366 non-null   float64            
 9   main.temp_max    2366 non-null   float64            
 10  wind.speed       2366 non-null   float64            
 11  wind.deg         2366 non-null   int64              
 12  wind.gust        2354 non-null   float64            
 13  clouds.all       2

Format datetime in df not in UTC, hence the time will be set as UTC datetime

In [9]:
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S', utc=True)

In [11]:
df.sample(5)

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,median_length,median_delay,median_speed_kmh,total_records,id,date,median_level,geometry,day_name,hour,weekday,isweekend,month,pub_holiday
16692,2022-07-16 19:00:00+00:00,32.71,KOTA BOGOR,RE Martadinata,2,1365.0,108.0,13.955,16,34105836,2022-07-16,2.0,"MULTILINESTRING ((106.792117 -6.583234, 106.79...",Saturday,19,5,1,7,0
89166,2022-08-29 15:00:00+00:00,32.71,KOTA BOGOR,Laladon Raya,2,1032.0,89.0,14.4,2,36846640,2022-08-29,2.0,"LINESTRING (106.754526 -6.576533, 106.754518 -...",Monday,15,0,0,8,0
89464,2022-08-29 17:00:00+00:00,32.71,KOTA BOGOR,N8 Jalan Raya Bogor,1,984.0,60.0,24.49,2,36846916,2022-08-29,1.0,"LINESTRING (106.825415 -6.545147, 106.825749 -...",Monday,17,0,0,8,0
47043,2022-08-03 19:00:00+00:00,32.71,KOTA BOGOR,N9 Jalan Raya Tajur,1,1500.0,67.0,27.415,34,35270770,2022-08-03,1.0,"MULTILINESTRING ((106.824909 -6.63156, 106.824...",Wednesday,19,2,0,8,0
8531,2022-07-11 22:00:00+00:00,32.71,KOTA BOGOR,N9 Jalan Raya Dramaga,1,850.0,63.0,21.18,2,33772181,2022-07-11,1.0,"LINESTRING (106.734709 -6.564806, 106.735341 -...",Monday,22,0,0,7,0


The weather data gathered from openweather api have interval update time 1 hour, hence the time data from df will be rounded, eg: 22:45 will be rounded to 22:00

In [12]:
df['time_rounded'] = df['time'].dt.round(freq='1h')
df['hour_rounded'] = df['time_rounded'].dt.hour

df.sample(5)

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,median_length,median_delay,median_speed_kmh,total_records,id,...,median_level,geometry,day_name,hour,weekday,isweekend,month,pub_holiday,time_rounded,hour_rounded
79221,2022-08-23 19:00:00+00:00,32.71,KOTA BOGOR,Bumi Kencana Permai,1,847.0,145.0,9.35,11,36473972,...,1.0,"MULTILINESTRING ((106.783114 -6.534318, 106.78...",Tuesday,19,1,0,8,0,2022-08-23 19:00:00+00:00,19
92443,2022-08-31 15:00:00+00:00,32.71,KOTA BOGOR,Achmad Adnawijaya,2,602.0,89.5,12.92,6,36957150,...,2.0,"MULTILINESTRING ((106.816178 -6.587286, 106.81...",Wednesday,15,2,0,8,0,2022-08-31 15:00:00+00:00,15
59448,2022-08-11 07:00:00+00:00,32.71,KOTA BOGOR,N9 Jalan Raya Pajajaran,2,1578.0,95.0,24.43,25,35772715,...,2.0,"MULTILINESTRING ((106.814643 -6.61658, 106.814...",Thursday,7,3,0,8,0,2022-08-11 07:00:00+00:00,7
67684,2022-08-15 20:00:00+00:00,32.71,KOTA BOGOR,Kapten Muslihat,4,593.0,372.0,4.07,39,36028893,...,4.0,"MULTILINESTRING ((106.793427 -6.596729, 106.79...",Monday,20,0,0,8,0,2022-08-15 20:00:00+00:00,20
83209,2022-08-26 15:00:00+00:00,32.71,KOTA BOGOR,Merdeka,3,430.0,94.0,11.23,13,36638880,...,3.0,"MULTILINESTRING ((106.787708 -6.586892, 106.78...",Friday,15,4,0,8,0,2022-08-26 15:00:00+00:00,15


## Joining df with weather data

In [13]:
df.rename(columns={'time_rounded':'dt'}, inplace=True)

In [17]:
df_final = pd.merge(df, df_weather[['dt', 'main', 'rain.1h']], on='dt', how='left')

In [26]:
df_final.sample(5)

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,median_length,median_delay,median_speed_kmh,total_records,id,...,day_name,hour,weekday,isweekend,month,pub_holiday,dt,hour_rounded,main,rain.1h
43131,2022-08-01 12:00:00+00:00,32.71,KOTA BOGOR,N9 KS Tubun,3,543.0,82.0,10.360001,32,35154595,...,Monday,12,0,0,8,0,2022-08-01 12:00:00+00:00,12,Clouds,
37170,2022-07-29 14:00:00+00:00,32.71,KOTA BOGOR,N6 Jalan Raya Baru,2,1122.0,60.0,29.21,3,34943816,...,Friday,14,4,0,7,0,2022-07-29 14:00:00+00:00,14,Clouds,
86477,2022-08-28 04:00:00+00:00,32.71,KOTA BOGOR,Exit 40: Bogor/Kebun Raya,2,1179.0,75.0,31.595001,6,36786531,...,Sunday,4,6,1,8,0,2022-08-28 04:00:00+00:00,4,Clouds,
77894,2022-08-22 19:00:00+00:00,32.71,KOTA BOGOR,N9 Jalan Raya Dramaga,3,1175.0,261.0,11.52,46,36421607,...,Monday,19,0,0,8,0,2022-08-22 19:00:00+00:00,19,Clouds,
57173,2022-08-09 16:00:00+00:00,32.71,KOTA BOGOR,Haji Achmad Sobana,4,235.0,115.0,5.42,7,35674959,...,Tuesday,16,1,0,8,0,2022-08-09 16:00:00+00:00,16,Clouds,


In [25]:
# checking data with rain
df_final.query(r"main == 'Rain'")

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,median_length,median_delay,median_speed_kmh,total_records,id,...,day_name,hour,weekday,isweekend,month,pub_holiday,dt,hour_rounded,main,rain.1h
14,2022-07-06 04:00:00+00:00,32.71,KOTA BOGOR,Tol Jagorawi,2,2927.0,115.0,31.380,1,33469034,...,Wednesday,4,2,0,7,0,2022-07-06 04:00:00+00:00,4,Rain,0.16
15,2022-07-06 04:00:00+00:00,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),5,167.0,-1.0,0.000,60,33469033,...,Wednesday,4,2,0,7,0,2022-07-06 04:00:00+00:00,4,Rain,0.16
16,2022-07-06 05:00:00+00:00,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),5,167.0,-1.0,0.000,60,33469035,...,Wednesday,5,2,0,7,0,2022-07-06 05:00:00+00:00,5,Rain,0.12
17,2022-07-06 05:00:00+00:00,32.71,KOTA BOGOR,Merdeka,2,713.0,108.5,12.105,8,33469037,...,Wednesday,5,2,0,7,0,2022-07-06 05:00:00+00:00,5,Rain,0.12
18,2022-07-06 05:00:00+00:00,32.71,KOTA BOGOR,Mawar,3,241.0,106.0,6.010,9,33469036,...,Wednesday,5,2,0,7,0,2022-07-06 05:00:00+00:00,5,Rain,0.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100573,2022-09-04 23:00:00+00:00,32.71,KOTA BOGOR,Kapten Yusuf,3,402.0,106.0,9.080,11,37226041,...,Sunday,23,6,1,9,0,2022-09-04 23:00:00+00:00,23,Rain,0.35
100574,2022-09-04 23:00:00+00:00,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),5,167.0,-1.0,0.000,60,37226042,...,Sunday,23,6,1,9,0,2022-09-04 23:00:00+00:00,23,Rain,0.35
100575,2022-09-04 23:00:00+00:00,32.71,KOTA BOGOR,N8 Jalan Raya Bogor,1,1743.0,85.0,26.820,1,37226043,...,Sunday,23,6,1,9,0,2022-09-04 23:00:00+00:00,23,Rain,0.35
100576,2022-09-04 23:00:00+00:00,32.71,KOTA BOGOR,N8 Jalan Raya Bogor,2,1801.0,184.0,19.200,11,37226044,...,Sunday,23,6,1,9,0,2022-09-04 23:00:00+00:00,23,Rain,0.35


## Checking Null Value & Duplicate

### Duplicate Values

In [27]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100584 entries, 0 to 100583
Data columns (total 23 columns):
 #   Column                     Non-Null Count   Dtype              
---  ------                     --------------   -----              
 0   time                       100584 non-null  datetime64[ns, UTC]
 1   kemendagri_kabupaten_kode  100584 non-null  float64            
 2   kemendagri_kabupaten_nama  100584 non-null  object             
 3   street                     100584 non-null  object             
 4   level                      100584 non-null  int64              
 5   median_length              100584 non-null  float64            
 6   median_delay               100584 non-null  float64            
 7   median_speed_kmh           100584 non-null  float64            
 8   total_records              100584 non-null  int64              
 9   id                         100584 non-null  int64              
 10  date                       100584 non-null  object      

In [29]:
df_final.duplicated().sum()

55

In [31]:
df_final[df_final.duplicated()]

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,median_length,median_delay,median_speed_kmh,total_records,id,...,day_name,hour,weekday,isweekend,month,pub_holiday,dt,hour_rounded,main,rain.1h
2994,2022-07-08 00:00:00+00:00,32.71,KOTA BOGOR,N9 KH Soleh Iskandar,3,499.0,95.0,12.889999,2,33598481,...,Friday,0,4,0,7,0,2022-07-08 00:00:00+00:00,0,Clouds,
2996,2022-07-08 00:00:00+00:00,32.71,KOTA BOGOR,N9 Jalan Raya Pajajaran,2,1417.0,115.0,21.36,1,33598479,...,Friday,0,4,0,7,0,2022-07-08 00:00:00+00:00,0,Clouds,
2998,2022-07-08 00:00:00+00:00,32.71,KOTA BOGOR,Kolonel Ahmad Syam,3,508.0,92.5,12.860001,10,33598478,...,Friday,0,4,0,7,0,2022-07-08 00:00:00+00:00,0,Clouds,
3000,2022-07-08 00:00:00+00:00,32.71,KOTA BOGOR,Jalan Lingkar Dramaga,1,1393.0,86.0,18.73,1,33598476,...,Friday,0,4,0,7,0,2022-07-08 00:00:00+00:00,0,Clouds,
3002,2022-07-08 00:00:00+00:00,32.71,KOTA BOGOR,Tol Jagorawi,1,1178.0,61.0,31.86,3,33598482,...,Friday,0,4,0,7,0,2022-07-08 00:00:00+00:00,0,Clouds,
3004,2022-07-08 00:00:00+00:00,32.71,KOTA BOGOR,N9 KH Soleh Iskandar,2,1631.0,136.0,22.564999,2,33598480,...,Friday,0,4,0,7,0,2022-07-08 00:00:00+00:00,0,Clouds,
3006,2022-07-08 00:00:00+00:00,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),5,167.0,-1.0,0.0,61,33598477,...,Friday,0,4,0,7,0,2022-07-08 00:00:00+00:00,0,Clouds,
13006,2022-07-15 00:00:00+00:00,32.71,KOTA BOGOR,Semplak Raya,2,583.0,63.0,15.73,8,34019300,...,Friday,0,4,0,7,0,2022-07-15 00:00:00+00:00,0,Clouds,
13008,2022-07-15 00:00:00+00:00,32.71,KOTA BOGOR,N9 Jalan Raya Pajajaran,2,567.0,68.5,14.27,10,34019299,...,Friday,0,4,0,7,0,2022-07-15 00:00:00+00:00,0,Clouds,
13010,2022-07-15 00:00:00+00:00,32.71,KOTA BOGOR,Tol Jagorawi,1,2189.0,64.0,46.85,2,34019301,...,Friday,0,4,0,7,0,2022-07-15 00:00:00+00:00,0,Clouds,


In [35]:
df_final.drop_duplicates(inplace=True)

In [36]:
df_final.duplicated().sum()

0