# Rainfall Prediction Model Using Random Forest Algorithm

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import pickle

In [84]:
data = pd.read_csv("../dataset/Rainfall.csv")
data.head(10)

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1,1025.9,19.9,18.3,16.8,13.1,72,49,yes,9.3,80.0,26.3
1,2,1022.0,21.7,18.9,17.2,15.6,81,83,yes,0.6,50.0,15.3
2,3,1019.7,20.3,19.3,18.0,18.4,95,91,yes,0.0,40.0,14.2
3,4,1018.9,22.3,20.6,19.1,18.8,90,88,yes,1.0,50.0,16.9
4,5,1015.9,21.3,20.7,20.2,19.9,95,81,yes,0.0,40.0,13.7
5,6,1018.8,24.3,20.9,19.2,18.0,84,51,yes,7.7,20.0,14.5
6,7,1021.8,21.4,18.8,17.0,15.0,79,56,no,3.4,30.0,21.5
7,8,1020.8,21.0,18.4,16.5,14.4,78,28,no,7.7,60.0,14.3
8,9,1020.6,18.9,18.1,17.1,14.3,78,79,no,3.3,70.0,39.3
9,10,1017.5,18.5,18.0,17.2,15.5,85,91,yes,0.0,70.0,37.7


In [85]:
data.shape

(366, 12)

In [86]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   day                     366 non-null    int64  
 1   pressure                366 non-null    float64
 2   maxtemp                 366 non-null    float64
 3   temparature             366 non-null    float64
 4   mintemp                 366 non-null    float64
 5   dewpoint                366 non-null    float64
 6   humidity                366 non-null    int64  
 7   cloud                   366 non-null    int64  
 8   rainfall                366 non-null    object 
 9   sunshine                366 non-null    float64
 10           winddirection  365 non-null    float64
 11  windspeed               365 non-null    float64
dtypes: float64(8), int64(3), object(1)
memory usage: 34.4+ KB


In [87]:
data.duplicated().sum()

np.int64(0)

In [88]:
data.isnull().sum()

day                       0
pressure                  0
maxtemp                   0
temparature               0
mintemp                   0
dewpoint                  0
humidity                  0
cloud                     0
rainfall                  0
sunshine                  0
         winddirection    1
windspeed                 1
dtype: int64

In [89]:
# Assuming df is your DataFrame
data.columns = data.columns.str.strip()

In [90]:
data.isnull().sum()

day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
rainfall         0
sunshine         0
winddirection    1
windspeed        1
dtype: int64

In [91]:
data["rainfall"].value_counts()

rainfall
yes    249
no     117
Name: count, dtype: int64

In [92]:
percentages = data['rainfall'].value_counts(normalize=True) * 100
print(percentages)

rainfall
yes    68.032787
no     31.967213
Name: proportion, dtype: float64


In [93]:
data.sample(10)

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
207,26,1008.3,32.0,29.4,27.0,26.4,84,57,yes,7.4,60.0,17.5
363,29,1025.9,18.9,17.7,16.4,13.3,75,78,yes,4.6,70.0,33.4
339,5,1018.4,20.3,18.4,15.7,15.8,85,91,yes,0.1,70.0,41.3
39,9,1020.8,21.2,16.7,12.7,7.2,54,29,no,10.2,40.0,17.2
61,2,1023.8,20.6,16.6,14.4,11.4,72,32,no,10.6,60.0,27.1
201,20,1009.8,31.9,29.2,25.6,25.6,82,70,yes,6.4,220.0,22.6
128,8,1011.4,31.2,28.7,27.3,25.2,82,80,no,3.8,150.0,9.5
110,20,1014.6,22.9,21.6,20.2,19.5,88,89,yes,0.2,40.0,20.9
246,3,1013.0,29.0,27.6,25.8,25.8,90,87,yes,0.7,60.0,18.2
153,2,1006.0,33.0,30.3,28.7,26.3,79,58,no,10.0,230.0,27.4


In [94]:
# dropping the unwanted features
data = data.drop("day", axis=1)
data.sample(10)

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
87,1024.1,19.9,16.9,15.2,10.0,65,58,no,5.6,60.0,26.9
127,1010.6,31.0,28.7,27.1,25.1,81,69,no,4.7,170.0,12.1
329,1017.7,25.3,22.6,18.2,16.8,70,44,no,6.9,20.0,28.6
121,1011.5,23.4,22.2,20.4,20.9,92,90,yes,0.5,50.0,23.7
139,1009.9,28.1,25.5,24.0,22.5,84,85,yes,3.5,70.0,26.4
250,1007.2,29.6,28.3,26.7,25.9,87,82,yes,0.5,250.0,21.3
122,1011.0,29.8,25.6,22.8,23.6,89,82,yes,2.0,50.0,11.1
330,1020.6,21.2,18.2,15.8,7.8,52,46,no,3.5,20.0,34.3
284,1018.0,22.1,20.5,18.5,16.6,78,88,yes,0.0,20.0,38.3
97,1013.2,26.9,23.9,22.4,22.3,91,84,no,1.8,40.0,10.0


In [95]:
data.isnull().sum()

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
rainfall         0
sunshine         0
winddirection    1
windspeed        1
dtype: int64

In [96]:
data["winddirection"].unique()

array([ 80.,  50.,  40.,  20.,  30.,  60.,  70.,  10., 200., 220., 120.,
       190., 210., 300., 240., 180., 230.,  90., 170., 150., 100., 130.,
        nan, 160., 270., 280., 250., 260., 290., 350., 110., 140.])

In [97]:
data["winddirection"].median()

np.float64(70.0)

In [98]:
data["winddirection"].mode()

0    20.0
Name: winddirection, dtype: float64

In [99]:
data["windspeed"].median()

np.float64(20.5)

In [100]:
data["windspeed"].mean()

np.float64(21.53698630136986)

In [101]:
data["windspeed"].mode()

0    14.5
Name: windspeed, dtype: float64

In [102]:
# filling the missing values
data["winddirection"] = data["winddirection"].fillna(data["winddirection"].mode()[0])
data["windspeed"] = data["winddirection"].fillna(data["windspeed"].median())

In [103]:
data.isnull().sum()

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
rainfall         0
sunshine         0
winddirection    0
windspeed        0
dtype: int64

In [104]:
data["winddirection"].unique()

array([ 80.,  50.,  40.,  20.,  30.,  60.,  70.,  10., 200., 220., 120.,
       190., 210., 300., 240., 180., 230.,  90., 170., 150., 100., 130.,
       160., 270., 280., 250., 260., 290., 350., 110., 140.])

In [105]:
data["rainfall"].value_counts()

rainfall
yes    249
no     117
Name: count, dtype: int64

In [106]:
# encoding the rainfall feature into numerical: yes-->1, no-->0
data["rainfall"] = data["rainfall"].replace(
    {"yes": 1, "no": 0}
)

  data["rainfall"] = data["rainfall"].replace(


In [107]:
data["rainfall"].value_counts()

rainfall
1    249
0    117
Name: count, dtype: int64

In [108]:
sns.set_theme(
    style="whitegrid",
    context="notebook",  # or "paper", "talk", "poster"
    palette="deep",
    font="sans-serif",
    font_scale=1,
    rc=None
)

In [109]:
data.describe()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
count,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0
mean,1013.742623,26.191257,23.747268,21.894536,19.989071,80.177596,71.128415,0.680328,4.419399,101.284153,101.284153
std,6.414776,5.978343,5.632813,5.594153,5.997021,10.06247,21.798012,0.466988,3.934398,81.722827,81.722827
min,998.5,7.1,4.9,3.1,-0.4,36.0,0.0,0.0,0.0,10.0,10.0
25%,1008.5,21.2,18.825,17.125,16.125,75.0,58.0,0.0,0.5,40.0,40.0
50%,1013.0,27.75,25.45,23.7,21.95,80.5,80.0,1.0,3.5,70.0,70.0
75%,1018.1,31.2,28.6,26.575,25.0,87.0,88.0,1.0,8.2,190.0,190.0
max,1034.6,36.3,32.4,30.0,26.7,98.0,100.0,1.0,12.1,350.0,350.0


In [110]:
data.corr()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
pressure,1.0,-0.829088,-0.85178,-0.839851,-0.860232,-0.274907,0.005621,-0.089275,-0.198171,-0.656828,-0.656828
maxtemp,-0.829088,1.0,0.986193,0.961534,0.899829,0.021882,-0.289765,-0.101003,0.507416,0.625447,0.625447
temparature,-0.85178,0.986193,1.0,0.990187,0.936866,0.09203,-0.205895,-0.03736,0.420545,0.645266,0.645266
mintemp,-0.839851,0.961534,0.990187,1.0,0.944515,0.13904,-0.156097,-0.007697,0.368288,0.629738,0.629738
dewpoint,-0.860232,0.899829,0.936866,0.944515,1.0,0.425282,0.044635,0.139916,0.185234,0.623952,0.623952
humidity,-0.274907,0.021882,0.09203,0.13904,0.425282,1.0,0.655493,0.489623,-0.563579,0.099471,0.099471
cloud,0.005621,-0.289765,-0.205895,-0.156097,0.044635,0.655493,1.0,0.625766,-0.848334,-0.077037,-0.077037
rainfall,-0.089275,-0.101003,-0.03736,-0.007697,0.139916,0.489623,0.625766,1.0,-0.552519,-0.015058,-0.015058
sunshine,-0.198171,0.507416,0.420545,0.368288,0.185234,-0.563579,-0.848334,-0.552519,1.0,0.251374,0.251374
winddirection,-0.656828,0.625447,0.645266,0.629738,0.623952,0.099471,-0.077037,-0.015058,0.251374,1.0,1.0
