# Exploring Different Techniques to Improve Performance

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
# Load the raw rainfall observations and preview ten random rows.
RAINFALL_CSV = "../dataset/Rainfall.csv"
data = pd.read_csv(RAINFALL_CSV)
data.sample(10)

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
176,25,1008.9,35.5,31.4,28.9,26.1,74,39,no,10.6,150.0,9.1
225,13,1007.9,29.6,28.0,26.4,25.6,87,84,yes,0.1,230.0,22.9
188,7,1005.9,34.0,30.2,27.9,25.6,77,53,yes,10.5,270.0,11.3
110,20,1014.6,22.9,21.6,20.2,19.5,88,89,yes,0.2,40.0,20.9
115,25,1009.8,28.4,26.0,23.9,24.3,90,83,yes,3.1,180.0,13.0
6,7,1021.8,21.4,18.8,17.0,15.0,79,56,no,3.4,30.0,21.5
159,8,1006.3,30.0,27.1,24.1,25.1,89,85,yes,3.1,190.0,12.6
161,10,1005.7,31.1,27.9,26.6,25.8,89,80,yes,4.5,220.0,14.6
217,5,1006.1,33.3,29.6,27.6,24.4,74,27,no,10.8,220.0,8.7
337,3,1019.4,22.3,20.8,19.6,16.5,77,93,yes,0.0,20.0,27.9


In [3]:
# Remove the `day` column, then preview ten random rows of the result.
data = data.drop(columns="day")
data.sample(10)

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
17,1017.1,17.8,15.2,11.9,11.1,76,49,no,3.9,50.0,28.4
160,1005.7,31.7,28.2,26.6,25.7,86,79,yes,6.5,,
303,1017.8,29.6,26.7,25.3,23.1,81,53,no,8.4,100.0,16.5
347,1016.4,21.0,20.4,19.7,17.8,85,83,yes,0.0,70.0,30.3
290,1013.4,30.8,26.2,23.6,19.5,69,17,no,10.5,70.0,12.4
7,1020.8,21.0,18.4,16.5,14.4,78,28,no,7.7,60.0,14.3
357,1017.8,22.3,21.1,19.8,19.6,91,79,yes,0.0,40.0,15.0
166,1005.3,32.0,30.3,29.3,26.3,79,86,yes,1.6,210.0,29.3
42,1013.4,21.6,19.2,17.6,18.5,95,86,yes,0.6,30.0,13.9
38,1023.9,19.6,14.8,11.6,4.3,52,0,no,10.2,40.0,16.7


In [4]:
# Encode the yes/no target as 1/0 on its own column.
# A frame-wide replace({"yes": 1, "no": 0}) relies on pandas silently
# downcasting the object column to integers, which emits a FutureWarning on
# pandas >= 2.2 and would leave any unexpected label in place as a string.
# Mapping the column explicitly avoids the deprecated downcast, and the
# int64 cast fails loudly if a label other than "yes"/"no" is present
# (unmapped values become NaN, which cannot be cast to an integer dtype).
rainfall_codes = {"yes": 1, "no": 0}
data = data.assign(
    rainfall=data["rainfall"].map(rainfall_codes).astype("int64")
)
data.sample(10)

  data = data.replace(


Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
333,1021.2,24.1,22.1,20.3,17.4,75,40,0,8.8,60.0,16.0
285,1019.0,24.9,23.1,21.0,16.8,67,88,1,0.0,50.0,28.2
189,1001.0,34.2,31.0,28.1,25.9,75,46,0,10.4,280.0,23.5
344,1015.5,20.9,19.3,17.6,16.8,86,80,1,5.1,20.0,10.2
18,1020.1,17.6,16.4,15.3,12.5,78,84,0,1.0,60.0,38.0
103,1005.5,25.1,21.8,20.9,21.6,98,93,1,0.0,40.0,14.5
301,1017.0,29.2,26.7,25.5,23.3,82,46,1,9.5,110.0,15.8
2,1019.7,20.3,19.3,18.0,18.4,95,91,1,0.0,40.0,14.2
129,1010.1,30.6,28.7,26.9,25.0,81,69,0,3.9,170.0,16.3
139,1009.9,28.1,25.5,24.0,22.5,84,85,1,3.5,70.0,26.4


In [5]:
# Inspect the column labels; the output below shows that several of them
# carry stray leading/trailing whitespace (e.g. 'pressure ', 'humidity ').
data.columns

Index(['pressure ', 'maxtemp', 'temparature', 'mintemp', 'dewpoint',
       'humidity ', 'cloud ', 'rainfall', 'sunshine', '         winddirection',
       'windspeed'],
      dtype='object')

In [6]:
# Normalise the column labels by trimming surrounding whitespace, then
# display them to confirm the clean names.
data = data.rename(columns=str.strip)
data.columns

Index(['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity',
       'cloud', 'rainfall', 'sunshine', 'winddirection', 'windspeed'],
      dtype='object')

In [7]:
# Count the missing values in each column.
data.isna().sum()

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
rainfall         0
sunshine         0
winddirection    1
windspeed        1
dtype: int64

In [8]:
# Impute the missing wind readings: the most frequent value (mode) for
# winddirection and the median for windspeed.
wind_fill_values = {
    "winddirection": data["winddirection"].mode()[0],
    "windspeed": data["windspeed"].median(),
}
data = data.fillna(wind_fill_values)

In [9]:
# Re-count the missing values per column to verify the imputation.
data.isna().sum()

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
rainfall         0
sunshine         0
winddirection    0
windspeed        0
dtype: int64

In [10]:
# Drop the extra temperature-related fields (keeping `temparature`) to
# avoid multicollinearity, then preview ten random rows.
redundant_temp_cols = ["maxtemp", "mintemp", "dewpoint"]
data = data.drop(columns=redundant_temp_cols)
data.sample(10)

Unnamed: 0,pressure,temparature,humidity,cloud,rainfall,sunshine,winddirection,windspeed
15,1013.5,16.4,95,93,1,0.0,60.0,40.0
123,1011.4,26.5,86,84,1,3.1,220.0,11.8
296,1013.0,27.0,73,50,0,8.3,80.0,9.9
304,1020.4,25.6,79,66,1,5.3,80.0,40.8
149,1007.9,29.1,84,77,1,3.6,210.0,19.0
36,1024.9,13.6,39,13,0,10.2,20.0,44.7
338,1021.1,19.5,75,88,1,0.6,50.0,37.9
234,1000.1,30.2,71,48,1,6.1,10.0,12.0
344,1015.5,19.3,86,80,1,5.1,20.0,10.2
226,1007.4,26.9,90,88,1,0.2,210.0,12.9
