# 1. Importing library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import rcParams
rcParams['figure.figsize']=12,4

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [5]:
conda install -c anaconda py-xgboost

^C

Note: you may need to restart the kernel to use updated packages.


In [7]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

In [9]:
rain=pd.read_csv('weatherAUS.csv')
rain.head()

Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,1,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,3,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,4,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,5,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [10]:
rain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 25 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     145460 non-null  int64  
 1   Date           145460 non-null  object 
 2   Location       145460 non-null  object 
 3   MinTemp        143975 non-null  float64
 4   MaxTemp        144199 non-null  float64
 5   Rainfall       142199 non-null  float64
 6   Evaporation    82670 non-null   float64
 7   Sunshine       75625 non-null   float64
 8   WindGustDir    135134 non-null  object 
 9   WindGustSpeed  135197 non-null  float64
 10  WindDir9am     134894 non-null  object 
 11  WindDir3pm     141232 non-null  object 
 12  WindSpeed9am   143693 non-null  float64
 13  WindSpeed3pm   142398 non-null  float64
 14  Humidity9am    142806 non-null  float64
 15  Humidity3pm    140953 non-null  float64
 16  Pressure9am    130395 non-null  float64
 17  Pressure3pm    130432 non-nul

# Target variable = RainToday

## we are going to predict the rain will happen or not happen today

In [12]:
# Dropping the Rainfall column is a must because it records the amount of rain in millimeters.

cols_to_drop=['Date','Location','RainTomorrow','Rainfall']
rain.drop(cols_to_drop,axis=1,inplace=True)

In [13]:
# missing values/proportions

missing_props=rain.isna().mean(axis=0)
missing_props

Unnamed: 0       0.000000
MinTemp          0.010209
MaxTemp          0.008669
Evaporation      0.431665
Sunshine         0.480098
WindGustDir      0.070989
WindGustSpeed    0.070555
WindDir9am       0.072639
WindDir3pm       0.029066
WindSpeed9am     0.012148
WindSpeed3pm     0.021050
Humidity9am      0.018246
Humidity3pm      0.030984
Pressure9am      0.103568
Pressure3pm      0.103314
Cloud9am         0.384216
Cloud3pm         0.408071
Temp9am          0.012148
Temp3pm          0.024811
RainToday        0.022419
RISK_MM          0.022460
dtype: float64

In [14]:
# If the proportion is higher than 40% we will drop the columns

over_threshold=missing_props[missing_props>=0.4]
over_threshold

Evaporation    0.431665
Sunshine       0.480098
Cloud3pm       0.408071
dtype: float64

In [15]:
rain.drop(over_threshold.index,axis=1,inplace=True)

In [16]:
# x and y

x=rain.drop('RainToday',axis=1)
y=rain.RainToday

### categorical vars,impute missing vals by mode='Most_frequent',and encode them using(one-hot-encoding)

In [27]:
# importing the pipelines
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [18]:
categorical_pipeline=Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')),('oh-encode',OneHotEncoder(handle_unknown='ignore',sparse=False)),])

## For continuous/numeric vars,we will standardize and impute by median

In [21]:
from sklearn.preprocessing import StandardScaler

numeric_pipeline=Pipeline(steps=[('impute',SimpleImputer(strategy='mean')),('scale',StandardScaler())])

In [22]:
# after making pipelines, lets seprate categorical and continious variable

cat_cols=x.select_dtypes(exclude='number').columns
num_cols=x.select_dtypes(include='number').columns

In [23]:
# now see how we are using pipelines for transforming categorical and continious variables.

# first we make columnTransformer which will transform catg+numeric var by our pipelines

from sklearn.compose import ColumnTransformer

full_processor=ColumnTransformer(transformers=[('numeric',numeric_pipeline,num_cols),('categorical',categorical_pipeline,cat_cols)])

In [24]:
# now we will apply on x (predictors)
x_processed=full_processor.fit_transform(x)

In [None]:
# see now there are 60 columns (predictors!). x was 15 only!
# In one shot, all catg and cont vars treated!!! 

In [25]:
# y is having null values,  we will impute by mode and it is in series form which should convert in to array form, so we will reshape it

In [34]:
y_processed=SimpleImputer(strategy='most_frequent').fit_transform(y.values.reshape(-1,1))

In [None]:
# meaning of reshape(-1,1)
## our y is a series, which is transformed into an array bcz, if you see, x_processed in an array, so y should also be in array form.

In [35]:
# so far we have done
#(i) missing value treatment 
#(ii) one hod encodeing of catg vars
#(iii) reshaping of y as sn array

## Now time to go for XGB Modeling

# XGB Modeling

In [36]:
# first will split the data into train and test

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x_processed,y_processed,stratify=y_processed,random_state=1121218)

In [37]:
y.value_counts()  

No     110319
Yes     31880
Name: RainToday, dtype: int64

## No is 110319 and Yes is 31880 too imbalanced in data that's why stratify is used

In [39]:
# building model

from sklearn.metrics import accuracy_score
# Init classifier
xgb_cl=xgb.XGBClassifier()

In [41]:
# fit
xgb_cl.fit(x_train,y_train)
# predict
preds=xgb_cl.predict(x_test)



In [42]:
# score

accuracy_score(y_test,preds)

0.8601952426783995