In [None]:
# Imp libs:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Spliting data
from sklearn.model_selection import train_test_split,GridSearchCV

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
# Importing metrics
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Removing warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
file_loc = 'https://raw.githubusercontent.com/dsrscientist/dataset3/main/weatherAUS.csv'
df = pd.read_csv(file_loc)

In [None]:
print('First 10 rows')
df.head(10)

In [None]:
print('Last 10 rows')
df.tail(10)

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
null_data = df.isnull().sum()
null_data.sort_values(ascending=0)

In [None]:
df.dropna(axis=0,inplace=True)

In [None]:
df.shape

In [None]:
8425-3490

In [None]:
desc = df.describe().T
desc['range']=desc['max']-desc['min']
desc

In [None]:
df.describe(include='object').T

In [None]:
desc[['min','mean','max','range']]

In [None]:
"""EXPLORATORY DATA ANALYSIS

PROJECT NAME RAINFALL PREDICTION WEATHER FORECASTING

Dataset contains 8425 rows and 23 columns.

Column names mentioned below
    ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow']

Data type of columns mentioned below:
            Date              object
        Location          object
        MinTemp          float64
        MaxTemp          float64
        Rainfall         float64
        Evaporation      float64
        Sunshine         float64
        WindGustDir       object
        WindGustSpeed    float64
        WindDir9am        object
        WindDir3pm        object
        WindSpeed9am     float64
        WindSpeed3pm     float64
        Humidity9am      float64
        Humidity3pm      float64
        Pressure9am      float64
        Pressure3pm      float64
        Cloud9am         float64
        Cloud3pm         float64
        Temp9am          float64
        Temp3pm          float64
        RainToday         object
        RainTomorrow      object

Data set columns contains significant null values. Null values in each column are mentioned below
        Sunshine         3994
        Evaporation      3512
        Cloud3pm         2455
        Cloud9am         2421
        Pressure3pm      1312
        Pressure9am      1309
        WindGustDir       991
        WindGustSpeed     991
        WindDir9am        829
        WindDir3pm        308
        RainToday         240
        Rainfall          240
        RainTomorrow      239
        WindSpeed3pm      107
        Humidity3pm       102
        Temp3pm            96
        WindSpeed9am       76
        MinTemp            75
        MaxTemp            60
        Humidity9am        59
        Temp9am            56
        Location            0
        Date                0

TREATMENT OF NULL VALUES

        SINCE THE DATA IS TECHNICAL REGARDING THE VALUES OF RAINFALL, WE ARE NOT SURE ABOUT THE MISSING VALUES. THUS 
        THE BEST POSSIBLE WAY TO TREAT MISSING VALUES IS TO DROP THEM.

        Now we are left with 3790 rows of data after deleting null values.

STATISTICAL DESCRIPTION OF DATA SET

NUMERICAL COLUMNS
        For minimum temperature, the minimum value is minus 0.7 and the maximum value is 28.5.
        Data seems okay for minimum temperature and maximum temperature.
        For the rainfall column the minimum value is zero, which indicates there were periods with no rainfall at all.
        The Evaporation, Sunshine, Cloud9am and Cloud3pm columns also have zero as their minimum value. We need to 
        keep this in mind and check for skewness, if present.

CATEGORICAL COLUMNS
        For the Date column there are 3790 total entries, of which 1873 values are unique, and the date 2011-02-07 has 
        the highest frequency.

Perth airport location has most occurrences.

For wind gust direction, the north direction has the top frequency.
For wind direction at 9:00 a.m. and wind direction at 3:00 p.m. top frequencies are North and South respectively.
For column rain today and rain tomorrow no is the most occurring response.

CONTINUOUS COLUMNS AND DATA SET
        ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm'],

CATEGORICAL COLUMNS AND DATA SET
        ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm',
       'RainToday', 'RainTomorrow']

UNIVARIATE ANALYSIS:

MOST FREQUENT VALUES IN CATEGORICAL COLUMNS

        For Date , most frequent value is:  ModeResult(mode=array(['2011-01-19'], dtype=object), count=array([5])) 

        For Location , most frequent value is:  ModeResult(mode=array(['Melbourne'], dtype=object), count=array([1622])) 

        For WindGustDir , most frequent value is:  ModeResult(mode=array(['N'], dtype=object), count=array([713])) 

        For WindDir9am , most frequent value is:  ModeResult(mode=array(['N'], dtype=object), count=array([906])) 

        For WindDir3pm , most frequent value is:  ModeResult(mode=array(['SE'], dtype=object), count=array([813])) 

        For RainToday , most frequent value is:  ModeResult(mode=array(['No'], dtype=object), count=array([6195])) 

        For RainTomorrow , most frequent value is:  ModeResult(mode=array(['No'], dtype=object), count=array([6195])) 

UNIQUE VALUES AND THEIR COUNTS IN CATEGORICAL COLUMNS

        For column Date unique values are:  ['2008-12-01' '2008-12-02' '2008-12-03' ... '2013-06-06' '2013-06-07'
         '2013-06-08']
        For column Date count of unique values are:  3004 


        For column Location unique values are:  ['Albury' 'CoffsHarbour' 'Newcastle' 'Penrith' 'Williamtown' 'Wollongong'
         'Melbourne' 'Brisbane' 'Adelaide' 'PerthAirport' 'Darwin' 'Uluru']
        For column Location count of unique values are:  12 


        For column WindGustDir unique values are:  ['W' 'WNW' 'WSW' 'NE' 'NNW' 'N' 'NNE' 'SW' nan 'ENE' 'SSE' 'S' 'NW' 'SE'
         'ESE' 'E' 'SSW']
        For column WindGustDir count of unique values are:  16 


        For column WindDir9am unique values are:  ['W' 'NNW' 'SE' 'ENE' 'SW' 'SSE' 'S' 'NE' nan 'SSW' 'N' 'WSW' 'ESE' 'E'
         'NW' 'WNW' 'NNE']
        For column WindDir9am count of unique values are:  16 


        For column WindDir3pm unique values are:  ['WNW' 'WSW' 'E' 'NW' 'W' 'SSE' 'ESE' 'ENE' 'NNW' 'SSW' 'SW' 'SE' 'N' 'S'
         'NNE' nan 'NE']
        For column WindDir3pm count of unique values are:  16 


        For column RainToday unique values are:  ['No' 'Yes' nan]
        For column RainToday count of unique values are:  2 


        For column RainTomorrow unique values are:  ['No' 'Yes' nan]
        For column RainTomorrow count of unique values are:  2 

VALUE COUNTS OF EACH ENTRY IN CATEGORICAL COLUMNS

        For column -- Date -- value counts are: 
         2011-01-31    5
        2011-05-08    5
        2011-05-21    5
        2011-05-20    5
        2011-05-19    5
                     ..
        2013-01-28    1
        2013-01-29    1
        2013-01-30    1
        2013-01-31    1
        2013-06-08    1
        Name: Date, Length: 3004, dtype: int64 


        For column -- Location -- value counts are: 
         Melbourne       1622
        Williamtown     1230
        PerthAirport    1204
        Albury           907
        Newcastle        822
        CoffsHarbour     611
        Brisbane         579
        Penrith          482
        Wollongong       474
        Darwin           250
        Adelaide         205
        Uluru             39
        Name: Location, dtype: int64 


        For column -- WindGustDir -- value counts are: 
         N      713
        SSE    578
        S      577
        SW     572
        E      557
        WNW    531
        W      507
        WSW    504
        SE     484
        ENE    415
        SSW    396
        NW     383
        NE     353
        NNE    343
        ESE    302
        NNW    219
        Name: WindGustDir, dtype: int64 


        For column -- WindDir9am -- value counts are: 
         N      906
        SW     704
        NW     625
        WSW    543
        SE     505
        WNW    480
        SSW    467
        ENE    433
        NNE    430
        W      414
        NE     409
        S      402
        E      380
        SSE    365
        NNW    280
        ESE    253
        Name: WindDir9am, dtype: int64 


        For column -- WindDir3pm -- value counts are: 
         SE     813
        S      742
        SSE    623
        WSW    580
        NE     544
        N      524
        SW     494
        WNW    487
        NW     468
        W      462
        ESE    462
        E      460
        ENE    417
        SSW    370
        NNE    365
        NNW    306
        Name: WindDir3pm, dtype: int64 


        For column -- RainToday -- value counts are: 
         No     6195
        Yes    1990
        Name: RainToday, dtype: int64 


        For column -- RainTomorrow -- value counts are: 
         No     6195
        Yes    1991
        Name: RainTomorrow, dtype: int64 


# Univariate analysis:

        COUNT PLOT ANALYSIS SUMMARY
        Among the locations contributing rainfall data, the Melbourne and Perth Airport locations contributed the maximum 
        number of records, and Williamtown and Darwin contributed the least.

        For wind gust direction, the maximum counts are for the east, southwest and north directions respectively.

        For wind direction at 9:00 a.m., the maximum counts are for the north and southwest directions respectively.

        For wind direction at 3:00 p.m. south direction has highest number of counts.

        For most Number of days there was no rainfall today and tomorrow.

BIVARIATE ANALYSIS
        Cat plot analysis summary:
        For cloud 9:00 a.m. and cloud 3:00 p.m., the maximum possibility of rainfall is when the reading is 7 in the cloud 9:00 a.m. column.

BOX PLOT SUMMARY

        For the temp 3:00 p.m. column and the temp 9:00 a.m. column, for the Melbourne and Perth Airport locations the
        possibility of rainfall tomorrow is high. Melbourne and Perth Airport location data contain a lot of outliers as well.
        Cloud 9:00 a.m. data indicates that for Coffs Harbour and Perth Airport the possibility of rainfall tomorrow is high. Here also 
        the Melbourne data contains outliers.
        The wind speed 9:00 a.m. box plot with location and rain tomorrow indicates that Melbourne and Brisbane data have lots of outliers. 
        For low Sunshine values, typically for all locations, the possibility of rain tomorrow is high.
        For high evaporation values at the Williamtown location, the indication is no rain tomorrow.
        For max temperature values ranging from 15 to 30, the possibility of rainfall is high for the Melbourne and Perth Airport 
        locations, where Perth Airport contains a lot of outliers.
        The possibility of rainfall at Coffs Harbour and Brisbane is high when the max temperature is between 20 and 25.
        Proportionately, as for the rainfall tomorrow data, the possibility of rainfall today is quite high for the Melbourne and Perth 
        Airport locations.
        For evaporation values ranging from 5 to 15, the possibility of rain today is quite high for Williamtown.
        For Sunshine values ranging from 2 to 8, the possibility of rainfall is quite high for nearly all locations.
        For humidity 9:00 a.m. values above 70, a high possibility of rainfall can be seen for nearly all locations.
        For the Darwin location, the possibility of rainfall today is high for low pressure 9:00 a.m. values as compared to other locations.

STRIP PLOT SUMMARY
        Rainfall today and tomorrow can be expected considerably for locations Melbourne Perth airport and coffs harbour.
        Very less rainfall can be expected for William town and Brisbane and Darwin location.

ANALYSING RELATIONSHIPS USING LM PLOT

        Rain today has a positive relationship with min temp and humidity 9:00 a.m. , min temp and humidity 3:00 p.m. 
        cloud 9:00 a.m. and mintemp and temp 9:00 a.m.
        Pressure and cloud columns with evaporation rainfall show neutral relationship.
        Sunshine feature has negative relationship with the rain today and rain tomorrow nearly for all other features as well.

MULTIVARIATE ANALYSIS USING PAIR PLOT

CHECKING CORRELATION BETWEEN FEATURES AND TARGET VARIABLE

        Sunshine evaporation and max temperature have negative correlation with rainfall column.

        Humidity columns and cloud columns have high positive correlation with rainfall.

ANALYSING MULTI COLLINEARITY PROBLEM
        Wind speed 9:00 a.m., wind speed 3:00 p.m. and wind gust speed are highly correlated to each other. 
        Temperature columns are also highly related to each other, and thus the value of one column can be used to predict 
        the value of another column. Thus we can conclude that a multicollinearity problem exists.

USING VARIANCE INFLATION FACTOR
        Min temperature, max temperature, evaporation, Sunshine, the wind speed columns, the humidity and pressure columns and nearly 
        all of the other columns have high variance inflation factor values, and thus we need to treat them.

FURTHER STEPS PERFORMED
        Selection of features and target column
        Encoding categorical columns
        Normalising data using power transform
        Feature selection using principal component analysis
        Splitting training and testing data
        Model instantiation
        Model validation
        Selecting best model and hyperparameter boosting
        Result:
            Training accuracy:  96.65485157288435
            Testing accuracy:  84.97913769123782
            Confusion matrix: 
             [[521  27]
             [ 81  90]]
            classification report:                precision    recall  f1-score   support

                       0       0.87      0.95      0.91       548
                       1       0.77      0.53      0.62       171

                accuracy                           0.85       719
               macro avg       0.82      0.74      0.77       719
            weighted avg       0.84      0.85      0.84       719
        Saving the model."""

In [None]:
cont_data = df.select_dtypes(include=['int64','float64'])

cat_data= df.select_dtypes(include=['object'])

cont_columns = cont_data.columns

cat_columns = cat_data.columns

In [None]:
cont_columns

In [None]:
cat_columns

In [None]:
from scipy import stats

# Most frequent value of every categorical column.
# pandas' Series.mode is used because scipy.stats.mode is meant for numeric
# arrays and current SciPy releases reject object (string) data.
for i in cat_columns:
    # mode() returns the tied modes in sorted order; take the first one.
    most_frequent = df[i].mode().iloc[0]
    occurrences = (df[i] == most_frequent).sum()
    print('For',i,', most frequent value is: ',most_frequent,'( count:',occurrences,')','\n')

In [None]:
for i in cat_columns:
    print('For column',i,'unique values are: ',df[i].unique())
    print('For column',i,'count of unique values are: ',df[i].nunique(),'\n\n')

In [None]:
for i in cat_columns:
    print('For column --',i,'-- value counts are: \n',df[i].value_counts(),'\n\n')

In [None]:
# Univariate analysis: one count plot per categorical column.

for col in cat_columns:
    # Explicit figure/axes pair; the count plot is drawn onto that axes.
    fig, ax = plt.subplots(figsize=(12,5))
    sns.countplot(x=df[col], data=df, ax=ax)
    # Rotate the category labels so long names do not overlap.
    plt.xticks(rotation=90)

In [None]:
for i in cont_columns:
    f= plt.figure(figsize=(20,20))
    ax = sns.catplot(x=i, kind="count",hue = 'RainTomorrow', data=df)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
df.columns

In [None]:
# Box plot of each measurement per location, split by RainTomorrow.
# One loop replaces fifteen copy-pasted cells; the columns and their order are
# exactly those of the original cells.
location_box_columns = [
    'Temp3pm', 'Temp9am', 'Cloud3pm', 'Cloud9am', 'Pressure9am', 'Pressure3pm',
    'Humidity9am', 'WindSpeed3pm', 'WindSpeed9am', 'WindGustSpeed', 'Sunshine',
    'Evaporation', 'Rainfall', 'MaxTemp', 'MinTemp',
]

for col in location_box_columns:
    # A fresh figure per column keeps the plots from being drawn on top of each other.
    fig, ax = plt.subplots()
    sns.boxplot(x='Location', y=col, data=df, hue='RainTomorrow', ax=ax)
    ax.set_title(col + ' by Location and RainTomorrow')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.boxplot(x ='Location', y =i, data = df, hue ='RainToday')
    plt.show()

In [None]:
cat_columns

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.boxplot(x ='WindGustDir', y =i, data = df, hue ='RainTomorrow')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.boxplot(x ='WindDir9am', y =i, data = df, hue ='RainTomorrow')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.boxplot(x ='WindDir3pm', y =i, data = df, hue ='RainTomorrow')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.boxplot(x ='WindGustDir', y =i, data = df, hue ='RainToday')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.boxplot(x ='WindDir9am', y =i, data = df, hue ='RainToday')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.boxplot(x ='WindDir3pm', y =i, data = df, hue ='RainToday')
    plt.show()

In [None]:
# Violin plots

In [None]:
cat_columns

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.violinplot(x ='WindGustDir', y =i, data = df, hue ='RainTomorrow')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.violinplot(x ='WindDir9am', y =i, data = df, hue ='RainTomorrow')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.violinplot(x ='WindDir3pm', y =i, data = df, hue ='RainTomorrow')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.violinplot(x ='WindGustDir', y =i, data = df, hue ='RainToday')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.violinplot(x ='WindDir9am', y =i, data = df, hue ='RainToday')
    plt.show()

In [None]:
for i in cont_columns:
    plt.figure(figsize=(15,10))
    sns.violinplot(x ='WindDir3pm', y =i, data = df, hue ='RainToday')
    plt.show()

In [None]:
#Lineplot

In [None]:
cat_columns

In [None]:
for i in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']:
    for j in cont_columns:
        plt.figure(figsize=(15,10))
        sns.stripplot(x = j, y =i, data = df, hue = 'RainToday')
        plt.show()

In [None]:
for i in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']:
    for j in cont_columns:
        plt.figure(figsize=(15,10))
        sns.stripplot(x = j, y =i, data = df, hue = 'RainTomorrow')
        plt.show()

In [None]:
# Lineplot

In [None]:
for i in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']:
    for j in cont_columns:
        plt.figure(figsize=(15,10))
        sns.lineplot(x = j, y =i, data = df, hue = 'RainTomorrow')
        plt.show()

In [None]:
for i in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']:
    for j in cont_columns:
        plt.figure(figsize=(15,10))
        sns.lineplot(x = j, y =i, data = df, hue = 'RainToday')
        plt.show()

In [None]:
cont_columns

In [None]:
# Furthur analysing relationships

In [None]:
for i in ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine','WindGustSpeed', 'WindSpeed9am']:
    for j in ['WindSpeed3pm', 'Humidity9am','Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm','Temp9am', 'Temp3pm']:
        plt.figure(figsize=(15,10))
        sns.lmplot(x = j,y=i, data = df, hue = 'RainToday')
        plt.show()

In [None]:
for i in ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine','WindGustSpeed', 'WindSpeed9am']:
    for j in ['WindSpeed3pm', 'Humidity9am','Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm','Temp9am', 'Temp3pm']:
        plt.figure(figsize=(15,10))
        sns.lmplot(x = j,y=i, data = df, hue = 'RainTomorrow')
        plt.show()

In [None]:
# Current number of rows in df.
df.shape[0]

In [None]:
df.shape[0]

In [None]:
# NOTE(review): hard-coded row counts — presumably the rows before (3790) and
# after (3592) the z-score outlier filter applied further down. Confirm, and
# derive the figures from df so they cannot go stale.
3790-3592

In [None]:
# The difference above (198) as a percentage of 3790 rows.
198/3790*100

In [None]:
sns.pairplot(df,hue='RainTomorrow')

In [None]:
# Correlation between features and label:

# Replacing attrition column values:

df.drop(columns = 'Rainfall',axis = 1).corrwith(df.Rainfall).plot(kind='bar',grid=True,figsize=(10,7),title='corelation between features and labels')
plt.show()

In [None]:
# Checking relationship between independent and dependent variable:

df_corr = df.corr().abs()
plt.figure(figsize=(18,14))
sns.heatmap(df_corr,annot=True,annot_kws={'size':10})
plt.show()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every numeric column, computed against the other
# numeric columns and listed next to the column name.
numeric_block = df[cont_columns]

vif = pd.DataFrame({
    'vif': [variance_inflation_factor(numeric_block, position) for position in range(numeric_block.shape[1])],
    'features': numeric_block.columns,
})
vif

In [None]:
# Using Z statistics to detect and remove outlier rows:

from scipy.stats import zscore

n_rows_before_filter = df.shape[0]

z_score = zscore(df[cont_columns])

abs_z_score = np.abs(z_score)

# A row is KEPT only when every numeric value lies within 3 standard deviations
# of its column mean; rows with any value outside that band are removed.
filtering_entry = (abs_z_score < 3).all(axis=1)

# .copy() yields an independent frame, so later column assignments on df do not
# write into a slice of the unfiltered frame.
df = df[filtering_entry].copy()

# Report the data lost to the filter instead of working it out by hand.
n_rows_removed = n_rows_before_filter - df.shape[0]
print('Rows removed as outliers:', n_rows_removed,
      '(', round(n_rows_removed / max(n_rows_before_filter, 1) * 100, 2), '% )')

df.describe()

In [None]:
# Splitting data into features and label:

y1 = df['Rainfall']
X_pre = df.drop(['Rainfall'],axis=1)

In [None]:
X_dums = pd.get_dummies(X_pre)

In [None]:
X_dums.shape

In [None]:
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()

X_trans1 = pt.fit_transform(X_dums)

In [None]:
from sklearn.decomposition import PCA
# Using PCA i.e. Principal Component Analysis that is a diamensionallity reduction technique:

pca1 = PCA()
pca1.fit_transform(X_trans1)

In [None]:
# Using Scree Plot to identify best components:

plt.figure()
plt.plot(np.cumsum(pca1.explained_variance_ratio_))
plt.xlabel('Principal Components')
plt.ylabel('Variance Covered')
plt.title('PCA')
plt.show()

In [None]:
pca1 = PCA(n_components=1750)
new_pcomp1 = pca1.fit_transform(X_trans1)
princi_comp1 = pd.DataFrame(new_pcomp1)
princi_comp1

In [None]:
from sklearn.linear_model import LinearRegression
import xgboost as xgb

# Spliting data
from sklearn.model_selection import train_test_split,GridSearchCV

# Importing metrics
from sklearn import metrics
from sklearn.metrics import r2_score,mean_squared_error

# Removing warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Splitting our data to training data and testing data
# x_train,x_test,y_train,y_test

x_train1,x_test1,y_train1,y_test1 = train_test_split(princi_comp1,y1,test_size=0.20,random_state=1)

# Here we are keeping training data as our scalled data and testing data as our label or target.

In [None]:
# MOdel instantiating and training

rm = LinearRegression()
rm.fit(x_train1,y_train1) 
# here we will pass training data

y_pred = rm.predict(x_test1)

print('Training score', rm.score(x_train1,y_train1))

print(" Testing Score: ", rm.score(x_test1, y_test1))

plt.scatter(y_test1,y_pred)
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.title('Actual vs Model Predicted')
plt.show()

# Model Evaluation: MAE , MSE , RMSE

from sklearn.metrics import mean_absolute_error,mean_squared_error

In [None]:
from sklearn.linear_model import Ridge,Lasso,RidgeCV,LassoCV

lassocv = LassoCV(alphas = None , max_iter = 100, normalize = True)
lassocv.fit(x_train1,y_train1)

# Best alpha parameter
alpha = lassocv.alpha_ # Best alpha rate
print(alpha)

# Now since we have the best parameter, lasso regression will be used:
lasso_reg = Lasso(alpha)
lasso_reg.fit(x_train1,y_train1)
# i.e. when model is training it will learn at this speed 6....

In [None]:
lasso_reg.score(x_test1,y_test1)

In [None]:
# Ridge Method:

ridgecv = RidgeCV(alphas = np.arange(0.001,0.1,0.01),normalize = True)
ridgecv.fit(x_train1,y_train1)

In [None]:
ridgecv.alpha_ # Best alpha rate

In [None]:
ridge_model = Ridge(alpha = ridgecv.alpha_)
ridge_model.fit(x_train1,y_train1)

In [None]:
ridge_model.score(x_test1,y_test1)

In [None]:
# Model performance of the linear regression on the held-out rows.

mse_linear = metrics.mean_squared_error(y_test1, y_pred)

print("MAE: ", metrics.mean_absolute_error(y_test1, y_pred))
print("MSE: ", mse_linear)
# RMSE is taken as the square root of the MSE: the `squared=False` shortcut is
# deprecated and has been removed from recent scikit-learn releases.
print("RMSE: ", np.sqrt(mse_linear))
print("R2: ", metrics.r2_score(y_test1, y_pred), "\n")
print("Score: ", rm.score(x_test1, y_test1))

In [None]:
# Using decision tree:

from sklearn.tree import DecisionTreeRegressor

# fit() returns the estimator itself, so construction and training chain into one statement.
model = DecisionTreeRegressor().fit(x_train1, y_train1)

# R2 of the tree's predictions on the held-out rows.
y_preddt = model.predict(x_test1)

metrics.r2_score(y_test1, y_preddt)


In [None]:
# Random forest:

from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators = 100)

regressor_rf.fit(x_train1, y_train1)

lr_normal_rf = regressor_rf.score(x_train1, y_train1)

lr_normal_rf

In [None]:
y_predrf = regressor_rf.predict(x_test1)

lr_normal_rf_test = regressor_rf.score(x_test1, y_test1)

lr_normal_rf_test

mse_lr_normal_rf  = mean_absolute_error(y_test1, y_predrf)

mse_lr_normal_rf

In [None]:
# Model Evaluation of the random forest: MAE , MSE , RMSE

from sklearn.metrics import mean_absolute_error,mean_squared_error

mse_rf = metrics.mean_squared_error(y_test1, y_predrf)

print("MAE: ", metrics.mean_absolute_error(y_test1, y_predrf))
print("MSE: ", mse_rf)
# Square root of the MSE; avoids the deprecated `squared=False` argument.
print("RMSE: ", np.sqrt(mse_rf))
print("R2: ", metrics.r2_score(y_test1, y_predrf), "\n")
# Score against the true targets. Scoring against the model's own predictions
# compares them with themselves and always yields 1.0.
print("Score: ", regressor_rf.score(x_test1, y_test1))

In [None]:
# Using Support vector regressor:

from sklearn.svm import SVR

# Train on the training rows; fit() hands back the fitted estimator.
svr = SVR().fit(x_train1, y_train1)

# R2 of the predictions on the held-out rows.
y_predsvr = svr.predict(x_test1)

metrics.r2_score(y_test1, y_predsvr)

In [None]:
# Using XGBoost:

# Only the modelling hyperparameters from the previously pasted estimator repr
# are set explicitly. Runtime and bookkeeping arguments from that repr (e.g.
# gpu_id, predictor, callbacks) are left at the library defaults because they
# are tied to the XGBoost release that printed them.
# The earlier throwaway fit of a default model is gone: it was overwritten
# straight away by this estimator.
xgb_clf = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=6,
             min_child_weight=1, n_estimators=100, n_jobs=0,
             random_state=0, reg_alpha=0, reg_lambda=1)

xgb_clf.fit(x_train1,y_train1)

# R2 of the predictions on the held-out rows.
y_predx = xgb_clf.predict(x_test1)

r2_score(y_test1,y_predx)

In [None]:
# Hyperparameter tuning for xgboost model

params = {"learning_rate"    : [0.05, 0.10] ,
         "max_depth"        : [ 3, 5, 8, 12]}

grd = GridSearchCV(xgb_clf,param_grid=params)



In [None]:
xgb_boosted = grd.fit(x_train1,y_train1)

y_predxx = xgb_boosted.predict(x_test1)

r2_score(y_test1,y_predxx)

In [None]:
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor()

ada.fit(x_train1,y_train1)

In [None]:
y_predada = ada.predict(x_test1)

In [None]:
r2_score(y_test1,y_predada)

In [None]:
# HyperParameter tuning using Randomised Search CV :

from sklearn.model_selection import RandomizedSearchCV

params = {'n_estimators':[47,50,60,70],'learning_rate':[0.25,0.3,0.4]}

rnd = RandomizedSearchCV(ada,param_distributions=params)

rnd.fit(x_train1,y_train1)

In [None]:
rnd.best_estimator_

In [None]:
ada_boosted = AdaBoostRegressor(learning_rate=0.25)

ada_boosted.fit(x_train1,y_train1)

yb_pred = ada_boosted.predict(x_test1)

In [None]:
# Model Evaluation of the tuned AdaBoost model: MAE , MSE , RMSE

from sklearn.metrics import mean_absolute_error,mean_squared_error

mse_ada = metrics.mean_squared_error(y_test1, yb_pred)

print("MAE: ", metrics.mean_absolute_error(y_test1, yb_pred))
print("MSE: ", mse_ada)
# Square root of the MSE; avoids the deprecated `squared=False` argument.
print("RMSE: ", np.sqrt(mse_ada))
print("R2: ", metrics.r2_score(y_test1, yb_pred), "\n")
# Score against the true targets. Scoring against the model's own predictions
# compares them with themselves and always yields 1.0.
print("Score: ", ada_boosted.score(x_test1, y_test1))


In [None]:
# Creating pipeline:

from sklearn.pipeline import Pipeline

pipe1 = Pipeline([('pt',PowerTransformer()),('pca',PCA(n_components=1750)),('base_model1',xgb.XGBRegressor())])

pipe1.fit(x_train1,y_train1)

y_pred1 = pipe1.predict(x_test1)

metrics.r2_score(y_test1, y_pred1)


In [None]:
# Saving regression model to pickle string

import pickle 
saved_model1 = pickle.dumps(pipe1) 
pipe_pickle1 = pickle.loads(saved_model1)
pipe_pickle1.predict(X_test) # predicting testing data


In [None]:
# Using label encoder to encode dependent variable column:

# Import label encoder
from sklearn import preprocessing
  
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column 'species'.
df['RainTomorrow']= label_encoder.fit_transform(df['RainTomorrow'])
  
df['RainTomorrow'].unique()

In [None]:
# Correlation between features and label:

# Replacing attrition column values:

df.drop(columns = 'RainTomorrow',axis = 1).corrwith(df.RainTomorrow).plot(kind='bar',grid=True,figsize=(10,7),title='corelation between features and labels')
plt.show()

In [None]:
# Using label encoder to encode dependent variable column:

# Import label encoder
from sklearn import preprocessing
  
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column 'species'.
df['RainTomorrow']= label_encoder.fit_transform(df['RainTomorrow'])
  
df['RainTomorrow'].unique()

In [None]:
# Splitting data into features and label:

y = df['RainTomorrow']
X = df.drop(['RainTomorrow'],axis=1)

In [None]:
y

In [None]:
X.columns

In [None]:
X_dummies = pd.get_dummies(X)

In [None]:
X_dummies

In [None]:
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()

X_trans = pt.fit_transform(X_dummies)

In [None]:
from sklearn.decomposition import PCA
# Using PCA i.e. Principal Component Analysis that is a diamensionallity reduction technique:

pca = PCA()
pca.fit_transform(X_trans)

In [None]:
# Using Scree Plot to identify best components:

plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Principal Components')
plt.ylabel('Variance Covered')
plt.title('PCA')
plt.show()


In [None]:
X_trans.shape

In [None]:
pca = PCA(n_components=1750)
new_pcomp = pca.fit_transform(X_trans)
princi_comp = pd.DataFrame(new_pcomp)
princi_comp

In [None]:
princi_comp.skew()

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    princi_comp, y, test_size=0.2, random_state=0
)

# Report the shape of every split.
for split_name, split_data in (("X_train", X_train), ("y_train", y_train),
                               ("X_test", X_test), ("y_test", y_test)):
    print("Number transactions {} dataset: ".format(split_name), split_data.shape)

In [None]:
# Balance the training classes with SMOTE. Only the training split is
# resampled; the test split keeps its original class distribution.
print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((y_train == 0).sum()))

sm = SMOTE(random_state=2)
# Series.ravel() is deprecated in pandas 2.2; to_numpy() yields the same 1-D array.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

In [None]:
# Count how many resampled training rows fall in each class.
arr = np.array(y_train_res)
un, co = np.unique(arr, return_counts=True)

{label: count for label, count in zip(un, co)}

# Models

In [None]:
# Logistic Regression: fit on the SMOTE-resampled training data and evaluate
# on the untouched test split.

log_reg = LogisticRegression(random_state=1)
log_reg.fit(X_train_res,y_train_res)

pred_train = log_reg.predict(X_train_res)
y_pred = log_reg.predict(X_test)

# Report train/test accuracy like the other model cells do; previously both
# values were computed but never shown.
acc = accuracy_score(y_test,y_pred)
print('Training accuracy: ', accuracy_score(y_train_res,pred_train)*100)
print('Testing accuracy: ', acc*100)

confusion_mat = confusion_matrix(y_test,y_pred)
print('Confusion matrix: \n',confusion_mat)

clr = classification_report(y_test,y_pred)
print('classification report: ' ,clr)

In [None]:
# Decision Tree Classifier: fit on the resampled training data, then report
# train/test accuracy, the confusion matrix and the classification report.

dtc = DecisionTreeClassifier(random_state=1)
dtc.fit(X_train_res, y_train_res)

pred_train_dtc = dtc.predict(X_train_res)
y_pred = dtc.predict(X_test)

train_acc = accuracy_score(y_train_res, pred_train_dtc)
acc = accuracy_score(y_test, y_pred)
print('Training accuracy: ', train_acc*100)
print('Testing accuracy: ', acc*100)

confusion_mat = confusion_matrix(y_test, y_pred)
clr = classification_report(y_test, y_pred)
print('Confusion matrix: \n', confusion_mat)
print('classification report: ', clr)

In [None]:
# Random forest classifier: fit on the resampled training data and evaluate
# on the test split.

# A fixed random_state makes the forest (bootstrap samples and feature
# sub-sampling) reproducible across Restart & Run All.
rfc_f = RandomForestClassifier(random_state=1)
rfc_f.fit(X_train_res,y_train_res)

pred_train_rfc_f = rfc_f.predict(X_train_res)
y_pred = rfc_f.predict(X_test)

acc = accuracy_score(y_test,y_pred)
print('Training accuracy: ', accuracy_score(y_train_res,pred_train_rfc_f)*100)
print('Testing accuracy: ', acc*100)

confusion_mat = confusion_matrix(y_test,y_pred)
print('Confusion matrix: \n',confusion_mat)

clr = classification_report(y_test,y_pred)
print('classification report: ' ,clr)

In [None]:
# Support Vector Classifier with scikit-learn's default hyper-parameters:
# fit on the resampled training data and evaluate on the test split.

svc = SVC()
svc.fit(X_train_res, y_train_res)

pred_train_svc = svc.predict(X_train_res)
y_pred = svc.predict(X_test)

train_acc = accuracy_score(y_train_res, pred_train_svc)
acc = accuracy_score(y_test, y_pred)
print('Training accuracy: ', train_acc*100)
print('Testing accuracy: ', acc*100)

confusion_mat = confusion_matrix(y_test, y_pred)
clr = classification_report(y_test, y_pred)
print('Confusion matrix: \n', confusion_mat)
print('classification report: ', clr)

In [None]:
# K-nearest-neighbours classifier (default settings): fit on the resampled
# training data and report train/test accuracy.

knn = KNeighborsClassifier()
knn.fit(X_train_res, y_train_res)

pred_train_knn = knn.predict(X_train_res)
y_pred = knn.predict(X_test)

train_acc = accuracy_score(y_train_res, pred_train_knn)
acc = accuracy_score(y_test, y_pred)
print('Training accuracy: ', train_acc*100)
print('Testing accuracy: ', acc*100)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting Classifier (seeded): fit on the resampled training data,
# then report train/test accuracy, confusion matrix and classification report.

gbdt_clf = GradientBoostingClassifier(random_state=3)
gbdt_clf.fit(X_train_res, y_train_res)

pred_train_gbdt_clf = gbdt_clf.predict(X_train_res)
y_pred = gbdt_clf.predict(X_test)

train_acc = accuracy_score(y_train_res, pred_train_gbdt_clf)
acc = accuracy_score(y_test, y_pred)
print('Training accuracy: ', train_acc*100)
print('Testing accuracy: ', acc*100)

confusion_mat = confusion_matrix(y_test, y_pred)
clr = classification_report(y_test, y_pred)
print('Confusion matrix: \n', confusion_mat)
print('classification report: ', clr)

In [None]:
# AdaBoost classifier: fit on the resampled training data and evaluate on the
# test split.
from sklearn.ensemble import AdaBoostClassifier

# A fixed random_state keeps the boosted ensemble reproducible across runs.
ada_boosted = AdaBoostClassifier(random_state=1)
ada_boosted.fit(X_train_res,y_train_res)

pred_train_ada = ada_boosted.predict(X_train_res)
# Predict on the test split once; yb_pred is kept as an alias of the same
# predictions instead of running an identical second predict() call.
y_pred = ada_boosted.predict(X_test)
yb_pred = y_pred

acc = accuracy_score(y_test,y_pred)
print('Training accuracy: ', accuracy_score(y_train_res,pred_train_ada)*100)
print('Testing accuracy: ', acc*100)

confusion_mat = confusion_matrix(y_test,y_pred)
print('Confusion matrix: \n',confusion_mat)

clr = classification_report(y_test,y_pred)
print('classification report: ' ,clr)

In [None]:
# Gaussian Naive Bayes: fit on the resampled training data.
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_res, y_train_res)
gnb_predictions = gnb.predict(X_test)

# Mean accuracy on the test split.
accuracy = gnb.score(X_test, y_test)
print(accuracy)

In [None]:
# Let's plot ROC AUC curves to choose the best model.

from sklearn.metrics import roc_curve,roc_auc_score

# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
# On newer versions fall back to RocCurveDisplay.from_estimator, which accepts
# the same (estimator, X, y, ax=...) arguments and returns a display with `.ax_`.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

In [None]:
# ROC curves (true-positive rate vs false-positive rate) of all fitted models
# on the resampled TRAINING data.

# plot_roc_curve was removed in scikit-learn 1.2; prefer its replacement and
# fall back to the old function on versions that predate from_estimator.
try:
    from sklearn.metrics import RocCurveDisplay
    _plot_roc = RocCurveDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_roc_curve as _plot_roc

roc_models = [dtc, knn, log_reg, svc, rfc_f, ada_boosted, gbdt_clf]

# The first call creates the axes; every other model is drawn onto the same axes.
disp = _plot_roc(roc_models[0],X_train_res,y_train_res)
for roc_model in roc_models[1:]:
    _plot_roc(roc_model,X_train_res,y_train_res,ax=disp.ax_)

plt.legend(prop={'size':10},loc='lower right')

plt.show()

# This result is on training data.

In [None]:
# ROC curves (true-positive rate vs false-positive rate) of all fitted models
# on the held-out TESTING data.

# plot_roc_curve was removed in scikit-learn 1.2; prefer its replacement and
# fall back to the old function on versions that predate from_estimator.
try:
    from sklearn.metrics import RocCurveDisplay
    _plot_roc = RocCurveDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_roc_curve as _plot_roc

roc_models = [dtc, knn, log_reg, svc, rfc_f, ada_boosted, gbdt_clf]

# The first call creates the axes; every other model is drawn onto the same axes.
disp = _plot_roc(roc_models[0],X_test,y_test)
for roc_model in roc_models[1:]:
    _plot_roc(roc_model,X_test,y_test,ax=disp.ax_)

plt.legend(prop={'size':10},loc='lower right')

plt.show()

# This result is on testing data.

In [None]:
# Hyperparameter tuning of the Gradient Boosting classifier:

In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid for the gradient boosting classifier.
# learning_rate is an explicit list: np.arange(0.1, 0.3) uses a step of 1 and
# therefore produced the single value [0.1], so the learning rate was never
# actually tuned. NOTE(review): [0.1, 0.2] is the presumed intent — confirm.
param_grid = {'max_depth':range(4,8),
          'min_samples_split':range(2,8,2),
          'learning_rate':[0.1,0.2]}

# Cross-validated exhaustive search (GridSearchCV defaults) over param_grid,
# using the seeded gbdt_clf as the base estimator.
gridsearch = GridSearchCV(estimator=gbdt_clf,param_grid=param_grid)

gridsearch.fit(X_train_res,y_train_res)

In [None]:
# Parameter combination that scored best in the grid search.
gridsearch.best_params_

In [None]:
# Instantiate the tuned gradient boosting classifier.
# The hyper-parameters come straight from the grid search instead of being
# copied by hand, so they cannot go stale when the grid or the data changes.
# random_state matches the estimator that was tuned (gbdt_clf, seed 3), which
# keeps the result reproducible.
gbdt_clf_f = GradientBoostingClassifier(random_state=3, **gridsearch.best_params_)

# Training the model
gbdt_clf_f.fit(X_train_res,y_train_res)

In [None]:
# Evaluate the TUNED gradient boosting model (gbdt_clf_f, fitted in the
# previous cell). This cell used to re-create and re-fit the default gbdt_clf,
# so the tuned model was never scored and the numbers simply repeated the
# untuned baseline.
pred_train_gbdt_clf_f = gbdt_clf_f.predict(X_train_res)
y_pred = gbdt_clf_f.predict(X_test)

acc = accuracy_score(y_test,y_pred)
print('Training accuracy: ', accuracy_score(y_train_res,pred_train_gbdt_clf_f)*100)
print('Testing accuracy: ', acc*100)

confusion_mat = confusion_matrix(y_test,y_pred)
print('Confusion matrix: \n',confusion_mat)

clr = classification_report(y_test,y_pred)
print('classification report: ' ,clr)

In [None]:
# Build, evaluate and serialise a preprocessing + model pipeline.

import pickle

from sklearn.pipeline import Pipeline

# NOTE(review): X_train_res / X_test are derived from princi_comp, i.e. they
# were already power-transformed and PCA-projected in earlier cells, so this
# pipeline applies both steps a second time. Fitting the pipeline on the raw
# encoded features instead would need a matching raw train/test split — confirm
# which is intended.

# n_components=17000 exceeded the number of available features and made PCA
# raise ValueError. NOTE(review): 1750 (the value used for the earlier PCA) is
# the presumed intent; it is clamped to the training-matrix dimensions so the
# request is always valid.
n_pipe_components = min(1750, *X_train_res.shape)

pipe1 = Pipeline([('pt',PowerTransformer()),
                  ('pca',PCA(n_components=n_pipe_components)),
                  ('base_model1',GradientBoostingClassifier(random_state=3))])

pipe1.fit(X_train_res,y_train_res)

# Predict with pipe1; the original referenced an undefined name `pipe`.
y_pred = pipe1.predict(X_test)

# Print the score: a bare expression in the middle of a cell is not displayed.
print('Pipeline testing accuracy: ', accuracy_score(y_test,y_pred)*100)

# Round-trip the fitted classifier pipeline through an in-memory pickle string
# and check that the restored object can still predict.
saved_model1 = pickle.dumps(pipe1)
pipe_pickle1 = pickle.loads(saved_model1)
pipe_pickle1.predict(X_test) # predicting testing data

