In [None]:
#Import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Relative location of the raw weather CSV
path = "../aus_weather/weatherAUS.csv"
# Load the CSV into a DataFrame and preview its first rows
df_aus_weather = pd.read_csv(path)
df_aus_weather.head()

In [None]:
#Display the columns in the dataset
df_aus_weather.columns

In [None]:
#DF info
df_aus_weather.info()

In [None]:
#Drop records that have any null values
df_aus_weather.dropna(inplace=True)

In [None]:
#Reset Index
df_aus_weather.reset_index(drop=True,inplace=True)

In [None]:
#DF info
df_aus_weather.info()

In [None]:
# Keep an independent copy of the cleaned frame; it is the starting point for the
# reduced-feature run further down.
# Plain assignment would only bind a second name to the same object, so any in-place
# change made through one name would also be visible through the other.
df_aus_weather_cp = df_aus_weather.copy()

In [None]:
#Drop the unnecessary columns
df_aus_weather = df_aus_weather.drop(['Date', 'Location','RISK_MM','WindDir9am', 'WindDir3pm'], axis=1)
df_aus_weather.head()

In [None]:
df_aus_weather.info()

In [None]:
df_aus_weather.shape

In [None]:
df_aus_weather.head()

In [None]:
# Use get_dummies to convert categorical data (columns - RainToday and RainTomorrow)
bin_encoded_rtod=pd.get_dummies(df_aus_weather["RainToday"],drop_first=True)
bin_encoded_rtom=pd.get_dummies(df_aus_weather["RainTomorrow"],drop_first=True)
df_aus_weather["RainToday"]=bin_encoded_rtod
df_aus_weather["RainTomorrow"]=bin_encoded_rtom

In [None]:
#Verify if RainToday and RainTomorrow are encoded
df_aus_weather.head()

In [None]:
df_aus_weather.info()

In [None]:
#Use Pandas get_dummies to convert categorical data
df_aus_weather=pd.get_dummies(df_aus_weather)
df_aus_weather.head()

In [None]:
# #Export Pre processed Data
# df_aus_weather.to_csv("preprocessed_data.csv")

# Machine Learning

In [None]:
#Import Dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
df_aus_weather["RainTomorrow"]

In [None]:
#Assign X and y values
X=df_aus_weather.drop("RainTomorrow",axis=1)
y=df_aus_weather["RainTomorrow"].values.reshape(-1,1)
print(X.shape,y.shape)
X

In [None]:
#Split the data into training and testing 
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

In [None]:
#Scale or Normalize data
from sklearn.preprocessing import StandardScaler
#Create a StandardScalar model and fit it to the training data
X_scaler=StandardScaler().fit(X_train)

In [None]:
#Transform the training and testing data using the X_scaler 
#Since the features are measured from different scales, appling featruring scaling
X_train=X_scaler.transform(X_train)
X_test=X_scaler.transform(X_test)

# Create Logistic Regression Model

In [None]:
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [None]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression()
classifier

In [None]:
#Fit(train) the model using th scaled training data
classifier.fit(X_train,y_train.ravel())

In [None]:
#Validate the model using the test data
print(f"Training Data Score: {classifier.score(X_train,y_train)}")
print(f"Testing Data Score: {classifier.score(X_test,y_test)}")

# Feature importance in Logistic Regression using weights

In [None]:
# Extract Column Names
col_names=X.columns.tolist()
col_names

In [None]:
feature_importance = abs(classifier.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) 
featfig = plt.figure(figsize=(5,10))
featax = featfig.add_subplot(1, 1, 1)
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
featax.set_yticklabels(np.array(X.columns)[sorted_idx], fontsize=8)
featax.set_xlabel('Relative Feature Importance')
plt.tight_layout()   
plt.show()

In [None]:
# Now that we have trained our algorithm, it’s time to make some predictions.
y_pred=classifier.predict(X_test)
y_pred

In [None]:
# Model Evaluation using Confusion Matrix
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
# Accuracy of the model
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Logistic Regression - Drop the columns with low feature importance

In [None]:
df_aus_weather_cp.head()

In [None]:
df_aus_weather_cp=df_aus_weather_cp.drop(['Date', 'Location','RISK_MM',
                      'WindDir9am', 'WindDir3pm',
                      'WindGustDir', 'Rainfall','Evaporation',
                      'Humidity9am'], axis=1)

In [None]:
df_aus_weather_cp.head()

In [None]:
# Use get_dummies to convert categorical data (columns - RainToday and RainTomorrow)
bin_encoded_rtod1=pd.get_dummies(df_aus_weather_cp["RainToday"],drop_first=True)
bin_encoded_rtom1=pd.get_dummies(df_aus_weather_cp["RainTomorrow"],drop_first=True)
df_aus_weather_cp["RainToday"]=bin_encoded_rtod1
df_aus_weather_cp["RainTomorrow"]=bin_encoded_rtom1

In [None]:
#Use Pandas get_dummies to convert categorical data
df_aus_weather_cp=pd.get_dummies(df_aus_weather_cp)
df_aus_weather_cp.head()

# Machine Learning - New(updated) dataset

In [None]:
#Assign X and y values
X=df_aus_weather_cp.drop("RainTomorrow",axis=1)
y=df_aus_weather_cp["RainTomorrow"].values.reshape(-1,1)
print(X.shape,y.shape)

In [None]:
#Split the data into training and testing 
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

In [None]:
#Scale or Normalize data
from sklearn.preprocessing import StandardScaler
#Create a StandardScalar model and fit it to the training data
X_scaler=StandardScaler().fit(X_train)

In [None]:
#Transform the training and testing data using the X_scaler 
#Since the features are measured from different scales, appling featruring scaling
X_train=X_scaler.transform(X_train)
X_test=X_scaler.transform(X_test)

# Create Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
classifier_new=LogisticRegression()
classifier_new

In [None]:
#Fit(train) the model using the scaled training data
classifier.fit(X_train,y_train.ravel())

In [None]:
#Validate the model using the test data
print(f"Training Data Score: {classifier.score(X_train,y_train)}")
print(f"Testing Data Score: {classifier.score(X_test,y_test)}")

In [None]:
# Now that we have trained our algorithm, it’s time to make some predictions.
y_pred_new=classifier.predict(X_test)
y_pred_new

In [None]:
# Model Evaluation using Confusion Matrix
from sklearn import metrics
cnf_matrix_new = metrics.confusion_matrix(y_test, y_pred_new)
cnf_matrix_new

In [None]:
# Accuracy of the model
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_new))

# Hyperparameter Tuning

In [None]:
# Use GridSearchCV to tune the model's parameters
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[0.1, 0.01, 0.001, 0.0001],
              'penalty': ["l1", "l2"]}
grid = GridSearchCV(classifier, param_grid,n_jobs=-1, cv=5, verbose=3)

In [None]:
grid.fit(X_train, y_train.ravel())

In [None]:
print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

In [None]:
 # Calculate classification report
from sklearn.metrics import classification_report
# target_names are listed in sorted label order: the encoded RainTomorrow target
# is 0/False for "No" and 1/True for "Yes" (get_dummies with drop_first=True kept
# the second category).
print(classification_report(y_test, predictions,
                            target_names=["No Rain", "Rain"]))

# Repeating the LR model with the third dataset - weatherAUS_feature_engineer

In [None]:
# Relative location of the feature-engineered weather CSV
path = "../aus_weather/weatherAUS_feature_engineer.csv"
# Load it into a DataFrame; this rebinds df_aus_weather to the new data
df_aus_weather = pd.read_csv(path)
df_aus_weather.head()

# Save the model

In [None]:
# Persist the fitted GridSearchCV object to disk with joblib.
# NOTE: the search was fitted on features transformed by X_scaler, and the scaler
# itself is not saved by this cell.
import joblib
filename = 'logistic.sav'
joblib.dump(value=grid, filename=filename)