<a href="https://www.kaggle.com/code/himanshunakrani/rain-prediction?scriptVersionId=104032798" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
import warnings

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# Silence all warnings for the rest of the session.
warnings.simplefilter("ignore")

# Import Data

In [None]:
# Read the weather observations from the Kaggle input directory.
csv_path = '../input/weather-dataset-rattle-package/weatherAUS.csv'
df = pd.read_csv(csv_path)

# Preview the first rows of the raw frame.
df.head()

# Exploratory Data Analysis

### Statistical Analysis

In [None]:
# Check the shape of the dataframe as a (rows, columns) tuple.
df.shape 

Rows = 145460
Columns = 23

Target = 'RainTomorrow'

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics restricted to the object-dtype (string) columns.
df.describe(include="O")

In [None]:
# Summary statistics for the numeric columns (describe()'s default selection).
df.describe()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Number of distinct non-null values in each column.
df.nunique()

In [None]:
# Frequency of each value of the target column.
df["RainTomorrow"].value_counts()

### Visualization Analysis

In [None]:
# Bar chart of the target distribution: one bar per RainTomorrow value.
plt.figure(figsize=(10, 5))
sns.set_style("darkgrid")
plt.title('RainTomorrow value counts')
sns.countplot(data=df, x="RainTomorrow");

In [None]:
# RainToday counts, split by the RainTomorrow value of the same row.
plt.figure(figsize=(14, 7))
# Only the tail of the "husl" palette is used for the hue levels.
husl_tail = sns.color_palette("husl")[4:]
sns.countplot(data=df, x="RainToday", hue="RainTomorrow", palette=husl_tail);

In [None]:
# Horizontal bar chart: number of rows recorded for each Location.
plt.figure(figsize=(17, 13))
loc_ax = sns.countplot(y='Location', data=df)

# Title the chart and blank out both axis labels.
loc_ax.set_title('Location distribution')
loc_ax.set_xlabel('')
loc_ax.set_ylabel('')
plt.tight_layout()

In [None]:
# Rows per Location, split into RainToday groups.
plt.figure(figsize=(17, 13))
set2_palette = sns.color_palette("Set2")
rain_loc_ax = sns.countplot(y='Location', hue="RainToday", data=df, palette=set2_palette)

# Title the chart and blank out both axis labels.
rain_loc_ax.set_title('Today Rain count by LOC')
rain_loc_ax.set_xlabel('')
rain_loc_ax.set_ylabel('')
plt.tight_layout()

In [None]:
plt.figure(figsize=(10,11))
# Share of each 9am wind direction; the counts are computed once and reused
# for both the wedge sizes and their labels.
wind9_counts = df["WindDir9am"].value_counts()
plt.pie(
    wind9_counts,
    labels=wind9_counts.index.tolist(),
    autopct='%1.2f%%',
    pctdistance=0.8,
);

In [None]:
# Share of each 3pm wind direction; the counts are computed once and reused
# for both the wedge sizes and their labels.
wind3_counts = df["WindDir3pm"].value_counts()
plt.figure(figsize=(10, 11))
plt.pie(
    wind3_counts,
    labels=wind3_counts.index.tolist(),
    autopct='%1.2f%%',
    pctdistance=0.8,
);

In [None]:
# Histogram of every numeric column, drawn in magenta ("m").
df.hist(figsize=(17,13), color="m");

In [None]:
# Kernel density estimate of each plotted column, one subplot per column on a
# 4x4 grid; axes are not shared so every panel keeps its own scale.
df.plot(kind="kde", subplots=True, layout=(4,4), figsize=(27,25),sharex=False, sharey=False);

In [None]:
# Box plot of each plotted column, one subplot per column on a 4x4 grid;
# axes are not shared so every panel keeps its own scale.
df.plot(kind="box", subplots=True, layout=(4,4), figsize=(27,25),sharex=False, sharey=False);

### Multivariate

In [None]:
# Annotated correlation heatmap of the numeric features.
# The raw frame still contains string columns at this point (they are only
# parsed/encoded later in the preprocessing section). Recent pandas releases
# raise on such columns inside corr() instead of silently skipping them, so
# the numeric columns are selected explicitly; on older releases the result
# is the same as before.
plt.figure(figsize=(18,16))
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True, cmap=plt.cm.CMRmap_r);

In [None]:
# Pairwise plots on a random sample of 100 rows, coloured by the target.
# NOTE(review): sample() is called without random_state, so the figure differs
# between runs — pass a seed if the plot needs to be reproducible.
sns.pairplot(df.sample(n=100), hue="RainTomorrow");

# Preprocessing



In [None]:
# Work on a copy so the preprocessing steps below leave the raw frame `df` intact.
df_preprocessed = df.copy()

In [None]:
# Percentage of missing values per column.
df_preprocessed.isna().mean() * 100

In [None]:
# Impute missing numeric values with their column mean.
# numeric_only=True is required because the frame still holds string columns:
# recent pandas releases raise when asked to average them, whereas older
# releases dropped them silently. The resulting means are the same either way.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split further down — confirm this is acceptable for the analysis.
mean = df_preprocessed.mean(numeric_only=True)
df_preprocessed.fillna(mean, inplace=True)

# Only columns without a computed mean (the non-numeric ones) can still
# report missing values here.
df_preprocessed.isna().sum()

In [None]:
# Replace every remaining missing value with the literal string 'NaN' so it
# can be treated as an ordinary value (and filtered on) in later cells.
df_preprocessed.replace(np.nan, 'NaN', inplace=True)
# After the replacement no column should report missing values any more.
df_preprocessed.isna().sum()

In [None]:
# Preview the frame after imputation and sentinel replacement.
df_preprocessed.head()

In [None]:
# Convert the Date column to pandas datetime values.
df_preprocessed['Date'] = pd.to_datetime(df_preprocessed['Date'])

In [None]:
# Group the column names by role for the steps that follow.
target_col = 'RainTomorrow'
date_columns = ['Date']

# Object-dtype columns are treated as categorical; Date has already been
# converted to datetime, so it is not part of this selection.
cat_columns = df_preprocessed.select_dtypes(include='object').columns

# Numeric columns: 64-bit floats and integers.
num_columns = df_preprocessed.select_dtypes(include=['float64', 'int64']).columns

##### Encoding the categorical variables

In [None]:
# Preview the frame before the categorical columns are encoded.
df_preprocessed.head()

In [None]:
# Display the names of the object-dtype (categorical) columns.
cat_columns

In [None]:
# Report how many distinct values each categorical column holds
# (the 'NaN' placeholder string counts as a value).
for col in cat_columns:
    distinct = df_preprocessed[col].unique()
    print('Unique Values in ' + col + ' Column:', len(distinct))
    print('-----------------------------------------------------')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the multi-valued categorical columns.
# A fresh encoder is fitted per column and kept in `encoders`, so every
# column's code -> label mapping remains available (e.g. for
# inverse_transform). Re-fitting a single shared encoder would keep only the
# mapping of the last column. The encoded values themselves are unchanged.
encoders = {}
for i in ['Location','WindGustDir','WindDir9am','WindDir3pm']:
    le = LabelEncoder()
    df_preprocessed[i] = le.fit_transform(df_preprocessed[i])
    encoders[i] = le
# `le` is left bound to the encoder of the last column, as before.
df_preprocessed.head()

In [None]:
# Frequency of each RainToday value, including the 'NaN' placeholder if present.
df_preprocessed[['RainToday']].value_counts()

In [None]:
# Frequency of each RainTomorrow value, including the 'NaN' placeholder if present.
df_preprocessed[['RainTomorrow']].value_counts()

In [None]:
# Count true missing values in the two rain columns.
# NOTE(review): missing entries were earlier replaced by the string 'NaN',
# which isna() does not treat as missing — this presumably reports zeros.
df_preprocessed[['RainToday','RainTomorrow']].isna().sum()

In [None]:
# Keep only the rows in which no column still holds the 'NaN' placeholder.
# (Presumably only RainToday / RainTomorrow can match at this point, since the
# other string columns were label-encoded — verify against the data.)
rows_complete = (df_preprocessed != "NaN").all(axis=1)

# Take an explicit copy: the following cells assign columns and drop in place
# on df_clean, which should act on an independent frame rather than on a
# filtered selection of df_preprocessed.
df_clean = df_preprocessed[rows_complete].copy()
del df_preprocessed

In [None]:
# Frequency of each RainToday value after the placeholder rows were removed.
df_clean['RainToday'].value_counts()

In [None]:
# Binary-encode the two Yes/No columns: 'No' -> 0, 'Yes' -> 1.
label_dict = {'No': 0, 'Yes': 1}
for rain_col in ('RainTomorrow', 'RainToday'):
    df_clean[rain_col] = df_clean[rain_col].map(label_dict)

In [None]:
# Preview the cleaned, fully encoded frame.
df_clean.head()

In [None]:
# Remove the Date column in place; it is not used as a model feature.
df_clean.drop(columns='Date', inplace=True)

In [None]:
# Renumber the rows from 0 after filtering; the old index is discarded.
df_clean.reset_index(drop=True, inplace=True)

In [None]:
## Train/test split
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
y = df_clean["RainTomorrow"]
X = df_clean.drop("RainTomorrow", axis=1)

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [None]:
## Feature scaling
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only, then apply that same fitted
# scaling to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Shape of the scaled training matrix as (rows, features).
X_train_scaled.shape

In [None]:
# Inspect the first scaled training row.
X_train_scaled[0,:]

# Feature Selection


In [None]:
from sklearn.feature_selection import SelectKBest

# Score the features against the training labels (SelectKBest's default
# scoring function) and keep the 10 best; the test matrix is reduced to the
# same columns. Both scaled matrices are overwritten with the reduced versions.
fs = SelectKBest(k=10).fit(X_train_scaled, y_train)
X_train_scaled = fs.transform(X_train_scaled)
X_test_scaled = fs.transform(X_test_scaled)

# Baseline Modeling




In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline 1: logistic regression with default hyperparameters.
lr = LogisticRegression().fit(X_train_scaled, y_train)

# Test-set predictions, kept for the evaluation cells.
y_lr = lr.predict(X_test_scaled)

In [None]:

# Baseline 2: decision tree with default hyperparameters.
dtree = DecisionTreeClassifier().fit(X_train_scaled, y_train)

# Test-set predictions, kept for the evaluation cells.
y_dtree = dtree.predict(X_test_scaled)

In [None]:
# Baseline 3: random forest made of 200 trees.
rf = RandomForestClassifier(n_estimators=200).fit(X_train_scaled, y_train)

# Test-set predictions, kept for the evaluation cells.
y_rf = rf.predict(X_test_scaled)

#### Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Print the test accuracy of every baseline model, each framed by rule lines.
print("*"*10, "Accuracy", "*"*10)

rule = "-"*30
baseline_predictions = (
    ("Logistic Regression: ", y_lr),
    ("Decision Tree: ", y_dtree),
    ("Random Forest: ", y_rf),
)
for label, predicted in baseline_predictions:
    print(rule)
    print(label, accuracy_score(y_test, predicted))
    print(rule)


In [None]:

# Print the per-class precision/recall/F1 report of every baseline model,
# each framed by rule lines.
print("*"*10, "Classification Report", "*"*10)

rule = "-"*30
baseline_predictions = (
    ("Logistic Regression: ", y_lr),
    ("Decision Tree: ", y_dtree),
    ("Random Forest: ", y_rf),
)
for label, predicted in baseline_predictions:
    print(rule)
    print(label, classification_report(y_test, predicted))
    print(rule)

In [None]:
# Test accuracy (in percent) of each baseline model, keyed by chart label.
model_predictions = {
    "logistic reg": y_lr,
    "decision tree": y_dtree,
    "random for": y_rf,
}
metric_val = {
    "accuracy score": {
        model_name: accuracy_score(y_test, predicted)*100
        for model_name, predicted in model_predictions.items()
    }
}

# One bar per model.
scores_frame = pd.DataFrame(metric_val)
ax = scores_frame.plot(
    kind="bar",
    figsize=(10, 7),
    legend=False,
    title="Accuracy Score",
    color="y",
);

# Write the rounded value just above each bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(round(bar_height, 1)), (bar.get_x() * 1.005, bar_height * 1.005))

# Hyperparameter Tuning



In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Base estimator for the search. Its solver, tol and max_iter are all present
# in the search space below, so every sampled candidate overrides them; only
# random_state is carried over unchanged.
logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, random_state=0)

# Discrete search space for the randomized search.
# NOTE(review): the string 'none' for `penalty` is only accepted by older
# scikit-learn releases; newer ones expect None — confirm against the
# installed version.
space = {
    'solver': ['newton-cg', 'lbfgs'],
    'penalty': ['l2', 'none'],
    'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
    'tol': [1e-2, 1e-3, 1e-4, 1e-5],
    'max_iter': [200, 400, 600],
}

# Sample parameter combinations with the search's default iteration count and
# cross-validation, then show the best combination found.
clf = RandomizedSearchCV(logistic, space, random_state=0)
search = clf.fit(X_train_scaled, y_train)
search.best_params_

In [None]:
# Train a new logistic regression with the best parameters found by the
# search; this rebinds `lr`, replacing the baseline model object.
lr = LogisticRegression(**search.best_params_).fit(X_train_scaled, y_train)

# Test accuracy of the tuned model.
tuned_predictions = lr.predict(X_test_scaled)
accuracy_score(y_test, tuned_predictions)

# Model Evaluation (performance analysis)



In [None]:
# Per-class precision/recall/F1 of the tuned logistic regression on the test set.
print(classification_report(y_test, lr.predict(X_test_scaled)))

In [None]:
from sklearn.metrics import accuracy_score

# Test-set predictions of the tuned model; `y_pred` is reused by the
# conclusion chart.
y_pred = lr.predict(X_test_scaled)

# Pass the arguments as (y_true, y_pred), matching the function signature and
# every other accuracy_score call in this notebook. The accuracy value is the
# same in either order.
print(accuracy_score(y_test, y_pred))

# Conclusion

In [None]:
# Compare test accuracy (in percent) before and after hyperparameter tuning.
# The values come from accuracy_score, so the series and the chart are
# labelled as accuracy; they were previously mislabelled as "R2".
# NOTE(review): the baseline bar uses the random-forest predictions (y_rf)
# while the tuned bar uses the logistic regression (y_pred) — confirm this
# cross-model comparison is intended.
conclusion = {
    "Accuracy score": {
    "Baseline Model ": accuracy_score(y_test,y_rf)*100,
    "Model after hyperparameter tuning": accuracy_score(y_test,y_pred)*100
    }
}

ax = pd.DataFrame(conclusion).plot(kind="bar", 
                             figsize = (10,5), 
                             legend =False, 
                             title = "Accuracy Score",
                             color = 'm');

# Write the rounded value just above each bar.
for p in ax.patches:
    ax.annotate(str(round(p.get_height(), 1)), (p.get_x() * 1.005, p.get_height() * 1.005))

***