In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import linear_model

In [3]:
# Load the rainfall table; cells holding the literal text "NA" are read as missing.
df = pd.read_csv(
    filepath_or_buffer="rainfall in india 1901-2015.csv",
    na_values=["NA"],
)

In [4]:
# Keep only fully populated rows: a row with any missing value is discarded.
df = df.dropna(how='any')

In [5]:
# Per-column count of missing values (should be all zeros after the dropna above).
df.isnull().sum()

SUBDIVISION    0
YEAR           0
JAN            0
FEB            0
MAR            0
APR            0
MAY            0
JUN            0
JUL            0
AUG            0
SEP            0
OCT            0
NOV            0
DEC            0
ANNUAL         0
Jan-Feb        0
Mar-May        0
Jun-Sep        0
Oct-Dec        0
dtype: int64

In [6]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

0

In [7]:
# (rows, columns) left after dropping incomplete rows.
df.shape

(4090, 19)

In [8]:
# Number of distinct SUBDIVISION labels (the column that is one-hot encoded later).
df['SUBDIVISION'].nunique()

36

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [10]:
# Model inputs: region label, year, the twelve monthly columns and the four
# seasonal aggregate columns.
# NOTE(review): ANNUAL looks like the plain sum of the monthly columns (the
# sample predictions further down equal the sum of their twelve monthly
# inputs), so these features appear to leak the target -- confirm this is intended.
feature_columns = (
    ['SUBDIVISION', 'YEAR']
    + ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
       'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
    + ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
)
target_column = 'ANNUAL'

In [11]:
# Pull the model inputs (X) and the value to predict (y) out of the cleaned frame.
X, y = df[feature_columns], df[target_column]

In [12]:
# Column groups used to route each feature to the right preprocessing step.
categorical_features = ['SUBDIVISION']
numeric_features = (
    ['YEAR']
    # monthly columns
    + ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
       'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
    # seasonal aggregate columns
    + ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
)

In [13]:
# Per-type preprocessing: standardise the numeric columns and one-hot encode
# the categorical one. handle_unknown='ignore' means a category not seen during
# fit does not raise at transform time.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Send each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

## Linear Regression

In [34]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regressor placed behind the shared preprocessing step.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [35]:
# Split the data into training and testing sets
# (20% of the rows are held out; random_state=42 makes the shuffle repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Train the model: fits the preprocessing step and the linear regressor on the training split.
model.fit(X_train, y_train)

In [37]:
# Make predictions: estimate ANNUAL for the held-out test rows.
y_pred = model.predict(X_test)

In [41]:
# Score the held-out predictions with three regression metrics.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [43]:
# Report the test-set metrics computed in the previous cell.
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.0031047926537013473
Mean Absolute Error: 0.04300698447929414
R^2 Score: 0.9999999964130872


In [44]:
# Same three metrics on the training split, for comparison with the test scores.
# Note: this reuses (and therefore overwrites) mse / mae / r2 from the test evaluation.
y_train_pred = model.predict(X_train)
mse, mae, r2 = (
    mean_squared_error(y_train, y_train_pred),
    mean_absolute_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.0032623619432623073
Mean Absolute Error: 0.04336571541199728
R^2 Score: 0.9999999959437585


In [45]:
# A single hand-entered observation with the same columns the model was trained on.
# The twelve monthly values add up to 3373.2, which is what the model predicts.
new_data = pd.DataFrame([{
    'SUBDIVISION': 'ANDAMAN & NICOBAR ISLANDS',
    'YEAR': 1901,
    'JAN': 49.2,
    'FEB': 87.1,
    'MAR': 29.2,
    'APR': 2.3,
    'MAY': 528.8,
    'JUN': 517.5,
    'JUL': 365.1,
    'AUG': 481.1,
    'SEP': 332.6,
    'OCT': 388.5,
    'NOV': 558.2,
    'DEC': 33.6,
    'Jan-Feb': 136.3,
    'Mar-May': 560.3,
    'Jun-Sep': 1696.3,
    'Oct-Dec': 980.3,
}])

# Run the one-row frame through the fitted pipeline.
predicted_annual_rainfall = model.predict(new_data)

# Show the single predicted value.
print(f'Predicted Annual Rainfall: {predicted_annual_rainfall[0]}')

Predicted Annual Rainfall: 3373.1940420140754


In [46]:
# A second hand-entered observation (different subdivision and year).
# The twelve monthly values add up to 1642.9, which is what the model predicts.
new_data = pd.DataFrame([{
    'SUBDIVISION': 'LAKSHADWEEP',
    'YEAR': 2015,
    'JAN': 2.2,
    'FEB': 0.5,
    'MAR': 3.7,
    'APR': 87.1,
    'MAY': 133.1,
    'JUN': 296.6,
    'JUL': 257.5,
    'AUG': 146.4,
    'SEP': 160.4,
    'OCT': 165.4,
    'NOV': 231,
    'DEC': 159,
    'Jan-Feb': 2.7,
    'Mar-May': 223.9,
    'Jun-Sep': 860.9,
    'Oct-Dec': 555.4,
}])

# Run the one-row frame through the fitted pipeline.
predicted_annual_rainfall = model.predict(new_data)

# Show the single predicted value.
print(f'Predicted Annual Rainfall: {predicted_annual_rainfall[0]}')

Predicted Annual Rainfall: 1642.8948855449196


## Ridge Regression

In [14]:
from sklearn.linear_model import Ridge

# Ridge regressor (alpha=0.01) placed behind the shared preprocessing step.
# Rebinding `model` replaces the previously fitted pipeline.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=0.01)),
])

In [15]:
# Split the data into training and testing sets
# (same arguments and seed as the split in the Linear Regression section, so the
# partition is the same as long as X and y have not changed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Train the model: fits the preprocessing step and the Ridge regressor on the training split.
model.fit(X_train, y_train)

In [17]:
# Make predictions: Ridge estimates of ANNUAL for the held-out test rows.
y_pred = model.predict(X_test)

In [18]:
# Score the Ridge predictions on the held-out split and report the results.
mse, mae, r2 = [
    score(y_test, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
]
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.0051897057289241036
Mean Absolute Error: 0.05741097691370031
R^2 Score: 0.9999999940044236


## Lasso Regression

In [19]:
from sklearn.linear_model import Lasso

# Lasso regressor (alpha=3.9) placed behind the shared preprocessing step.
# Rebinding `model` replaces the previously fitted pipeline.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=3.9)),
])

In [20]:
# Split the data into training and testing sets
# (same arguments and seed as the earlier splits, so the partition is the same
# as long as X and y have not changed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Train the model: fits the preprocessing step and the Lasso regressor on the training split.
model.fit(X_train, y_train)

In [22]:
# Make predictions: Lasso estimates of ANNUAL for the held-out test rows.
y_pred = model.predict(X_test)

In [23]:
# Score the Lasso predictions on the held-out split and report the results.
mse, mae, r2 = (
    mean_squared_error(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 41.49118367345085
Mean Absolute Error: 5.004962108569532
R^2 Score: 0.9999520659601392


## Decision Tree Regression

In [24]:
from sklearn.tree import DecisionTreeRegressor

# Unpruned regression tree (max_depth=None) split on squared error, placed
# behind the shared preprocessing step.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split and picks among equally good splits at random, so an unseeded
# tree can produce different metrics on every run. Metrics recorded from an
# earlier unseeded run may therefore differ slightly from a re-run.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(
        criterion="squared_error",
        max_depth=None,
        random_state=42,
    ))
])

In [25]:
# Split the data into training and testing sets
# (same arguments and seed as the earlier splits, so the partition is the same
# as long as X and y have not changed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Train the model: fits the preprocessing step and the decision tree on the training split.
model.fit(X_train, y_train)

In [27]:
# Make predictions: decision-tree estimates of ANNUAL for the held-out test rows.
y_pred = model.predict(X_test)

In [28]:
# Score the decision-tree predictions on the held-out split and report the results.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 16177.586454767725
Mean Absolute Error: 75.0880195599022
R^2 Score: 0.9813103169078661


## Random Forest Regression

In [29]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with a fixed seed (random_state=50), placed behind the shared
# preprocessing step. Rebinding `model` replaces the previously fitted pipeline.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=50)),
])

In [30]:
# Split the data into training and testing sets
# (same arguments and seed as the earlier splits, so the partition is the same
# as long as X and y have not changed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Train the model: fits the preprocessing step and the random forest on the training split.
model.fit(X_train, y_train)

In [32]:
# Make predictions: random-forest estimates of ANNUAL for the held-out test rows.
y_pred = model.predict(X_test)

In [33]:
# Score the random-forest predictions on the held-out split and report the results.
mse, mae, r2 = [
    score(y_test, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
]
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 7435.897360809284
Mean Absolute Error: 41.512633251833705
R^2 Score: 0.9914094376458608


## KNN Regression

In [34]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with library-default settings, placed behind
# the shared preprocessing step. Rebinding `model` replaces the previous pipeline.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor()),
])

In [35]:
# Split the data into training and testing sets
# (same arguments and seed as the earlier splits, so the partition is the same
# as long as X and y have not changed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Train the model: fits the preprocessing step and the KNN regressor on the training split.
model.fit(X_train, y_train)

In [37]:
# Make predictions: KNN estimates of ANNUAL for the held-out test rows.
y_pred = model.predict(X_test)

In [38]:
# Score the KNN predictions on the held-out split and report the results.
mse, mae, r2 = (
    mean_squared_error(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 25838.766110024455
Mean Absolute Error: 110.68124694376527
R^2 Score: 0.9701489247831647


## XGBoost

In [39]:
from xgboost import XGBRegressor

# Gradient-boosted trees with library-default settings, placed behind the
# shared preprocessing step. Rebinding `model` replaces the previous pipeline.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor()),
])

In [40]:
# Split the data into training and testing sets
# (same arguments and seed as the earlier splits, so the partition is the same
# as long as X and y have not changed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
# Train the model: fits the preprocessing step and the XGBoost regressor on the training split.
model.fit(X_train, y_train)

In [42]:
# Make predictions: XGBoost estimates of ANNUAL for the held-out test rows.
y_pred = model.predict(X_test)

In [43]:
# Score the XGBoost predictions on the held-out split and report the results.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 6279.755385872519
Mean Absolute Error: 40.36816693145081
R^2 Score: 0.9927451082776634
