# **Rainfall bold text Trends and Forecasting in India**

In [55]:
import pandas as pd

In [4]:
#importing datasets
rainfall_data = pd.read_csv('/content/rainfall_area-wt_India_1901-2015.csv')
print(rainfall_data.head())

  REGION  YEAR   JAN   FEB   MAR   APR   MAY    JUN    JUL    AUG    SEP  \
0  INDIA  1901  34.7  37.7  18.0  39.3  50.8  113.4  242.2  272.9  124.4   
1  INDIA  1902   7.4   4.3  19.0  43.5  48.3  108.8  284.0  199.7  201.5   
2  INDIA  1903  17.0   8.3  31.3  17.1  59.5  118.3  297.0  270.4  199.1   
3  INDIA  1904  14.4   9.6  31.8  33.1  72.4  164.8  261.0  206.4  129.6   
4  INDIA  1905  25.3  20.9  42.7  33.7  55.7   93.3  252.8  200.8  178.4   

     OCT   NOV   DEC  ANNUAL  Jan-Feb  Mar-May  Jun-Sep  Oct-Dec  
0   52.7  38.0   8.3  1032.3     72.4    108.1    752.8     99.0  
1   61.5  27.9  24.4  1030.2     11.7    110.8    794.0    113.8  
2  117.9  36.9  17.7  1190.5     25.3    107.9    884.8    172.5  
3   69.0  11.2  16.3  1019.8     24.0    137.4    761.8     96.6  
4   51.4   9.7  10.5   975.3     46.2    132.2    725.4     71.6  


### Analyzing Annual Rainfall Trends Over Time

In [56]:
#import plotly libraries
import plotly.express as px
import plotly.graph_objects as go

In [57]:
from re import template
#analyze trends in annual rainfall over time
annual_rainfall = rainfall_data[['YEAR','ANNUAL']]
fig_annual = go.Figure()
fig_annual.add_trace(go.Scatter(
    x=annual_rainfall['YEAR'],
    y=annual_rainfall['ANNUAL'],
    mode='lines',
    name='Annual Rainfall',
    line= dict(color='blue', width=2),
    opacity=0.8
    ))
fig_annual.add_trace(go.Scatter(
    x=annual_rainfall['YEAR'],
    y=[annual_rainfall['ANNUAL'].mean()]*len(annual_rainfall),
    mode='lines',
    name='Mean Rainfall',
    line= dict(color='red', width=2, dash='dash'),
    ))

fig_annual.update_layout(
    title='Trends in Annual Rainfall in India (1901-2015)',
    xaxis_title='Year',
    yaxis_title='Rainfall (mm)',
    font=dict(size=12),
    template ='plotly_white',
    legend=dict(title="Legend"),
    height = 500,
    width = 800
    )
fig_annual.show()

In [None]:
#identify months with the highest and the lowest rainfall on average
monthly_columns = ['JAN',   'FEB',   'MAR',   'APR',   'MAY',    'JUN',    'JUL',    'AUG',    'SEP',   'OCT',   'NOV',  'DEC']
monthly_avg = rainfall_data[monthly_columns].mean()

highest_rainfall_month = monthly_avg.idxmax()
lowest_rainfall_month = monthly_avg.idxmin()
print(f"The month with the highest average rainfall is {highest_rainfall_month} with an average of {monthly_avg[highest_rainfall_month]:.2f} mm.")
print(f"The month with the lowest average rainfall is {lowest_rainfall_month} with an average of {monthly_avg[lowest_rainfall_month]:.2f} mm.")

The month with the highest average rainfall is JUL with an average of 291.02 mm.
The month with the lowest average rainfall is DEC with an average of 14.98 mm.


In [None]:
#plotting the bar graph
fig_monthly = px.bar(
    x=monthly_avg.index,
    y=monthly_avg.values,
    labels={'x':'Month', 'y':'Average Rainfall (mm)'},
    title='Average Monthly Rainfall in India (1901-2015)',
    text = monthly_avg.values
)
fig_monthly.add_hline(
    y=monthly_avg.mean(),
    line_dash="dash",
    line_color="red",
    annotation_text="Mean Rainfall",
    annotation_position="top right"
    )
fig_monthly.update_traces(marker_color='blue', marker_line_color='rgb(8,48,107)',
                          marker_line_width=1.5, opacity=0.6)
fig_monthly.update_layout(
    template ='plotly_white',
    height = 500)
fig_monthly.show()

In [None]:
total_avg = monthly_avg.mean()
print(total_avg)

98.50231884057972


In [None]:
#seasonal rainfall distributation
seasonal_columns = ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
seasonal_avg = rainfall_data[seasonal_columns].mean()
print(seasonal_avg)

Jan-Feb     43.189565
Mar-May    128.694783
Jun-Sep    890.260870
Oct-Dec    119.882609
dtype: float64


In [None]:
#plotting the bar
fig_seasonal = px.bar(
    x=seasonal_avg.index,
    y=seasonal_avg.values,
    labels={'x':'Season', 'y':'Average Rainfall (mm)'},
    title='Seasonal Rainfall Distributtion in India (1901-2015)',
    text = seasonal_avg.values,
    color = seasonal_avg.values,
    color_continuous_scale = ['#6439FF','#AD49E1','#921A40','#E65C19']
)
fig_seasonal.update_traces(marker_line_color='black',
                          marker_line_width=1.5, opacity=0.6)
fig_seasonal.update_layout(
    template ='plotly_white',
    height = 500,
    coloraxis_colorbar=dict(title='mm')
)
fig_seasonal.show()


### Assessing the Impact of Climate Change in the Rainfall Trends in India

In [None]:
#calculating rolling averages to assess climate change impact

In [None]:
rainfall_data['10 year Rolling Avg'] = rainfall_data['ANNUAL'].rolling(window=10).mean()
print(rainfall_data[['YEAR', '10 year Rolling Avg']].head(20))

    YEAR  10 year Rolling Avg
0   1901                  NaN
1   1902                  NaN
2   1903                  NaN
3   1904                  NaN
4   1905                  NaN
5   1906                  NaN
6   1907                  NaN
7   1908                  NaN
8   1909                  NaN
9   1910              1088.69
10  1911              1090.21
11  1912              1095.00
12  1913              1082.84
13  1914              1102.29
14  1915              1118.48
15  1916              1137.78
16  1917              1181.84
17  1918              1174.80
18  1919              1189.88
19  1920              1174.64


In [None]:
#plotting the graphs
#plot the annual avg
fig_climate_change = go.Figure()
fig_climate_change.add_trace(go.Scatter(
    x=rainfall_data['YEAR'],
    y=rainfall_data['ANNUAL'],
    mode='lines',
    name='Annual Rainfall',
    line= dict(color='blue', width=2),
    opacity=0.8
    ))
#plot the rolling avg
fig_climate_change.add_trace(go.Scatter(
    x=rainfall_data['YEAR'],
    y=rainfall_data['10 year Rolling Avg'],
    mode='lines',
    line= dict(color='red', width=2),
    name='10 Year Rolling Average'
    ))
fig_climate_change.update_layout(
    title='Impact of Climate Change on Rainfall in India (1901-2015)',
    xaxis_title='Year',
    yaxis_title='Rainfall (mm)',
    font=dict(size=12),
    template ='plotly_white',
    legend=dict(title="Legend"),
    height = 500,
    width = 800
    )
fig_climate_change.show()

using statistical thresholds (1.5 standard deviations below or above the mean), let’s identify years with extreme or deficient rainfall.


In [9]:
#finding the drought and extreme rainfall using statical test
from scipy.stats import pearsonr

In [None]:
#identifying drought and extreme rainfall years
mean_rainfall = rainfall_data['ANNUAL'].mean()
std_rainfall = rainfall_data['ANNUAL'].std()

#getting the years in the list which have drought and extreme rainfall
dought_years = rainfall_data[rainfall_data['ANNUAL']< (mean_rainfall - 1.5*std_rainfall)]
extreme_years = rainfall_data[rainfall_data['ANNUAL'] > (mean_rainfall + 1.5*std_rainfall)]
# print("Years with drought:", dought_years['YEAR'].tolist())
# print("Years with extreme rainfall:", extreme_years['YEAR'].tolist())


Years with drought: [1905, 1965, 1972, 2002, 2009]
Years with extreme rainfall: [1917, 1933, 1956, 1959, 1961, 1988, 1990]


In [10]:
#correlation seasonal rainfall with annual rainfall totals
seasonal_columns = ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
annual_rainfall = rainfall_data['ANNUAL']
# seasonal_rainfall = rainfall_data[seasonal_columns]
# correlation_matrix = seasonal_rainfall.corrwith(annual_rainfall)
# print(correlation_matrix)
seasonal_correlation = {
    season: pearsonr(rainfall_data[season], annual_rainfall)[0]
    for season in seasonal_columns
}

In [None]:
#displaying
dought_years_summary = dought_years[['YEAR', 'ANNUAL']].reset_index(drop=True)
extreme_years_summary = extreme_years[['YEAR', 'ANNUAL']].reset_index(drop=True)
print("Years with drought:")
print(dought_years_summary)
print("Years with extreme rainfall:")
print(extreme_years_summary)

seasonal_correlation_summary = pd.Series(seasonal_correlation).reset_index()
seasonal_correlation_summary.columns = ['Season', 'Correlation']
print("Correlation between Seasonal Rainfall and Annual Rainfall:")
print(seasonal_correlation_summary)

Years with drought:
   YEAR  ANNUAL
0  1905   975.3
1  1965   938.4
2  1972   948.5
3  2002   920.8
4  2009   959.3
Years with extreme rainfall:
   YEAR  ANNUAL
0  1917  1480.3
1  1933  1393.5
2  1956  1386.2
3  1959  1382.1
4  1961  1403.0
5  1988  1351.0
6  1990  1400.6
Correlation between Seasonal Rainfall and Annual Rainfall:
    Season  Correlation
0  Jan-Feb     0.228913
1  Mar-May     0.313057
2  Jun-Sep     0.930027
3  Oct-Dec     0.531648


## Detecting Anomalies in the Rainfall Trends in India

In [7]:
#using Isolation forest algorithms to identify anomalies
from sklearn.ensemble import IsolationForest

In [8]:
#detect anomlous rainfall years based on annual data
isolation_forest = IsolationForest(contamination=0.05, random_state=42)
rainfall_data['Annual_Anomaly'] = isolation_forest.fit_predict(rainfall_data[['ANNUAL']])

#identify anomalies in annual rainfall
annual_anomalies = rainfall_data[rainfall_data['Annual_Anomaly'] == -1].reset_index()
print(annual_anomalies)
print("Years with anomalies in annual rainfall:")
print(annual_anomalies[['YEAR', 'ANNUAL']])

   index REGION  YEAR   JAN   FEB   MAR   APR   MAY    JUN    JUL  ...    SEP  \
0      4  INDIA  1905  25.3  20.9  42.7  33.7  55.7   93.3  252.8  ...  178.4   
1     16  INDIA  1917   8.7  38.7  22.8  43.2  75.0  231.8  285.2  ...  281.0   
2     64  INDIA  1965  10.9  26.0  26.4  43.6  51.2  115.8  269.2  ...  131.1   
3     71  INDIA  1972   9.7  27.1  21.0  36.9  55.6  123.0  205.5  ...  129.1   
4    101  INDIA  2002  16.8  21.0  22.9  38.9  57.7  170.1  138.9  ...  133.9   
5    108  INDIA  2009  11.8  13.2  15.2  26.0  56.6   86.5  283.7  ...  140.3   

     OCT   NOV   DEC  ANNUAL  Jan-Feb  Mar-May  Jun-Sep  Oct-Dec  \
0   51.4   9.7  10.5   975.3     46.2    132.2    725.4     71.6   
1  158.8  28.2  10.3  1480.3     47.3    141.1   1094.5    197.3   
2   33.5  17.4  21.1   938.4     36.9    121.2    708.4     72.0   
3   66.0  30.3  22.3   948.5     36.8    113.6    679.5    118.6   
4   54.4  14.7   5.2   920.8     37.8    119.5    689.2     74.3   
5   70.1  53.2  10.8   9

In [None]:
#detect anomalous months based on monthly data
monthly_data = ['JAN',   'FEB',   'MAR',   'APR',   'MAY',    'JUN',    'JUL',    'AUG',    'SEP',   'OCT',   'NOV',  'DEC']
monthly_anomalies = isolation_forest.fit_predict(rainfall_data[monthly_data])
#add anomaly detection results for months
rainfall_data['Monthly_Anomaly'] = monthly_anomalies
monthly_anomalies_df = rainfall_data[rainfall_data['Monthly_Anomaly'] == -1][['YEAR'] + monthly_columns]

print(monthly_anomalies_df)



    YEAR   JAN   FEB   MAR   APR    MAY    JUN    JUL    AUG    SEP    OCT  \
6   1907  16.2  46.0  37.8  62.8   32.6  154.4  225.4  310.4   96.9   22.7   
10  1911  45.7   5.6  49.9  22.8   47.6  191.9  162.7  213.5  182.3   70.6   
16  1917   8.7  38.7  22.8  43.2   75.0  231.8  285.2  296.5  281.0  158.8   
17  1918  12.2   4.4  41.6  38.8  102.8  212.6  183.8  242.7  109.7   20.0   
66  1967  11.2  13.4  63.3  29.1   42.4  144.9  304.6  262.9  170.4   40.3   
89  1990  14.9  44.3  53.3  42.0  114.5  194.0  286.7  293.2  196.6  103.2   

     NOV   DEC  
6   22.5  12.1  
10  42.8  12.0  
16  28.2  10.3  
17  41.1  16.4  
66  11.4  54.4  
89  29.5  28.4  


In [None]:
#plotting the anomalies
fig_annual_anomalies = go.Figure()

fig_annual_anomalies.add_trace(go.Scatter(
    x=rainfall_data['YEAR'],
    y=rainfall_data['ANNUAL'],
    mode='lines',
    name='Annual Rainfall',
    line=dict(color='blue', width=2),
    opacity=0.8
    ))
fig_annual_anomalies.add_trace(go.Scatter(
    x=annual_anomalies['YEAR'],
    y=annual_anomalies['ANNUAL'],
    mode='markers',
    name='Anomalies',
    marker=dict(color='red', size=10, symbol='x')
))
fig_annual_anomalies.add_hline(
    y=rainfall_data['ANNUAL'].mean(),
    line_dash="dash",
    line_color="green",
    annotation_text="Mean Rainfall",
    annotation_position="top right"
    )
fig_annual_anomalies.update_layout(
    title='Annual Rainfall Anomalies in India (1901-2015)',
    xaxis_title='Year',
    yaxis_title='Rainfall (mm)',
    template='plotly_white',
    legend=dict(title="Legend"),
    height=500,
    width=800
)
fig_annual_anomalies.show()


In [None]:
#preparing for the monthly anomalies
monthly_anomalies = []
for column in monthly_columns:
    for _, row in monthly_anomalies_df.iterrows():
        monthly_anomalies.append({'Year': row['YEAR'], 'Month': column, 'Rainfall': row[column]})

        monthly_anomalies_df_long = pd.DataFrame(monthly_anomalies)
        #print(monthly_anomalies_df_long)
fig_monthly_anomalies = px.line(
    rainfall_data,
    x='YEAR',
    y=monthly_columns,
    labels={'YEAR': 'Year', 'value': 'Rainfall (mm)','variable': 'Month'},
    title='Monthly Rainfall Anomalies in India (1901-2015)',
    color_discrete_sequence=px.colors.qualitative.Pastel
)
fig_monthly_anomalies.add_traces(go.Scatter(
    x=monthly_anomalies_df_long['Year'],
    y=monthly_anomalies_df_long['Rainfall'],
    mode='markers',
    name='Anomalies Months',
    marker=dict(color='red', size=5, symbol='circle')
))
fig_monthly_anomalies.update_layout(
    template='plotly_white',
    legend=dict(title="Legend"),
    height=500,
    width=800
)
fig_monthly_anomalies.show()

# Correlating Seasonal Rainfall with Annual Totals

we will calculate the correlation coefficients between seasonal rainfall and annual rainfall totals to understand how much each season contributes to the overall yearly rainfall:

In [11]:
#correlation analysis between monsoon (Jun-Sep) rainfall and other seasons
seasonal_column = ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
monsoon_column = 'Jun-Sep'
relationship ={}

In [15]:
for season in seasonal_column:
  if season != monsoon_column:
    corr, _ = pearsonr(rainfall_data[season], rainfall_data[monsoon_column])
    relationship[season] = corr

correlational_data = pd.DataFrame({
    'Season': list(relationship.keys()),
    'Correlation': list(relationship.values())
})
correlational_data.head()

Unnamed: 0,Season,Correlation
0,Jan-Feb,0.142731
1,Mar-May,0.10434
2,Oct-Dec,0.28652


In [25]:
#plotting
fig_correlation = px.bar(
    correlational_data,
    x='Season',
    y='Correlation',
    labels={'Season': 'Season', 'Correlation': 'Correlation Coefficient'},
    title='Correlation between Seasonal Rainfall and Monsoon Rainfall(Jun-Sep)',
    color='Correlation',
    color_continuous_scale='RdBu',
    text='Correlation'
)
fig_correlation.update_traces(marker_line_color='black',marker_line_width=1.5, opacity=0.6,texttemplate='%{text:.2f}')
fig_correlation.update_layout(
    template='plotly_white',
    height=500,
)
fig_correlation.add_hline(
    y=0,
    line_dash="dash",
    line_color="red",
    annotation_text="No Correlation",
    annotation_position="left bottom"
    )
fig_correlation.show()

# Grouping Years Based on Rainfall Patterns

Now, by applying k-means clustering, we will group years into three categories: Dry, Normal, and Wet, based on rainfall patterns:


In [26]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
#preprocessing data for clustering
rainfall_features = rainfall_data[['Jan-Feb','Mar-May','Jun-Sep','Oct-Dec','ANNUAL']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(rainfall_features)
print(scaled_features)

In [32]:
#perform k-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
rainfall_data['Rainfall_Cluster'] = kmeans.fit_predict(scaled_features)
print(rainfall_data[['YEAR', 'Rainfall_Cluster']])

     YEAR  Rainfall_Cluster
0    1901                 1
1    1902                 1
2    1903                 2
3    1904                 1
4    1905                 1
..    ...               ...
110  2011                 1
111  2012                 1
112  2013                 2
113  2014                 1
114  2015                 1

[115 rows x 2 columns]


In [33]:
#map cluster labels to catergories (eg. Dry, Normal and Wet)
cluster_labels = {
    0: 'Dry',
    1: 'Normal',
    2: 'Wet'
}
rainfall_data['Rainfall_Category'] = rainfall_data['Rainfall_Cluster'].map(cluster_labels)

In [39]:
#plotting
fig_cluster = px.scatter(
    rainfall_data,
    x='YEAR',
    y='ANNUAL',
    color='Rainfall_Category',
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_data= {'Rainfall_Cluster':True,'Rainfall_Category':True},
    labels={'YEAR': 'Year', 'ANNUAL': 'Annual Rainfall (mm)'},
    title='Rainfall Patterns in India (1901-2015)'
)
fig_cluster.update_layout(
    template='plotly_white',
    height=500
)

fig_cluster.show()

# Forecasting Future Rainfall

 we will use the Prophet library to forecast annual rainfall for the next 20 years:

In [None]:
rainfall_data['DATE'] = pd.to_datetime(rainfall_data['YEAR'], format='%Y')
annual_rainfall_ts = rainfall_data.set_index('DATE')['ANNUAL']
print(annual_rainfall_ts)

In [53]:
#forcasting using prophet algorithm
from prophet import Prophet
#prepare the data for prophet
prophet_data = annual_rainfall_ts.reset_index()
prophet_data.columns = ['ds', 'y']
# print(prophet_data)

from prophet.plot import plot_plotly, plot_components_plotly

prophet_model = Prophet()
prophet_model.fit(prophet_data)

#create a future dataframe for the next 20 years
future = prophet_model.make_future_dataframe(periods=20, freq='YE')
forecast = prophet_model.predict(future)
print("the predicted values from 2016-2035")
print(forecast[['ds', 'yhat']].tail(20))



INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpbica02lw/ik_4ppci.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpbica02lw/sc1zz_f1.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=40889', 'data', 'file=/tmp/tmpbica02lw/ik_4ppci.json', 'init=/tmp/tmpbica02lw/sc1zz_f1.json', 'output', 'file=/tmp/tmpbica02lw/prophet_model4jthimsd/prophet_model-20250305164041.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
16:40:41 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
16:40:41 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


the predicted values from 2016-2035
            ds         yhat
115 2015-12-31  1112.452604
116 2016-12-31  1121.831442
117 2017-12-31  1116.986029
118 2018-12-31  1112.077926
119 2019-12-31  1107.111483
120 2020-12-31  1116.490321
121 2021-12-31  1111.644907
122 2022-12-31  1106.736805
123 2023-12-31  1101.770361
124 2024-12-31  1111.149200
125 2025-12-31  1106.303786
126 2026-12-31  1101.395684
127 2027-12-31  1096.429240
128 2028-12-31  1105.808079
129 2029-12-31  1100.962665
130 2030-12-31  1096.054563
131 2031-12-31  1091.088119
132 2032-12-31  1100.466957
133 2033-12-31  1095.621544
134 2034-12-31  1090.713442


In [54]:
fig_forcast = plot_plotly(prophet_model, forecast)
fig_forcast.update_layout(
    title='Annual Rainfall Forecast for the Next 20 Years',
    xaxis_title='Year',
    yaxis_title='Rainfall (mm)',
    template='plotly_white',
    height=500,
)

fig_forcast.show()