#### ***Model build***

##### ***Aim:*** 
- To use the non-normalized rain data to infer seasonal patterns in the annual pollution level in the top 10 polluting rivers in South East Asia

##### ***Context:***
- **From the EDA**: There is clear evidence of seasonality in precipitation levels across South-East Asia and some evidence of correlation between yearly precipitation levels and annual pollution levels for each river

- Building on The Ocean Cleanup model which shows an annual pollution level for 1000 rivers

- Focusing on the top 10 polluting rivers in South East Asia

- Aiming to visualise this on a map, with an interactive slider to step through the months

- In my simplified version, precipitation is used as a proxy for Mobilization (P(M)) from The Ocean Cleanup model
    - Mobilization is then related to plastic pollution




##### ***Packages:***

In [95]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.express as px
import seaborn as sns
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

#### ***Grabbing Dataframes:***

In [96]:
# Load the three pickled dataframes this notebook works from.
# NOTE(review): the directory is an absolute path on one Windows machine, so
# this cell only runs there as written. Pickle files can execute code when
# loaded, so only point this at files you produced yourself.
data_dir = 'C:\\Users\\liamr\\Documents\\Playground\\backup\\river_plastic_pollution\\data\\'
ts_df, plastic_pollution, average_rain_and_pollution = (
    pd.read_pickle(data_dir + file_name)
    for file_name in (
        'ts_df.pkl',
        'SE_top_10_pts.pkl',
        'average_rain_and_pollution.pkl',
    )
)

#### ***Model Version 1:***

##### ***Plan:***

- Create a **weighted distribution** of the annual plastic pollution, by river
    - Weighted according to precipitation variation across the months at each location


##### ***Inputs:***

- Annual plastic pollution
- Monthly precipitation values (averaged across all years in `ts_df`, 1960-2019)
- Annual Precipitation 


##### ***Weighted Distribution Model:***

##### *Monthly Pollution = Annual Pollution x Weight*
- Where Weight = Average Monthly Precipitation / Total Annual Precipitation (so the 12 monthly weights for each river sum to 1)




In [97]:
# Preview the precipitation time series: one row per river, year and month.
ts_df

Unnamed: 0,river_ID,country,lon,lat,year,month,avg_precipitation
0,1,Malaysia,101.38875,3.00292,1960,1,263.625
1,2,Vietnam,106.74708,10.49875,1960,1,4.725
2,3,Thailand,100.55795,13.61573,1960,1,4.750
3,4,Myanmar,96.19083,16.77458,1960,1,7.125
4,5,Malaysia,110.39875,1.55542,1960,1,717.250
...,...,...,...,...,...,...,...
7195,6,Malaysia,101.41125,2.80458,2019,12,30.000
7196,7,Myanmar,96.10625,16.84125,2019,12,32.000
7197,8,Malaysia,102.24458,6.18208,2019,12,29.500
7198,9,Malaysia,103.34292,3.80625,2019,12,30.000


In [98]:
# Preview the pollution table: location, pollution volume and country per river.
plastic_pollution

Unnamed: 0,lon,lat,pollution_volume,country
13361,101.38875,3.00292,12816.0,Malaysia
14525,106.74708,10.49875,4127.44,Vietnam
15365,100.55795,13.61573,4027.33,Thailand
16338,96.19083,16.77458,3615.39,Myanmar
7006,110.39875,1.55542,3274.7,Malaysia
13349,101.41125,2.80458,2829.21,Malaysia
16367,96.10625,16.84125,2488.62,Myanmar
13615,102.24458,6.18208,2466.92,Malaysia
13414,103.34292,3.80625,2186.88,Malaysia
13241,103.7975,1.49625,2143.48,Malaysia


In [99]:
# Preview the per-river summary that links river_ID to its average
# precipitation, location, pollution volume and country.
average_rain_and_pollution

Unnamed: 0,river_ID,avg_precip_river,lon,lat,pollution_volume,country
0,1,31.795834,101.38875,3.00292,12816.0,Malaysia
1,2,31.095835,106.74708,10.49875,4127.44,Vietnam
2,3,32.706245,100.55795,13.61573,4027.33,Thailand
3,4,32.603127,96.19083,16.77458,3615.39,Myanmar
4,5,31.692709,110.39875,1.55542,3274.7,Malaysia
5,6,31.141668,101.41125,2.80458,2829.21,Malaysia
6,7,32.728127,96.10625,16.84125,2488.62,Myanmar
7,8,31.064587,102.24458,6.18208,2466.92,Malaysia
8,9,31.174999,103.34292,3.80625,2186.88,Malaysia
9,10,30.64896,103.7975,1.49625,2143.48,Malaysia


In [100]:
# Step 1: mean precipitation per river and calendar month, averaged over
# every year present in ts_df.
monthly_precipitation = (
    ts_df
    .groupby(['river_ID', 'month'])['avg_precipitation']
    .mean()
    .rename('avg_monthly_rain')
    .reset_index()
)

# Step 2: add the twelve monthly means together to get each river's
# average yearly rainfall total.
yearly_precipitation = (
    monthly_precipitation
    .groupby('river_ID')['avg_monthly_rain']
    .sum()
    .rename('avg_yearly_rain')
    .reset_index()
)

# Step 3: weight = the month's share of the yearly rainfall total, so the
# weights of one river add up to 1.
weights = monthly_precipitation.merge(yearly_precipitation, how='left', on='river_ID')
weights['weight'] = weights['avg_monthly_rain'].div(weights['avg_yearly_rain'])

# Step 4: attach the country of each river, used later to label the plots.
countries = average_rain_and_pollution[['river_ID', 'country']]
weights = weights.merge(countries, how='left', on='river_ID')

In [101]:
# Preview the weights table: one row per river and month.
weights

Unnamed: 0,river_ID,month,avg_monthly_rain,avg_yearly_rain,weight,country
0,1,1,57.581249,704.434143,0.081741,Malaysia
1,1,2,47.967915,704.434143,0.068094,Malaysia
2,1,3,61.775414,704.434143,0.087695,Malaysia
3,1,4,64.261253,704.434143,0.091224,Malaysia
4,1,5,54.627918,704.434143,0.077549,Malaysia
...,...,...,...,...,...,...
115,10,8,45.442501,704.796265,0.064476,Malaysia
116,10,9,47.408749,704.796265,0.067266,Malaysia
117,10,10,55.966251,704.796265,0.079408,Malaysia
118,10,11,67.606247,704.796265,0.095923,Malaysia


In [102]:
# Overview chart: the monthly weights of every river in a single bar figure,
# with the bars coloured by river_ID.
fig = px.bar(
    data_frame=weights,
    x='month',
    y='weight',
    color='river_ID',
    title='Weights for each river',
)
fig.show()


In [103]:
# Visualise the monthly rainfall weights per river: ten bar charts
# (x = month, y = weight) laid out in a grid of 2 rows x 5 columns.
#
# Fix: every panel now receives the trace of its own river. Previously the
# trace of river 3 was added to the panels of rivers 4-10 as well.
n_cols = 5

# One slice of `weights` per river, keyed by river_ID (1..10).
river_frames = {
    river_id: weights[weights['river_ID'] == river_id]
    for river_id in range(1, 11)
}

# river_1 ... river_10 stay available as notebook-level names for any later
# cell that refers to the per-river slices directly.
(river_1, river_2, river_3, river_4, river_5,
 river_6, river_7, river_8, river_9, river_10) = river_frames.values()

titles = []
weight_traces = []
for river_id, river_weights in river_frames.items():
    # The country was merged in per river_ID, so it is the same on every row
    # of the slice; the month-1 row is used to read it.
    country = river_weights.loc[river_weights['month'] == 1, 'country'].iloc[0]
    titles.append(f'River {river_id}: {country}')

    # Build the bar chart with plotly express and keep only its single trace
    # so that it can be placed inside the subplot grid.
    river_fig = px.bar(river_weights, x='month', y='weight',
                       title=f'Weights for River {river_id}')
    weight_traces.append(river_fig.data[0])

# Fill the grid row by row: positions 0-4 go to row 1, positions 5-9 to row 2.
fig = make_subplots(rows=2, cols=n_cols, subplot_titles=titles)
for position, trace in enumerate(weight_traces):
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# add title to the figure
fig.update_layout(title_text="River Weights", title_x=0.5)
fig.show()


##### ***Weighted Distribution of Annual Pollution:***

In [104]:
# Keep only the river key and its annual pollution volume.
pollution_volume = average_rain_and_pollution.loc[:, ['river_ID', 'pollution_volume']]

In [105]:
# Annual pollution volume per river, keyed by river_ID.
pollution_volume = average_rain_and_pollution.loc[:, ['river_ID', 'pollution_volume']]

# Put the annual volume next to every (river, month) weight row.
monthly_pollution_df = weights.merge(pollution_volume, how='left', on='river_ID')

In [106]:
# Spread each river's annual pollution over the months:
# monthly_pollution = annual pollution volume x that month's weight.
monthly_pollution_df['monthly_pollution'] = (
    monthly_pollution_df['pollution_volume'].mul(monthly_pollution_df['weight'])
)
monthly_pollution_df

Unnamed: 0,river_ID,month,avg_monthly_rain,avg_yearly_rain,weight,country,pollution_volume,monthly_pollution
0,1,1,57.581249,704.434143,0.081741,Malaysia,12816.00,1047.594441
1,1,2,47.967915,704.434143,0.068094,Malaysia,12816.00,872.695953
2,1,3,61.775414,704.434143,0.087695,Malaysia,12816.00,1123.900203
3,1,4,64.261253,704.434143,0.091224,Malaysia,12816.00,1169.125923
4,1,5,54.627918,704.434143,0.077549,Malaysia,12816.00,993.863535
...,...,...,...,...,...,...,...,...
115,10,8,45.442501,704.796265,0.064476,Malaysia,2143.48,138.203188
116,10,9,47.408749,704.796265,0.067266,Malaysia,2143.48,144.183091
117,10,10,55.966251,704.796265,0.079408,Malaysia,2143.48,170.208816
118,10,11,67.606247,704.796265,0.095923,Malaysia,2143.48,205.609254


In [107]:


# Visualise the modelled monthly pollution per river: ten bar charts
# (x = month, y = monthly_pollution) laid out in a grid of 2 rows x 5 columns.
#
# Fixes compared with the previous version of this cell:
# - the bars show `monthly_pollution` instead of the rainfall `weight`;
# - every river is sliced from `monthly_pollution_df` (the `weights` table
#   has no pollution column);
# - each country label is read from that river's own rows, so the cell no
#   longer depends on per-river variables created by an earlier cell.
n_cols = 5

titles = []
pollution_traces = []
for river_id in range(1, 11):
    river_pollution = monthly_pollution_df[monthly_pollution_df['river_ID'] == river_id]

    # The country is the same on every row of one river; the month-1 row is
    # used to read it.
    river_country = river_pollution.loc[river_pollution['month'] == 1, 'country'].iloc[0]
    titles.append(f'River {river_id}: {river_country}')

    # Build the bar chart with plotly express and keep only its single trace
    # so that it can be placed inside the subplot grid.
    river_fig = px.bar(river_pollution, x='month', y='monthly_pollution',
                       title=f'Monthly Pollution for River {river_id}')
    pollution_traces.append(river_fig.data[0])

# Fill the grid row by row: positions 0-4 go to row 1, positions 5-9 to row 2.
fig_pollution = make_subplots(rows=2, cols=n_cols, subplot_titles=titles)
for position, trace in enumerate(pollution_traces):
    grid_row, grid_col = divmod(position, n_cols)
    fig_pollution.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# add title to the figure
fig_pollution.update_layout(title_text="River Pollution by Month", title_x=0.5)
# update the colors to red
fig_pollution.update_traces(marker_color='red')
fig_pollution.show()

In [None]:
# import plotly.io as pio
# pio.renderers.default = "iframe"
# fig.show()

# NEXT STEPS
# sort out above figure
# bring long and lat back into the df
# create a map of the rivers with a slider of the changing months
# bring into a website / web app format (use streamlit/dash/Bill's tips)
# can I build on this baseline model??