## Data Cleaning

### The entire dataset

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
import branca
import plotly.express as px
import plotly.graph_objects as go

In [23]:
# entire dataset
nabr = pd.read_csv('./data/NABR_historic.csv')
nearterm = pd.read_csv('./data/nearterm_data_2020-2024.csv')

In [24]:
nabr.head()

Unnamed: 0,long,lat,year,TimePeriod,RCP,scenario,treecanopy,Ann_Herb,Bare,Herb,...,PPT_Annual,T_Winter,T_Summer,T_Annual,Tmax_Summer,Tmin_Winter,VWC_Winter_whole,VWC_Spring_whole,VWC_Summer_whole,VWC_Fall_whole
0,-110.0472,37.60413,1980,Hist,historical,sc1,0,0,84,5,...,13.79,0.964835,23.15924,23.15924,37.05,,,,,
1,-110.0472,37.60413,1980,Hist,historical,sc1,0,0,84,5,...,2.69,0.964835,23.15924,0.964835,37.05,,,,,
2,-110.0472,37.60413,1980,Hist,historical,sc1,0,0,84,5,...,13.79,0.964835,23.15924,0.964835,37.05,,,,,
3,-110.0472,37.60413,1980,Hist,historical,sc1,0,0,84,5,...,2.69,0.964835,23.15924,23.15924,37.05,,,,,
4,-110.0472,37.60413,1980,Hist,historical,sc1,0,0,84,5,...,,,,,,-12.45,0.113447,0.096831,0.041876,0.052298


In [25]:
nearterm.head()

Unnamed: 0,long,lat,year,TimePeriod,RCP,scenario,treecanopy,Ann_Herb,Bare,Herb,...,PPT_Annual,T_Winter,T_Summer,T_Annual,Tmax_Summer,Tmin_Winter,VWC_Winter_whole,VWC_Spring_whole,VWC_Summer_whole,VWC_Fall_whole
0,-110.0472,37.60413,2021,NT,4.5,sc22,0,0,84,5,...,6.37,1.630333,24.50402,24.50402,36.89,,,,,
1,-110.0472,37.60413,2021,NT,4.5,sc22,0,0,84,5,...,,,,,,-12.77,0.114652,0.078764,0.043514,0.051281
2,-110.0472,37.60413,2021,NT,4.5,sc23,0,0,84,5,...,3.09,1.389056,24.11043,24.11043,37.95,,,,,
3,-110.0472,37.60413,2021,NT,4.5,sc23,0,0,84,5,...,,,,,,-18.96,0.130221,0.096412,0.041232,0.092241
4,-110.0472,37.60413,2021,NT,4.5,sc24,0,0,84,5,...,6.87,-0.334389,25.54266,10.31321,37.74,,,,,


In [26]:
# lowercase column names
nabr.columns = nabr.columns.str.lower()
nearterm.columns = nearterm.columns.str.lower()

# merge the two datasets
df = pd.concat([nabr, nearterm], axis=0)

# save the cleaned data to a csv file
df.to_csv('./data/cleaned_data/final_data.csv', index=False)

### Greenery area dataset

In [27]:
# Greenery data
# extract the relevant data
greenery_columns = ['long', 'lat', 'year', 'rcp', 'treecanopy', 'bare', 'herb', 'ann_herb', 'litter', 'shrub']
greenery = df[greenery_columns]

# calculate the total greenery
greenery['total'] = greenery['treecanopy'] + greenery['herb'] + greenery['ann_herb'] + greenery['shrub'] + greenery['litter'] - greenery['bare'] 

# save the data to csv file
greenery.to_csv('./data/cleaned_data/greenery_data.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  greenery['total'] = greenery['treecanopy'] + greenery['herb'] + greenery['ann_herb'] + greenery['shrub'] + greenery['litter'] - greenery['bare']


### VWC dataset

In [28]:
# VWC data
# extract the relevant data
vwc_columns = ['long', 'lat', 'year', 'rcp', 'vwc_winter_whole', 'vwc_spring_whole', 'vwc_summer_whole', 'vwc_fall_whole']
vwc = df[vwc_columns]

# deal with NaN values
# fill the null values with the mean of the column
null_columns = ['vwc_winter_whole', 'vwc_spring_whole', 'vwc_summer_whole', 'vwc_fall_whole']
vwc.loc[:, null_columns] = vwc[null_columns].fillna(vwc[null_columns].mean())

# save it to a new csv file
vwc.to_csv('./data/cleaned_data/vwc_data.csv', index=False)

### Summer dataset

In [29]:
# summer related data
# extract the relevant data
summer_columns = ['long', 'lat', 'year', 'rcp', 'drysoildays_summer_whole', 'evap_summer', 'nondryswa_summer_whole', 'ppt_summer', 'tmax_summer']
summer = df[summer_columns]

# fill missing values with the mean
summer.loc[:, 'drysoildays_summer_whole'] = summer['drysoildays_summer_whole'].fillna(summer['drysoildays_summer_whole'].mean())
summer.loc[:, 'evap_summer'] = summer['evap_summer'].fillna(summer['evap_summer'].mean())
summer.loc[:, 'nondryswa_summer_whole'] = summer['nondryswa_summer_whole'].fillna(summer['nondryswa_summer_whole'].mean())
summer.loc[:, 'ppt_summer'] = summer['ppt_summer'].fillna(summer['ppt_summer'].mean())
summer.loc[:, 'tmax_summer'] = summer['tmax_summer'].fillna(summer['tmax_summer'].mean())

# round the columns
summer = summer.round({'drysoildays_summer_whole': 3, 'evap_summer': 3, 'nondryswa_summer_whole': 3, 'ppt_summer': 3, 'tmax_summer': 3})

# save the data to csv file
summer.to_csv('./data/cleaned_data/summer_data.csv', index=False)

## Visualization

### Greenery area dataset

In [57]:
greenery.head()

Unnamed: 0,long,lat,year,rcp,treecanopy,bare,herb,ann_herb,litter,shrub,total
0,-110.0472,37.60413,1980,historical,0,84,5,0,11,7,-61
1,-110.0472,37.60413,1980,historical,0,84,5,0,11,7,-61
2,-110.0472,37.60413,1980,historical,0,84,5,0,11,7,-61
3,-110.0472,37.60413,1980,historical,0,84,5,0,11,7,-61
4,-110.0472,37.60413,1980,historical,0,84,5,0,11,7,-61


In [58]:
greenery_total = greenery.groupby(['long', 'lat'])['total'].mean().reset_index()

In [59]:
# create a folium heatmap
map_center = [greenery_total['lat'].mean(), greenery_total['long'].mean()]
map_width = 800  
map_height = 600 
mymap = folium.Map(location=map_center, zoom_start=13, width=map_width, height=map_height)

heat_data_green = greenery_total[['lat', 'long', 'total']].reset_index(drop=True).values.tolist()

In [60]:
# the higher the gradient, the more greenery there is. 
HeatMap(heat_data_green, radius=20, blur=10, gradient={0.2: 'red', 0.4: 'brown', 0.6: 'yellow', 0.8: '#32CD32', 1: 'green'}).add_to(mymap)

mymap