# Team 40 | Visualization

*Natural Disaster Projection Due To Climate Change Effects*

* Luis Ruiz Ponce.
* Simón Vallejo.
* Malcom Giraldo.
* Christian Fuertes.
* Juan Felipe Monsalvo.
* Sandra Barreto.
* Guillermo Giraldo.
* Francisco Rodriguez.


## Libraries import

In [1]:
import os
import pandas as pd
import numpy as np

import plotly
import plotly.express as px
import plotly.graph_objects as go

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import IFrame

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##Function definition

In [3]:
def missing_percentage(df):
  """
  This function calculate the percentage of missing values by columns in a dataframe
  INPUT:
    df: Pandas dataframe to analyze
  RETURN:
    missing_df: df containing the number and percentage of missing in every columns
  """
  missing_df = df.isnull().sum().to_frame().reset_index()
  missing_df.columns=["Features", "# Missing Values"]
  missing_df["% of Missing Values"] = (missing_df["# Missing Values"] / len(df) ) * 100

  return missing_df

##Mounting the Google Drive

In [4]:
# Loading our own drive from google
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


## Exploring the folders

In [5]:
# Path variables
data_path = '/drive/MyDrive/DS4A - Team 40/00_DATA'
raw_data_path = data_path + '/00_RAW_DATA'
clean_data_path = data_path + '/01_CLEAN_DATA'

In [6]:
# All the files and folders under our path
print(f'These are the files inside directory 00_DATA {os.listdir(data_path)}')
print(f'These are the files inside directory 00_DATA/00_RAW_DATA {os.listdir(raw_data_path)}')
print(f'These are the files inside directory 00_DATA/01_CLEAN_DATA {os.listdir(clean_data_path)}')

These are the files inside directory 00_DATA ['01_Visualization_Disasters| Team 40.ipynb', '00_RAW_DATA', '01_CLEAN_DATA', 'XX_HTML', 'Data_EDA_Country | Team 40.ipynb', 'EDA.oxps', 'Images', 'Data_EDA Test| Team 40.ipynb', 'Data_EDA_Luis | Team 40.ipynb', 'Data_EDA Climate| Team 40.ipynb', 'Read_netCDF - Climate| Team 40.ipynb', 'Visualization_Temp| Team 40.ipynb', 'Data_EDA Disasters| Team 40.ipynb']
These are the files inside directory 00_DATA/00_RAW_DATA ['GlobalLandTemperaturesByCity.csv', 'GlobalLandTemperaturesByCountry.csv', 'GlobalTemperatures.csv', 'GlobalLandTemperaturesByState.csv', 'GlobalLandTemperaturesByMajorCity.csv', 'DISASTERS', 'New', 'new_disaster_data_EXCEL.xlsx', 'disaster_data.csv', 'climatology_Months.nc', 'climatology_Year.nc', 'Temperature_by_countries', 'temp_mean.7z', 'tas_timeseries_monthly_cru_1901-2020_ABW.csv', 'Temperature_by_countries_fixed', 'temp_by_country.csv', 'ISO_country.csv', 'temp_by_country_1.csv']
These are the files inside directory 00_DAT

## Visualization

### Empty dataframe

In [27]:
empty_df = pd.DataFrame(data= range(1960, 2022), columns=['Year'])
empty_df

Unnamed: 0,Year
0,1960
1,1961
2,1962
3,1963
4,1964
...,...
57,2017
58,2018
59,2019
60,2020


### Disasters data

#### disaster_data.csv

In [7]:
# Loading the clean data file as pandas  dataframe
filename = clean_data_path + '/Disaster_Clean.xlsx'
df_disaster = pd.read_excel(filename)

print(f'This is the file being loaded {filename}')
df_disaster.head()

This is the file being loaded /drive/MyDrive/DS4A - Team 40/00_DATA/01_CLEAN_DATA/Disaster_Clean.xlsx


Unnamed: 0,Year,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,ISO,Region,Continent,...,Latitude,Longitude,River Basin,Start Month,End Year,End Month,Total Deaths,Total Affected,Total Damages ('000 US$),"Total Damages, Adjusted ('000 US$)"
0,1960,Geophysical,Earthquake,Tsunami,,,Chile,CHL,South America,Americas,...,-38.143,-73.407,,5.0,1960,5.0,6000.0,2003000.0,550000.0,5039165
1,1960,Meteorological,Storm,Tropical cyclone,,Donna,Anguilla,AIA,Caribbean,Americas,...,,,,9.0,1960,9.0,5.0,1250.0,35000.0,320674
2,1960,Meteorological,Storm,Tropical cyclone,,Donna,Netherlands Antilles,NLD,Caribbean,Americas,...,,,,9.0,1960,9.0,,,,0
3,1960,Meteorological,Storm,Tropical cyclone,,Donna,Antigua and Barbuda,ATG,Caribbean,Americas,...,,,,9.0,1960,9.0,2.0,,,0
4,1960,Meteorological,Storm,Tropical cyclone,,,Bangladesh,BGD,Southern Asia,Asia,...,,,,10.0,1960,10.0,3000.0,,,0


##### Exploration

In [10]:
df_selection = df_disaster[df_disaster['ISO'] == 'TCD']
df_selection

Unnamed: 0,Year,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,ISO,Region,Continent,...,Latitude,Longitude,River Basin,Start Month,End Year,End Month,Total Deaths,Total Affected,Total Damages ('000 US$),"Total Damages, Adjusted ('000 US$)"
732,1966,Climatological,Drought,Drought,,,Chad,TCD,Middle Africa,Africa,...,,,,3.0,1966,,,,,0
823,1969,Climatological,Drought,Drought,,,Chad,TCD,Middle Africa,Africa,...,,,,10.0,1978,,,900000.0,83000.0,613098
921,1971,Biological,Epidemic,Bacterial disease,,Cholera,Chad,TCD,Middle Africa,Africa,...,,,,5.0,1971,5.0,2312.0,7476.0,,0
1141,1977,Meteorological,Storm,,,,Chad,TCD,Middle Africa,Africa,...,,,,5.0,1977,5.0,13.0,100.0,,0
2731,1981,Climatological,Drought,Drought,,,Chad,TCD,Middle Africa,Africa,...,,,,11.0,1985,,3000.0,1500000.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14989,2019,Hydrological,Landslide,Landslide,,,Chad,TCD,Middle Africa,Africa,...,,,,9.0,2019,9.0,52.0,37.0,,0
15031,2019,Hydrological,Flood,,,,Chad,TCD,Middle Africa,Africa,...,,,,9.0,2019,9.0,,423.0,,0
15154,2020,Hydrological,Flood,,,,Chad,TCD,Middle Africa,Africa,...,962.794,180.854,,4.0,2020,4.0,,2062.0,,0
15155,2020,Hydrological,Flood,,,,Chad,TCD,Middle Africa,Africa,...,,,,8.0,2020,10.0,3.0,34872.0,,0


In [35]:
df_selection_group = df_selection.groupby(by=["Year"]).size().reset_index()
df_selection_group.columns = ["Year", "Count"]
type(df_selection_group['Year'].min())

numpy.int64

In [33]:
df_join = pd.merge(left=df_selection_group, right=empty_df, on='Year', how='right')
df_join.fillna(0, inplace=True)
df_join

Unnamed: 0,Year,Count
0,1960,0.0
1,1961,0.0
2,1962,0.0
3,1963,0.0
4,1964,0.0
...,...,...
57,2017,2.0
58,2018,1.0
59,2019,2.0
60,2020,2.0


In [8]:
df_disaster["Country"][0]
title = f'Disasters in {df_disaster["Country"][0]}'
title

'Disasters in Chile'

In [None]:
df_disaster.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15350 entries, 0 to 15349
Data columns (total 24 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Year                                15350 non-null  int64  
 1   Disaster Subgroup                   15350 non-null  object 
 2   Disaster Type                       15350 non-null  object 
 3   Disaster Subtype                    12366 non-null  object 
 4   Disaster Subsubtype                 1069 non-null   object 
 5   Event Name                          3798 non-null   object 
 6   Country                             15350 non-null  object 
 7   ISO                                 15350 non-null  object 
 8   Region                              15350 non-null  object 
 9   Continent                           15350 non-null  object 
 10  Location                            13834 non-null  object 
 11  Origin                              3842 

In [None]:
disaster_subgroup_list = df_disaster["Disaster Subgroup"].unique()
disaster_subgroup_list

array(['Geophysical', 'Meteorological', 'Hydrological', 'Climatological',
       'Biological'], dtype=object)

In [None]:
riverine_df = df_disaster[df_disaster["Disaster Subtype"] == 'Riverine flood']
riverine_df.head()

Unnamed: 0,Year,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,ISO,Region,Continent,...,Latitude,Longitude,River Basin,Start Month,End Year,End Month,Total Deaths,Total Affected,Total Damages ('000 US$),"Total Damages, Adjusted ('000 US$)"
98,1966,Hydrological,Flood,Riverine flood,,,Argentina,ARG,South America,Americas,...,,,,2.0,1966,2.0,62.0,120000.0,30000.0,250447
107,1966,Hydrological,Flood,Riverine flood,,,Bolivia (Plurinational State of),BOL,South America,Americas,...,,,,2.0,1966,2.0,,5500.0,500.0,4174
131,1966,Hydrological,Flood,Riverine flood,,,Indonesia,IDN,South-Eastern Asia,Asia,...,,,,3.0,1966,3.0,176.0,524100.0,33000.0,275492
210,1969,Hydrological,Flood,Riverine flood,,,Colombia,COL,South America,Americas,...,,,,12.0,1969,12.0,,70000.0,500.0,3693
272,1971,Hydrological,Flood,Riverine flood,,,Australia,AUS,Australia and New Zealand,Oceania,...,,,,2.0,1971,2.0,27.0,,20000.0,133840


In [None]:
riverine_df["Region"].unique()

array(['South America', 'South-Eastern Asia', 'Australia and New Zealand',
       'Western Europe', 'Southern Asia', 'Southern Europe',
       'Eastern Africa', 'Eastern Asia', 'Caribbean', 'Northern America',
       'Western Africa', 'Southern Africa', 'Middle Africa',
       'Central America', 'Eastern Europe', 'Northern Africa',
       'Russian Federation', 'Northern Europe', 'Central Asia',
       'Western Asia', 'Melanesia'], dtype=object)

In [None]:
riverine_group = riverine_df.groupby(by=["Year", "Region"]).size().reset_index().rename(columns={0:'Count'})
riverine_group

Unnamed: 0,Year,Region,Count
0,1964,Eastern Africa,1
1,1965,South America,1
2,1965,Southern Europe,1
3,1966,South America,3
4,1966,South-Eastern Asia,4
...,...,...,...
552,2020,Western Africa,1
553,2021,Eastern Africa,1
554,2021,Northern America,1
555,2021,South America,5


In [None]:
fig = px.line(riverine_group, x="Year", y="Count", color='Region', title='# of Riverine flood by Region')
fig.update_layout(modebar_add=["v1hovermode", "toggleSpikeLines"])

fig.show()

In [None]:

new_continent = []
for index, row in df_disaster.iterrows():
  if row['Continent'] == "Americas":
    new_continent.append(row["Region"])
  elif row['ISO'] == "AUS":
    new_continent.append('Australia')
  else:
    new_continent.append(row["Continent"])




df_disaster["Continent_max"] = new_continent

df_disaster["Continent_max"].replace({"Northern America": "North America", "Caribbean": "South America", "Central America": "North America" }, inplace=True)

In [None]:
df_disaster["Continent_max"].unique()

array(['South America', 'Asia', 'Africa', 'North America', 'Europe',
       'Oceania', 'Australia'], dtype=object)

In [None]:
disaster_groupby = df_disaster.groupby(by=["ISO"]).size().reset_index().rename(columns={'ISO':'Country', 0:'Count'})
disaster_groupby.head()

Unnamed: 0,Country,Count
0,AFG,200
1,AGO,74
2,AIA,7
3,ALB,38
4,ANT,3


In [None]:
fig = px.choropleth(disaster_groupby, locations="Country",
                    color="Count", # number of disasters in each country by year,
                    color_continuous_scale=px.colors.sequential.Plasma,
                    hover_name="Country", # column to add to hover information
                    scope = 'world',
                    title = "Number of disasters by Country")

fig.update_layout(
    geo=dict(
        showframe=True,
        showcoastlines=True,
        projection_type='natural earth'
    ),
)


fig.show()

In [None]:
import requests
import geopandas as gpd

ModuleNotFoundError: ignored

In [None]:
gdf = gpd.read_file(
    gpd.datasets.get_path("naturalearth_lowres"),
)
gdf

NameError: ignored

In [None]:
cont.json()

{'features': [{'geometry': {'coordinates': [[[[93.27554321289062,
        80.26361083984375],
       [93.14804077148438, 80.31387329101562],
       [91.42491149902344, 80.31011962890625],
       [92.60404968261719, 80.39044189453125],
       [91.90304565429688, 80.4585952758789],
       [92.7744369506836, 80.51596069335938],
       [93.32554626464844, 80.80581665039062],
       [92.49365997314453, 80.7675552368164],
       [93.16046905517578, 80.93373107910156],
       [93.05929565429688, 80.99179077148438],
       [95.53873443603516, 81.21942901611328],
       [95.12220764160156, 81.27082824707031],
       [95.78776550292969, 81.27998352050781],
       [97.96582794189453, 80.7100601196289],
       [97.129150390625, 80.66276550292969],
       [97.01693725585938, 80.52963256835938],
       [97.42095947265625, 80.31359100341797],
       [97.15887451171875, 80.23359680175781],
       [95.564697265625, 80.19247436523438],
       [93.71971130371094, 79.994140625],
       [92.07630157470703,

In [None]:
cont = requests.get(
    "https://gist.githubusercontent.com/hrbrmstr/91ea5cc9474286c72838/raw/59421ff9b268ff0929b051ddafafbeb94a4c1910/continents.json"
)
gdf = gpd.GeoDataFrame.from_features(cont.json())

gdf = gdf.assign(
    total_pageviews=np.random.randint(10 ** 7, 10 ** 9, len(gdf))
).set_index("CONTINENT")

Unnamed: 0_level_0,geometry,total_pageviews
CONTINENT,Unnamed: 1_level_1,Unnamed: 2_level_1
Asia,"MULTIPOLYGON (((93.27554 80.26361, 93.14804 80...",226223928
North America,"MULTIPOLYGON (((-25.28167 71.39166, -25.62389 ...",395077441
Europe,"MULTIPOLYGON (((58.06138 81.68776, 57.88986 81...",851228857
Africa,"MULTIPOLYGON (((0.69465 5.77337, 0.63583 5.944...",392100753
South America,"MULTIPOLYGON (((-81.71306 12.49028, -81.72015 ...",488710292
Oceania,"MULTIPOLYGON (((-177.39334 28.18416, -177.3879...",399833836
Australia,"MULTIPOLYGON (((142.27997 -10.26556, 142.18942...",939122274
Antarctica,"MULTIPOLYGON (((51.80305 -46.45667, 51.71055 -...",970935226


In [None]:
disaster_groupby = df_disaster.groupby(by=["Continents"]).size().reset_index().rename(columns={0:'Count'})
disaster_groupby

KeyError: ignored

In [None]:
fig = px.choropleth(disaster_groupby, 
                    geojson=gdf.geometry,
                    locations="Continent_max",
                    color="Count", # number of disasters in each country by year,
                    color_continuous_scale=px.colors.sequential.Plasma,
                    hover_name="Continent_max", # column to add to hover information
                    scope = 'world',
                    title = "Number of disasters by Country")

fig.update_layout(
    geo=dict(
        showframe=True,
        showcoastlines=True,
        projection_type='natural earth'
    ),
)


fig.show()

NameError: ignored

Analisemos por tipos de desastres

In [None]:
df_disaster["Disaster Subgroup"].unique()

array(['Geophysical', 'Meteorological', 'Hydrological', 'Climatological',
       'Biological'], dtype=object)

In [None]:
Disaster_subgroup_df = (df_disaster.groupby(by=["Year","Disaster Subgroup", "Continent"]).agg({'ISO':'count', 'Total Deaths': 'sum'}).reset_index()
.rename(columns={'ISO':'Number of disasters'})
)
Disaster_subgroup_df["Total Deaths"] = np.log(Disaster_subgroup_df["Total Deaths"] + 1 )
#print(Disaster_subgroup_df)

fig = px.scatter(Disaster_subgroup_df, x="Number of disasters", y="Total Deaths", animation_frame="Year", animation_group="Continent",
           color="Disaster Subgroup", size="Total Deaths", size_max=30, range_y=[-5,20], range_x=[0,150],  hover_name="Continent",
           title="Total Deaths vs Number of disaster by year, Continent and subgroup of disaster"
           )

fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000

fig.show()

In [None]:
Disaster_subgroup_df = (df_disaster.groupby(by=["Year","Disaster Subgroup", "Disaster Type"]).agg({'ISO':'count', 'Total Deaths': 'sum'}).reset_index()
.rename(columns={'ISO':'Number of disasters'})
)
Disaster_subgroup_df["Total Deaths"] = np.log(Disaster_subgroup_df["Total Deaths"] + 1 )
#print(Disaster_subgroup_df)

fig = px.scatter(Disaster_subgroup_df, x="Number of disasters", y="Total Deaths", animation_frame="Year", animation_group="Disaster Type",
           color="Disaster Subgroup", size="Total Deaths", size_max=30, range_y=[-5,20], range_x=[0,250],  hover_name="Disaster Type",
           title="Total Deaths vs Number of disaster by year,subgroup and type of disaster"
           )

fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000

fig.show()

In [None]:
Disaster_subgroup_df = (df_disaster.groupby(by=["Year","Disaster Subgroup", "Continent"]).agg({'ISO':'count', 'Total Deaths': 'sum'}).reset_index()
.rename(columns={'ISO':'Number of disasters'})
)
Disaster_subgroup_df["Total Deaths"] = np.log(Disaster_subgroup_df["Total Deaths"] + 1 )
#print(Disaster_subgroup_df)

fig = px.scatter(Disaster_subgroup_df, x="Disaster Subgroup", y="Total Deaths", animation_frame="Year", #animation_group="country",
           color="Continent", size="Number of disasters", size_max=100, #  hover_name="country",
           title="Total Deaths vs disaster subgropu  by year in each continent"
           )

fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000

fig.show()

##### Geophysical Disaster

Analisaremos los desastres del tipo geofisicos

In [None]:
Geophysical_df = df_disaster[df_disaster["Disaster Subgroup"]=='Geophysical']

Veamos como esta constituido

In [None]:
Geophysical_df.groupby(by=["Disaster Subgroup", "Disaster Type"]).size().reset_index()

Unnamed: 0,Disaster Subgroup,Disaster Type,0
0,Geophysical,Earthquake,1231
1,Geophysical,Mass movement (dry),43
2,Geophysical,Volcanic activity,235


Podemos ver que la gran mayoria de estos desastres son terremotos, seguido de actividades volcanicas

In [None]:
Geophysical_df_plot = Geophysical_df.groupby(by=["Year","Disaster Type"]).size().reset_index()

Geophysical_df_plot.columns = ["Year","Disaster Type", "Count"]

fig = px.line(Geophysical_df_plot, x="Year", y="Count", color='Disaster Type', title='# Disasters by Year by Geophysical')
fig.update_layout(modebar_add=["v1hovermode", "toggleSpikeLines"])

fig.show()

Si analizamos por subtipo de desastre podemos apreciar que los terremotos estan constituidos por movimientos de tierras o tsunami.

In [None]:
Geophysical_df.groupby(by=["Disaster Subgroup", "Disaster Type", "Disaster Subtype"]).size().reset_index()

Unnamed: 0,Disaster Subgroup,Disaster Type,Disaster Subtype,0
0,Geophysical,Earthquake,Ground movement,1183
1,Geophysical,Earthquake,Tsunami,46
2,Geophysical,Mass movement (dry),Avalanche,5
3,Geophysical,Mass movement (dry),Landslide,29
4,Geophysical,Mass movement (dry),Rockfall,7
5,Geophysical,Mass movement (dry),Subsidence,1
6,Geophysical,Volcanic activity,Ash fall,215
7,Geophysical,Volcanic activity,Lava flow,9
8,Geophysical,Volcanic activity,Pyroclastic flow,3


In [None]:
#Earthquake
Earthquake_df = Geophysical_df[Geophysical_df["Disaster Type"] == "Earthquake"]

fig = px.density_mapbox(Earthquake_df, lat='Latitude', lon='Longitude', z='Dis Mag Value', radius=8,
                        hover_data = ['Dis Mag Value', 'Latitude', 'Longitude', 'Total Deaths'], 
                        mapbox_style="open-street-map", center=dict(lat=0, lon=180), zoom=0, 
                        title = "Location of all the Earthquake")

fig.show()

In [None]:
#Earthquake
Earthquake_df = Geophysical_df[Geophysical_df["Disaster Type"] == "Earthquake"]

fig = px.density_mapbox(Earthquake_df, lat='Latitude', lon='Longitude', z='Dis Mag Value', radius=8,
                        center=dict(lat=0, lon=180), zoom=0, hover_data = ['Dis Mag Value', 'Latitude', 'Longitude', 'Total Deaths'], 
                        animation_frame = "Year",
                        mapbox_style="open-street-map",
                        title = "Location of all the Earthquake by year")

fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000

fig.show()

##### Meteorological Disaster

In [None]:
Meteo_df = df_disaster[df_disaster["Disaster Subgroup"]=='Meteorological']

In [None]:
Meteo_df["Disaster Type"].unique()

array(['Storm', 'Extreme temperature'], dtype=object)

In [None]:
Meteo_groupby = Meteo_df.groupby(by=["ISO"]).size().reset_index().rename(columns={'ISO':'Country', 0:'Count'})
Meteo_groupby

Unnamed: 0,Country,Count
0,AFG,16
1,AIA,6
2,ALB,7
3,ANT,3
4,ARG,31
...,...,...
193,WSM,11
194,YEM,6
195,YUG,1
196,ZAF,34


In [None]:
fig = px.choropleth(Meteo_groupby, locations="Country",
                    color="Count", # number of disasters in each country by year,
                    color_continuous_scale="Viridis",
                    hover_name="Country", # column to add to hover information
                    title = "Number of meteorological disasters by Country")
fig.show()

### Climate change Data

#### GlobalLandTemperaturesByCity.csv

This is the source to the dataset: https://www.kaggle.com/datasets/berkeleyearth/climate-change-earth-surface-temperature-data.

In [None]:
# Loading the data file as pandas  dataframe
filename = raw_data_path + '/GlobalLandTemperaturesByCity.csv'
df_temperature_by_city = pd.read_csv(filename)

print(f'This is the file fo this section {filename}')
df_temperature_by_city.head(5)

This is the file fo this section /drive/MyDrive/DS4A - Team 40/00_DATA/00_RAW_DATA/GlobalLandTemperaturesByCity.csv


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [None]:
df_temperature_by_city.tail()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
8599207,2013-05-01,11.464,0.236,Zwolle,Netherlands,52.24N,5.26E
8599208,2013-06-01,15.043,0.261,Zwolle,Netherlands,52.24N,5.26E
8599209,2013-07-01,18.775,0.193,Zwolle,Netherlands,52.24N,5.26E
8599210,2013-08-01,18.025,0.298,Zwolle,Netherlands,52.24N,5.26E
8599211,2013-09-01,,,Zwolle,Netherlands,52.24N,5.26E


**Variables Available**

**dt**: Date of the sample.

**AverageTemperature**: Average temperature in Celsius degrees. 

**AverageTemperatureUncertainty**: Estimated average error of the temperature in Celsius degrees.  

**City**:  City where the sample was taken.

**Country**:  Country where the sample was taken.

**Latitude**: Exact Latitude where the sample was taken.

**Longitude**: Exact Longitude where the sample was taken.


##### Exploration

In [None]:
print(f'The file has a total number of {len(df_temperature_by_city.columns.values)} columns, which have the following names: {df_temperature_by_city.columns.values}')

The file has a total number of 7 columns, which have the following names: ['dt' 'AverageTemperature' 'AverageTemperatureUncertainty' 'City'
 'Country' 'Latitude' 'Longitude']


In [None]:
df_temperature_by_city.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   dt                             object 
 1   AverageTemperature             float64
 2   AverageTemperatureUncertainty  float64
 3   City                           object 
 4   Country                        object 
 5   Latitude                       object 
 6   Longitude                      object 
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


##### Date variable


In [None]:
date_variable = ["dt"]
df_temperature_by_city[date_variable].head(10)

Unnamed: 0,dt
0,1743-11-01
1,1743-12-01
2,1744-01-01
3,1744-02-01
4,1744-03-01
5,1744-04-01
6,1744-05-01
7,1744-06-01
8,1744-07-01
9,1744-08-01


Since we want to work with years and months, we'll create separate columns with this variables.

In [None]:
df_temperature_by_city["dt"] = pd.to_datetime(df_temperature_by_city["dt"])
df_temperature_by_city["Year"] = df_temperature_by_city["dt"].dt.year
df_temperature_by_city["Month"] = df_temperature_by_city["dt"].dt.month
df_temperature_by_city[["Year","Month"]]

Unnamed: 0,Year,Month
0,1743,11
1,1743,12
2,1744,1
3,1744,2
4,1744,3
...,...,...
8599207,2013,5
8599208,2013,6
8599209,2013,7
8599210,2013,8


Let's check how many years we have in the dataset.

In [None]:
df_temperature_by_city["Year"].unique()

array([1743, 1744, 1745, 1746, 1747, 1748, 1749, 1750, 1751, 1752, 1753,
       1754, 1755, 1756, 1757, 1758, 1759, 1760, 1761, 1762, 1763, 1764,
       1765, 1766, 1767, 1768, 1769, 1770, 1771, 1772, 1773, 1774, 1775,
       1776, 1777, 1778, 1779, 1780, 1781, 1782, 1783, 1784, 1785, 1786,
       1787, 1788, 1789, 1790, 1791, 1792, 1793, 1794, 1795, 1796, 1797,
       1798, 1799, 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808,
       1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, 1819,
       1820, 1821, 1822, 1823, 1824, 1825, 1826, 1827, 1828, 1829, 1830,
       1831, 1832, 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841,
       1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852,
       1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863,
       1864, 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872, 1873, 1874,
       1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885,
       1886, 1887, 1888, 1889, 1890, 1891, 1892, 18

In [None]:
# Number of data entries we have per year
columna_analizar = "Year"
temp_by_year = df_temperature_by_city[columna_analizar].value_counts().to_frame().reset_index()
temp_by_year.columns = [columna_analizar, "Count"]
temp_by_year = temp_by_year.sort_values(by=columna_analizar,ascending = False )
temp_by_year.head(10)

Unnamed: 0,Year,Count
172,2013,31590
115,2012,42120
100,2011,42120
99,2010,42120
98,2009,42120
69,2008,42120
70,2007,42120
71,2006,42120
72,2005,42120
73,2004,42120


In [None]:
fig = px.line(temp_by_year, x="Year", y="Count", title='Number of Rows Available by Year')
fig.show()

We will only consider data for years after 1960 since that's what we have available across all datasets.

In [None]:
df_temp_filter = df_temperature_by_city[(df_temperature_by_city["Year"] >= 1960) & (df_temperature_by_city["Year"] < 2022) ]
df_temp_filter["Year"].unique()

array([1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970,
       1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
       1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
       1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013])

In [None]:
# Let's check how rich is our filtered dataset
columna_analizar = "Year"
temp_by_year = df_temp_filter[columna_analizar].value_counts().to_frame().reset_index()
temp_by_year.columns = [columna_analizar, "Count"]
temp_by_year = temp_by_year.sort_values(by=columna_analizar,ascending = False )

fig = px.line(temp_by_year, x="Year", y="Count", title='Number of Rows Available by Year (1960 - 2013)')
fig.show()

##### Temperature Variables

Let's check the average global temperature per year.

In [None]:
temp_by_year = df_temp_filter.groupby(['Year'])['AverageTemperature'].mean().to_frame().reset_index()
temp_by_year.head(10)

Unnamed: 0,Year,AverageTemperature
0,1960,17.900934
1,1961,18.029934
2,1962,17.727212
3,1963,17.738178
4,1964,17.655023
5,1965,17.62381
6,1966,17.971467
7,1967,17.752272
8,1968,17.639048
9,1969,17.683529


The following graph shows that in rough terms, global temperature has been on the raise since 1960. 

In [None]:
fig = px.line(temp_by_year, x="Year", y="AverageTemperature", title='World Temperature (1960 - 2013)')
fig.show()

##### Missing values

In [None]:
df_temp_filter["Latitude_copy"]=df_temp_filter["Latitude"].str.replace("N", '/N')
df_temp_filter["Latitude_copy"]=df_temp_filter["Latitude_copy"].str.replace("S", '/S')

df_temp_filter[['Lat_number','Lat_position']]=df_temp_filter.Latitude_copy.str.split('/',expand=True)
df_temp_filter['Lat_number']=pd.to_numeric(df_temp_filter['Lat_number'])

conditionlist = [
    (df_temp_filter['Lat_number'] >= 15) & (df_temp_filter['Lat_position'] =="N"),
    (df_temp_filter['Lat_number'] < 15),
    (df_temp_filter['Lat_number'] >= 15) & (df_temp_filter['Lat_position'] =="S")]
choicelist = ['North', 'Inter', 'South']
df_temp_filter['Lat_Range'] = np.select(conditionlist, choicelist, default='Not Specified')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

The table above shows that roughly 0.14% of AverageTemperature and AverageTemperatureUncertainty is missing. The number of missing values is so low that the heatmap below doesn't show them.

##### Final Dataset After Cleaning

In [None]:
temp_by_lat_year = df_temp_filter.groupby(['Year','Lat_Range'])['AverageTemperature'].mean().to_frame().reset_index()
temp_by_lat_year.head(10)

Unnamed: 0,Year,Lat_Range,AverageTemperature
0,1960,Inter,25.528154
1,1960,North,15.440981
2,1960,South,18.764304
3,1961,Inter,25.400037
4,1961,North,15.598767
5,1961,South,19.239617
6,1962,Inter,25.384054
7,1962,North,15.255755
8,1962,South,18.607462
9,1963,Inter,25.490393


In [None]:
fig = px.line(temp_by_lat_year, x="Year", y="AverageTemperature", color='Lat_Range', title='Temperature by Year')
fig.update_layout(modebar_add=["v1hovermode", "toggleSpikeLines"])

fig.show()

In [None]:
temp_by_long_year = df_temperature_by_city["Longitude"].value_counts().to_frame().reset_index()
#temp_by_long_year.columns = [columna_analizar, "Count"]
#temp_by_long_year = temp_by_year.sort_values(by=columna_analizar,ascending = False )
temp_by_long_year.head(10)

Unnamed: 0,index,Longitude
0,139.23E,129600
1,88.25E,88842
2,136.22E,86940
3,0.00W,83557
4,46.31W,82878
5,5.26E,64780
6,6.34E,64780
7,107.84E,61155
8,106.55E,58890
9,1.36W,51824


In [None]:
df_temp_filter["Longitude_copy"]=df_temp_filter["Longitude"].str.replace("E", '/E')
df_temp_filter["Longitude_copy"]=df_temp_filter["Longitude_copy"].str.replace("W", '/W')

df_temp_filter[['Lon_number','Lon_position']]=df_temp_filter.Longitude_copy.str.split('/',expand=True)
df_temp_filter['Lon_number']=pd.to_numeric(df_temp_filter['Lon_number'])
df_temp_filter.head()
conditionlist1 = [
    (df_temp_filter['Lon_number'] <= 90) & (df_temp_filter['Lon_position'] =="E"),
    (df_temp_filter['Lon_number'] > 90) & (df_temp_filter['Lon_position'] =="E"),
    (df_temp_filter['Lon_number'] <= 90) & (df_temp_filter['Lon_position'] =="W"),
    (df_temp_filter['Lon_number'] > 90) & (df_temp_filter['Lon_position'] =="W")]
choicelist1 = ['0-90E', '90-180E', '0-90W','90-180W']
df_temp_filter['Lon_Range'] = np.select(conditionlist1, choicelist1, default='Not Specified')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [None]:
temp_by_lon_year = df_temp_filter.groupby(['Year','Lon_Range'])['AverageTemperature'].mean().to_frame().reset_index()
temp_by_lon_year.head(10)

Unnamed: 0,Year,Lon_Range,AverageTemperature
0,1960,0-90E,17.638239
1,1960,0-90W,18.677821
2,1960,90-180E,18.057165
3,1960,90-180W,16.61642
4,1961,0-90E,17.668844
5,1961,0-90W,18.989403
6,1961,90-180E,18.242648
7,1961,90-180W,16.58454
8,1962,0-90E,17.481615
9,1962,0-90W,18.424832


In [None]:
fig = px.line(temp_by_lon_year, x="Year", y="AverageTemperature", color='Lon_Range', title='Temperature by Year')
fig.update_layout(modebar_add=["v1hovermode", "toggleSpikeLines"])

fig.show()

****

In [None]:
conditionlist2 = [
    (df_temp_filter['Lat_number'] >= 15) & (df_temp_filter['Lat_position'] =="N") & (df_temp_filter['Lon_number'] <= 180) & (df_temp_filter['Lon_number'] > 30) & (df_temp_filter['Lon_position'] =="E"),
    (df_temp_filter['Lat_number'] < 15) & (df_temp_filter['Lon_number'] <= 180) & (df_temp_filter['Lon_number'] > 30) & (df_temp_filter['Lon_position'] =="E"),
    (df_temp_filter['Lat_number'] >= 15) & (df_temp_filter['Lat_position'] =="S") & (df_temp_filter['Lon_number'] <= 180) & (df_temp_filter['Lon_number'] > 30) & (df_temp_filter['Lon_position'] =="E"),
    
    
    (df_temp_filter['Lat_number'] >= 15) & (df_temp_filter['Lat_position'] =="N") & (df_temp_filter['Lon_number'] <= 30) & (df_temp_filter['Lon_number'] >= 0) & (df_temp_filter['Lon_position'] =="E"),
    (df_temp_filter['Lat_number'] >= 15) & (df_temp_filter['Lat_position'] =="N") & (df_temp_filter['Lon_number'] <= 60) & (df_temp_filter['Lon_number'] >= 0) & (df_temp_filter['Lon_position'] =="W"),
    
    (df_temp_filter['Lat_number'] < 15) & (df_temp_filter['Lon_number'] <= 30) & (df_temp_filter['Lon_number'] >= 0) & (df_temp_filter['Lon_position'] =="E"),
    (df_temp_filter['Lat_number'] < 15) & (df_temp_filter['Lon_number'] <= 60) & (df_temp_filter['Lon_number'] >= 0) & (df_temp_filter['Lon_position'] =="W"),

    (df_temp_filter['Lat_number'] >= 15) & (df_temp_filter['Lat_position'] =="S") & (df_temp_filter['Lon_number'] <= 30) & (df_temp_filter['Lon_number'] >= 0) & (df_temp_filter['Lon_position'] =="E"),
    (df_temp_filter['Lat_number'] >= 15) & (df_temp_filter['Lat_position'] =="S") & (df_temp_filter['Lon_number'] <= 60) & (df_temp_filter['Lon_number'] >= 0) & (df_temp_filter['Lon_position'] =="W"),
    
    (df_temp_filter['Lat_number'] >= 15) & (df_temp_filter['Lat_position'] =="N") & (df_temp_filter['Lon_number'] <= 180) & (df_temp_filter['Lon_number'] > 60) & (df_temp_filter['Lon_position'] =="W"),
    (df_temp_filter['Lat_number'] < 15) & (df_temp_filter['Lon_number'] <= 180) & (df_temp_filter['Lon_number'] > 60) & (df_temp_filter['Lon_position'] =="W"),
    (df_temp_filter['Lat_number'] >= 15) & (df_temp_filter['Lat_position'] =="S") & (df_temp_filter['Lon_number'] <= 180) & (df_temp_filter['Lon_number'] > 60) & (df_temp_filter['Lon_position'] =="W")]

choicelist2 = ['NorthAme', 'CentralAme', 'SouthAme', 'Europ_Afri1', 'Europ_Afri1', 'Afri2', 'Afri2', 'Afri3','Afri3', 'Asia1', 'Asia2', 'Austr']
df_temp_filter['Range'] = np.select(conditionlist2, choicelist2, default='Not Specified')


In [None]:
temp_by_range_year = df_temp_filter.groupby(['Year','Range'])['AverageTemperature'].mean().to_frame().reset_index()
temp_by_range_year.head(10)

In [None]:
fig = px.line(temp_by_range_year, x="Year", y="AverageTemperature", color='Range', title='Temperature by Year')
fig.update_layout(modebar_add=["v1hovermode", "toggleSpikeLines"])
fig.show()

In [None]:
# Loading the data file as pandas  dataframe
filename = raw_data_path + '/temp_by_country.csv'
df_temperature_by_country = pd.read_csv(filename)

print(f'This is the file fo this section {filename}')
df_temperature_by_country.head(5)

This is the file fo this section /drive/MyDrive/DS4A - Team 40/00_DATA/00_RAW_DATA/temp_by_country.csv


Unnamed: 0,year,month,mean_temp,ISO
0,1960,1,26.9,ABW
1,1961,1,26.7,ABW
2,1962,1,26.6,ABW
3,1963,1,26.5,ABW
4,1964,1,26.6,ABW


In [None]:
df_disaster.head()

Unnamed: 0,Year,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,ISO,Region,Continent,...,Longitude,River Basin,Start Month,End Year,End Month,Total Deaths,Total Affected,Total Damages ('000 US$),"Total Damages, Adjusted ('000 US$)",Continent_max
0,1960,Geophysical,Earthquake,Tsunami,,,Chile,CHL,South America,Americas,...,-73.407,,5.0,1960,5.0,6000.0,2003000.0,550000.0,5039165,South America
1,1960,Meteorological,Storm,Tropical cyclone,,Donna,Anguilla,AIA,Caribbean,Americas,...,,,9.0,1960,9.0,5.0,1250.0,35000.0,320674,South America
2,1960,Meteorological,Storm,Tropical cyclone,,Donna,Netherlands Antilles,ANT,Caribbean,Americas,...,,,9.0,1960,9.0,,,,0,South America
3,1960,Meteorological,Storm,Tropical cyclone,,Donna,Antigua and Barbuda,ATG,Caribbean,Americas,...,,,9.0,1960,9.0,2.0,,,0,South America
4,1960,Meteorological,Storm,Tropical cyclone,,,Bangladesh,BGD,Southern Asia,Asia,...,,,10.0,1960,10.0,3000.0,,,0,Asia


In [None]:
df_disaster['ISO_copy']= df_disaster['ISO']
df_disaster['ISO']= df_disaster['ISO'].replace(['SUN','HKG','SCG','DFR','DDR','YUG','ANT','AZO','BLM','CSK',
                                       'IMN','MAF','SPI','SXM','YMD','YMN'],['RUS','CHI','SRB','DEU',
                                       'DEU','SRB','NLD','PRT','AIA','CZE','GBR','AIA','CAF','AIA','YEM','YEM',])

In [None]:
df_disaster['month']= df_disaster['End Month']
df_disaster['year']= df_disaster['End Year']

In [None]:
df_disaster_temp = pd.merge(df_disaster, df_temperature_by_country, how="left", on=["year", "month","ISO"])
df_disaster_temp.shape

(15373, 29)

In [None]:
df_disaster_temp.info()
df_disaster_temp.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 15373 entries, 0 to 15372
Data columns (total 29 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Year                                15373 non-null  int64  
 1   Disaster Subgroup                   15373 non-null  object 
 2   Disaster Type                       15373 non-null  object 
 3   Disaster Subtype                    12385 non-null  object 
 4   Disaster Subsubtype                 1069 non-null   object 
 5   Event Name                          3804 non-null   object 
 6   Country                             15373 non-null  object 
 7   ISO                                 15373 non-null  object 
 8   Region                              15373 non-null  object 
 9   Continent                           15373 non-null  object 
 10  Location                            13855 non-null  object 
 11  Origin                              3850 

Unnamed: 0,Year,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,ISO,Region,Continent,...,End Month,Total Deaths,Total Affected,Total Damages ('000 US$),"Total Damages, Adjusted ('000 US$)",Continent_max,ISO_copy,month,year,mean_temp
0,1960,Geophysical,Earthquake,Tsunami,,,Chile,CHL,South America,Americas,...,5.0,6000.0,2003000.0,550000.0,5039165,South America,CHL,5.0,1960,6.42
1,1960,Meteorological,Storm,Tropical cyclone,,Donna,Anguilla,AIA,Caribbean,Americas,...,9.0,5.0,1250.0,35000.0,320674,South America,AIA,9.0,1960,26.4
2,1960,Meteorological,Storm,Tropical cyclone,,Donna,Netherlands Antilles,NLD,Caribbean,Americas,...,9.0,,,,0,South America,ANT,9.0,1960,13.54
3,1960,Meteorological,Storm,Tropical cyclone,,Donna,Antigua and Barbuda,ATG,Caribbean,Americas,...,9.0,2.0,,,0,South America,ATG,9.0,1960,26.7
4,1960,Meteorological,Storm,Tropical cyclone,,,Bangladesh,BGD,Southern Asia,Asia,...,10.0,3000.0,,,0,Asia,BGD,10.0,1960,27.09


In [None]:
temp_by_year1 = df_disaster_temp.groupby(['year'])['mean_temp'].mean().to_frame().reset_index()
temp_by_year1.head(10)

temp_by_year2=(df_disaster_temp.groupby(['year']).agg({'mean_temp': ['mean', 'count']})).reset_index()
temp_by_year2.head(10)


Unnamed: 0_level_0,year,mean_temp,mean_temp
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
0,1960,22.360294,34
1,1961,19.984,25
2,1962,18.595556,27
3,1963,20.275526,38
4,1964,19.094528,53
5,1965,18.045,58
6,1966,20.577746,71
7,1967,20.644028,72
8,1968,19.9975,72
9,1969,19.690161,62


In [None]:
Biological_df1 = df_disaster_temp[df_disaster_temp["Disaster Subgroup"]=='Biological']

In [None]:
temp_by_year_cont_bio = Biological_df1.groupby(['year','Continent'])['mean_temp'].mean().to_frame().reset_index()
temp_by_year_cont_bio.head(10)

Unnamed: 0,year,Continent,mean_temp
0,1963,Americas,23.59
1,1963,Asia,9.28
2,1964,Americas,24.04
3,1964,Asia,9.86
4,1965,Africa,27.65
5,1965,Americas,24.01
6,1965,Asia,14.08
7,1966,Africa,17.385
8,1967,Americas,22.79
9,1967,Asia,19.593333
