Analyse de l’impact du changement climatique à travers l’Open Data

Phase 1 — Recherche et acquisition de données

Analyse et Exploration des jeux de données

In [1]:
# librairies necessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Temperature-change dataset: read the CSV and preview its first rows.
temp_change = pd.read_csv(filepath_or_buffer='temperature_change.csv')
temp_change.head(n=5)

Unnamed: 0,Domain Code,Domain,Area,Element,Months,Year,Unit,Value,Flag,Flag Description
0,ET,Temperature change on land,Afghanistan,Temperature change,January,1961,°c,0.745,E,Estimated value
1,ET,Temperature change on land,Afghanistan,Temperature change,January,1962,°c,0.015,E,Estimated value
2,ET,Temperature change on land,Afghanistan,Temperature change,January,1963,°c,2.706,E,Estimated value
3,ET,Temperature change on land,Afghanistan,Temperature change,January,1964,°c,-5.25,E,Estimated value
4,ET,Temperature change on land,Afghanistan,Temperature change,January,1965,°c,1.854,E,Estimated value


In [3]:
# Count the missing values in each column of the temperature table.
temp_change.isna().sum()

Domain Code             0
Domain                  0
Area                    0
Element                 0
Months                  0
Year                    0
Unit                    0
Value               10260
Flag                    0
Flag Description        0
dtype: int64

In [None]:
# Drop every row whose 'Value' is missing (the measurement itself is absent).
# The label is passed as a list: a bare string for `subset` is only accepted by
# newer pandas releases, while a list works everywhere.
temp_change = temp_change.dropna(axis=0, subset=['Value'])

# Re-check the per-column missing-value counts after the drop.
temp_change.isnull().sum()

In [16]:
# Natural-disasters dataset: read the Excel workbook and preview its first rows.
nat_disaster = pd.read_excel(io='natural_disasters.xlsx')
nat_disaster.head(n=5)

Unnamed: 0,DisNo.,Classification Key,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,ISO,Country,Subregion,Region,Magnitude,Magnitude Scale,Start Year,Start Month,Total Deaths,No. Homeless,Total Affected,Total Damage ('000 US$),CPI
0,1960-0001-BGD,nat-hyd-flo-flo,Natural,Hydrological,Flood,Flood (General),BGD,Bangladesh,Southern Asia,Asia,,Km2,1960,,10000.0,,,,9.706216
1,1960-0002-IRN,nat-geo-ear-gro,Natural,Geophysical,Earthquake,Ground movement,IRN,Iran (Islamic Republic of),Southern Asia,Asia,6.0,Moment Magnitude,1960,4.0,480.0,,,20000.0,9.706216
2,1960-0007-PER,nat-geo-ear-gro,Natural,Geophysical,Earthquake,Ground movement,PER,Peru,Latin America and the Caribbean,Americas,7.8,Moment Magnitude,1960,1.0,63.0,,200.0,,9.706216
3,1960-0010-MAR,nat-geo-ear-gro,Natural,Geophysical,Earthquake,Ground movement,MAR,Morocco,Northern Africa,Africa,5.9,Moment Magnitude,1960,2.0,13100.0,,,120000.0,9.706216
4,1960-0012-PHL,nat-met-sto-tro,Natural,Meteorological,Storm,Tropical cyclone,PHL,Philippines,South-eastern Asia,Asia,,Kph,1960,4.0,56.0,,,2000.0,9.706216


In [None]:
# Count the missing values in each column of the disasters table.
nat_disaster.isna().sum()

In [19]:
# Missing-value handling for the natural-disasters table.
# Row count recorded by the original author: 16061.

# Fill the gaps in each numeric column with the mean of that column within the
# same 'Disaster Type'. A disaster type whose values are all missing has no
# mean, so its gaps remain NaN after this step.
for column in ['Magnitude', 'Total Affected']:
    type_mean = nat_disaster.groupby('Disaster Type')[column].transform('mean')
    nat_disaster[column] = nat_disaster[column].fillna(type_mean)

# Drop rows that lack a CPI or a start month (a row is removed as soon as
# either one is missing). Labels are passed as a list: a bare string for
# `subset` is only accepted by newer pandas releases.
nat_disaster = nat_disaster.dropna(axis=0, subset=['CPI', 'Start Month'])

# Remaining per-column missing-value counts.
nat_disaster.isnull().sum()

DisNo.                         0
Classification Key             0
Disaster Group                 0
Disaster Subgroup              0
Disaster Type                  0
Disaster Subtype               0
ISO                            0
Country                        0
Subregion                      0
Region                         0
Magnitude                   1075
Magnitude Scale             1821
Start Year                     0
Start Month                    0
Total Deaths                4285
No. Homeless               13014
Total Affected                 0
Total Damage ('000 US$)    10138
CPI                            0
dtype: int64

Fusion des datasets

In [24]:
# Align the column names of both tables so they share 'Country' and 'Year',
# the keys used for the merge. rename() is reassigned rather than applied with
# inplace=True so that each frame is explicitly rebound to its renamed version.
nat_disaster = nat_disaster.rename(columns={'Start Year': 'Year', 'Start Month': 'Month'})
temp_change = temp_change.rename(columns={'Area': 'Country'})

# Show the resulting column labels of each table.
print("Colonnes de temp_change :", temp_change.columns)
print("Colonnes de nat_disaster :", nat_disaster.columns)


Colonnes de temp_change : Index(['Domain Code', 'Domain', 'Country', 'Element', 'Months', 'Year', 'Unit',
       'Value', 'Flag', 'Flag Description'],
      dtype='object')
Colonnes de nat_disaster : Index(['DisNo.', 'Classification Key', 'Disaster Group', 'Disaster Subgroup',
       'Disaster Type', 'Disaster Subtype', 'ISO', 'Country', 'Subregion',
       'Region', 'Magnitude', 'Magnitude Scale', 'Year', 'Month',
       'Total Deaths', 'No. Homeless', 'Total Affected',
       'Total Damage ('000 US$)', 'CPI'],
      dtype='object')


In [None]:
# Inner-join the temperature and disaster tables on country and year; only
# (Country, Year) pairs present in both tables are kept.
# NOTE(review): temp_change appears to hold several rows per (Country, Year)
# (one per 'Months' value), so this join likely repeats each disaster once per
# temperature row -- confirm that this is the intended granularity.
merged_data = pd.merge(temp_change, nat_disaster, on=['Country', 'Year'], how='inner')

# Write the merged table to disk. to_csv() returns None when given a path, so
# its result must not be assigned back to merged_data.
merged_data.to_csv('merged_data.csv', index=False)

# Preview the merged table (last expression, so the notebook displays it).
merged_data.head()

