In [6]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path

# Climate Change Analysis

### Import Data

In [7]:
# Load the three temperature CSV files from the Resources folder
# (GlobalLandTemperaturesByCountry.csv, GlobalLandTemperaturesByMajorCity.csv
# and GlobalTemperatures.csv) into one DataFrame each.
country_csv = Path("./Resources/GlobalLandTemperaturesByCountry.csv")
major_city_csv = Path("./Resources/GlobalLandTemperaturesByMajorCity.csv")
global_csv = Path("./Resources/GlobalTemperatures.csv")

temperature_by_country_df = pd.read_csv(country_csv)
temperature_by_major_city_df = pd.read_csv(major_city_csv)
global_temperature_df = pd.read_csv(global_csv)


In [8]:
# Preview both ends of temperature_by_country_df: first five rows, then last five
display(temperature_by_country_df.head(), temperature_by_country_df.tail())

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
577457,2013-05-01,19.059,1.022,Zimbabwe
577458,2013-06-01,17.613,0.473,Zimbabwe
577459,2013-07-01,17.0,0.453,Zimbabwe
577460,2013-08-01,19.759,0.717,Zimbabwe
577461,2013-09-01,,,Zimbabwe


In [9]:
# Preview both ends of temperature_by_major_city_df: first five rows, then last five
display(temperature_by_major_city_df.head(), temperature_by_major_city_df.tail())

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,26.14,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,25.427,1.2,Abidjan,Côte D'Ivoire,5.63N,3.23W


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
239172,2013-05-01,18.979,0.807,Xian,China,34.56N,108.97E
239173,2013-06-01,23.522,0.647,Xian,China,34.56N,108.97E
239174,2013-07-01,25.251,1.042,Xian,China,34.56N,108.97E
239175,2013-08-01,24.528,0.84,Xian,China,34.56N,108.97E
239176,2013-09-01,,,Xian,China,34.56N,108.97E


In [10]:
# Preview both ends of global_temperature_df: first five rows, then last five
display(global_temperature_df.head(), global_temperature_df.tail())

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.49,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,


Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
3187,2015-08-01,14.755,0.072,20.699,0.11,9.005,0.17,17.589,0.057
3188,2015-09-01,12.999,0.079,18.845,0.088,7.199,0.229,17.049,0.058
3189,2015-10-01,10.801,0.102,16.45,0.059,5.232,0.115,16.29,0.062
3190,2015-11-01,7.433,0.119,12.892,0.093,2.157,0.106,15.252,0.063
3191,2015-12-01,5.518,0.1,10.725,0.154,0.287,0.099,14.774,0.062


### Clean Data

In [11]:
# Parse the 'dt' strings into datetimes and store them in a new 'time' column.
# The deprecated infer_datetime_format flag is intentionally not passed:
# pandas 2.0+ warns about it, and the parsed values are the same without it.
global_temperature_df['time'] = pd.to_datetime(global_temperature_df['dt'])

# Use the parsed timestamps as the index so the frame can be resampled and
# sliced by date later on
global_temperature_df = global_temperature_df.set_index('time')

In [12]:
# Keep only the combined land-and-ocean average, dropping rows where it is missing
land_ocean_average = global_temperature_df['LandAndOceanAverageTemperature'].dropna()

# Store it as a one-column DataFrame named 'temperature' so the hvplot labels read cleanly
global_temperature_df = land_ocean_average.to_frame(name='temperature')

In [13]:
# Preview both ends of the cleaned DataFrame: first five rows, then last five
display(global_temperature_df.head(), global_temperature_df.tail())

Unnamed: 0_level_0,temperature
time,Unnamed: 1_level_1
1850-01-01,12.833
1850-02-01,13.588
1850-03-01,14.043
1850-04-01,14.667
1850-05-01,15.507


Unnamed: 0_level_0,temperature
time,Unnamed: 1_level_1
2015-08-01,17.589
2015-09-01,17.049
2015-10-01,16.29
2015-11-01,15.252
2015-12-01,14.774


In [14]:
# Plot the land-and-ocean monthly averages to visualize the raw data.
# The index holds one observation per month, so the title says "Monthly"
# rather than "Daily".
global_temperature_df.hvplot(title = "Land and Ocean Monthly Average Temperatures (1850 - 2016)",
                            xlabel = "Date",
                            ylabel = "Temperature (Celsius)")

As seen in the following screenshot, the seasonal variation within each year is larger than the long-term trend in temperature.  Because of this, we will look at the mean temperature per year and use it to evaluate overall trends. 

![A screenshot depicting the impact of seasons on overall trend analysis.](./Images/land_average_temp_no_average.PNG)

In [15]:
# Smooth out seasonal swings by averaging all observations within each calendar
# year; every result row is labelled with that year's end date.
# NOTE(review): pandas 2.2+ deprecates the 'A' alias in favour of 'YE' - switch
# once the minimum supported pandas version allows it.
yearly_groups = global_temperature_df.resample('A')
global_temperature_year_df = yearly_groups.mean()

# Preview both ends of the yearly DataFrame: first five rows, then last five
display(global_temperature_year_df.head(), global_temperature_year_df.tail())

Unnamed: 0_level_0,temperature
time,Unnamed: 1_level_1
1850-12-31,14.867167
1851-12-31,14.991833
1852-12-31,15.0065
1853-12-31,14.955167
1854-12-31,14.991


Unnamed: 0_level_0,temperature
time,Unnamed: 1_level_1
2011-12-31,15.7695
2012-12-31,15.802333
2013-12-31,15.854417
2014-12-31,15.913
2015-12-31,16.058583


In [16]:
# The number of rows in the yearly DataFrame equals the number of years analyzed.
year_count = len(global_temperature_year_df.index)
print(f"The length of the land_global_temperature_year is {year_count}.")

The length of the land_global_temperature_year is 166.


In [17]:
# Plot the land-and-ocean yearly averages to visualize the trend once the
# seasonal variation has been averaged out.
yearly_plot_labels = {
    "title": "Land and Ocean Yearly Average Temperatures (1850 - 2016)",
    "xlabel": "Date",
    "ylabel": "Temperature (Celsius)",
}
global_temperature_year_df.hvplot(**yearly_plot_labels)

After further research into climate analysis, it was discovered that climate is not typically analyzed in average absolute values, but rather in what are known as "temperature anomalies".  A temperature anomaly is the amount by which a temperature deviates from the "normal average temperature".  The "normal average temperature" is the mean temperature over a reference period that spans at least 30 years.  Dividing this deviation by the standard deviation of the reference period gives what is known as a "standardized anomaly".  Standardized anomalies are how we will analyze trends in our data.   

Different organizations use different reference periods for the normal average temperature; for example, the World Meteorological Organization (WMO) uses 1981-2010, NASA uses 1951-1980, and NOAA uses data spanning the entire 20th century.

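For the sake of this analysis, we will follow NOAA and use the 20th century as our normal average temperature reference.  We select it with the date range 1900-01-01 to 2000-01-01; because each yearly mean is labelled with its year-end date, this picks the 100 yearly values for 1900 through 1999.   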

In [18]:
# Select the yearly means that fall inside the 20th-century reference window.
reference_window = slice('1900-01-01', '2000-01-01')
normal_average_temperature = global_temperature_year_df.loc[reference_window]

# Preview both ends of the reference DataFrame: first five rows, then last five
display(normal_average_temperature.head(), normal_average_temperature.tail())

Unnamed: 0_level_0,temperature
time,Unnamed: 1_level_1
1900-12-31,15.143917
1901-12-31,15.073333
1902-12-31,14.958333
1903-12-31,14.836583
1904-12-31,14.810417


Unnamed: 0_level_0,temperature
time,Unnamed: 1_level_1
1995-12-31,15.637833
1996-12-31,15.524667
1997-12-31,15.713833
1998-12-31,15.826
1999-12-31,15.600333


In [19]:
# Compute the per-column mean and standard deviation of the reference period.
# Both results stay pandas Series (indexed by column name) so they align with
# the 'temperature' column when used in DataFrame arithmetic.
mean = normal_average_temperature.mean()
std_dev = normal_average_temperature.std()

# Report the scalar values for the 'temperature' column. Interpolating the
# whole Series into the f-string would print its repr (column label and dtype)
# in the middle of the sentence.
print(f"The mean value of the normal_average_temperature DataFrame is {mean['temperature']:.6f}.")
print(f"The standard deviation of the normal_average_temperature DataFrame is {std_dev['temperature']:.6f}.")

The mean value of the normal_average_temperature DataFrame is temperature    15.236225
dtype: float64.
The standard devation of the normal_average_temperature DataFrame is temperature    0.223864
dtype: float64.


In [20]:
# Create a new DataFrame containing standardized anomolies normalized to the normal_average_temperature DataFrame.  
standardized_anomoly = (global_temperature_year_df - mean)/std_dev

# Rename column in df
standardized_anomoly = standardized_anomoly.rename(columns={'temperature':'standardized_anomoly'})

# Review the first 5 and last 5 rows of DataFrame
display(standardized_anomoly.head())
display(standardized_anomoly.tail())

Unnamed: 0_level_0,standardized_anomoly
time,Unnamed: 1_level_1
1850-12-31,-1.648582
1851-12-31,-1.091697
1852-12-31,-1.026181
1853-12-31,-1.255487
1854-12-31,-1.095419


Unnamed: 0_level_0,standardized_anomoly
time,Unnamed: 1_level_1
2011-12-31,2.382138
2012-12-31,2.528804
2013-12-31,2.76146
2014-12-31,3.023152
2015-12-31,3.673472


In [21]:
# Plot Land and Ocean Standardized Anomolies to get a visualization of the normalized data.
standardized_anomoly.hvplot(title = "Land and Ocean Standardized Temperature Anomolies (1850 - 2016)",
                            xlabel = "Date",
                            ylabel = "Standardized Anomolies")