# Preparing the data for Tableau

The intended visualization is a dual-axis bar and line chart. The bars represent the difference between consecutive five-year average global temperatures, highlighting the fact that this difference tends to be positive—in other words, that Earth's temperature is consistently rising. The plot is inspired by a well-known figure from the GIEC (the IPCC; see the project report).
 
The line chart displays the average temperature over half-decades. The 0°C reference is omitted, as the focus is on temperature evolution rather than absolute values.

## Importing the libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

## Loading the data
Missing values are replaced by the previous valid value (forward fill).

In [4]:
# Load the monthly global temperature records.
global_temp = pd.read_csv('Dataset/GlobalTemperatures.csv')
# Forward-fill gaps with the previous valid value. DataFrame.ffill() is the
# supported replacement for the deprecated fillna(method='ffill').
global_temp_filled = global_temp.ffill()
global_temp_filled

  global_temp_filled = global_temp.fillna(method='ffill')


Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.490,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,
...,...,...,...,...,...,...,...,...,...
3187,2015-08-01,14.755,0.072,20.699,0.110,9.005,0.170,17.589,0.057
3188,2015-09-01,12.999,0.079,18.845,0.088,7.199,0.229,17.049,0.058
3189,2015-10-01,10.801,0.102,16.450,0.059,5.232,0.115,16.290,0.062
3190,2015-11-01,7.433,0.119,12.892,0.093,2.157,0.106,15.252,0.063


## Defining a few useful functions

In [5]:
def retrieveJulyData(data):
    """Return every 12th element of ``data``, starting at position 6.

    For a monthly series whose first element is a January value and which has
    no missing rows, position 6 of each 12-element cycle is July.

    Parameters
    ----------
    data : iterable
        Values in chronological order.

    Returns
    -------
    list
        The elements at positions 6, 18, 30, ...
    """
    # Zero-based offset of the selected element within each 12-element cycle.
    july_offset = 6
    return list(data)[july_offset::12]

def calculateMovingAverage(temp, window_size):
    """Average ``temp`` over consecutive, non-overlapping windows.

    Despite the name, the windows do not slide: each value contributes to
    exactly one average. Trailing values that do not fill a complete window
    are ignored.

    Parameters
    ----------
    temp : sequence of float
        Values to average; must support ``len`` and slicing.
    window_size : int
        Number of values per window.

    Returns
    -------
    tuple
        ``(average_values, num_period)``: the list of window means and the
        number of complete windows.
    """
    num_period = len(temp) // window_size
    # First index of every complete window.
    window_starts = range(0, num_period * window_size, window_size)
    average_values = [
        np.mean(temp[start:start + window_size]) for start in window_starts
    ]
    return average_values, num_period

def calculateDifference(temp):
    """Return the change between each value and the one before it.

    The first entry is ``0`` because the first value has no predecessor, so
    the result always has the same length as the input. An empty input yields
    an empty list (rather than ``[0]``), which keeps that length guarantee.

    Parameters
    ----------
    temp : iterable of float
        Values in chronological order.

    Returns
    -------
    list
        ``[0, temp[1] - temp[0], temp[2] - temp[1], ...]``.
    """
    values = list(temp)
    if not values:
        return []
    # Pair every value with its predecessor.
    return [0] + [current - previous for previous, current in zip(values, values[1:])]

def plotDifference(diff):
    """Draw ``diff`` as a bar chart coloured by sign and magnitude.

    Values ``>= 0`` use the ``Reds`` colormap and all other values use
    ``Blues``. Within each group the shade is scaled between the smallest and
    largest absolute value of that group.

    Parameters
    ----------
    diff : sequence of float
        Bar heights; bar ``i`` is drawn at x position ``i``.
    """
    cmap_pos = plt.cm.Reds
    cmap_neg = plt.cm.Blues

    magnitude_pos = np.array([d for d in diff if d >= 0])
    magnitude_neg = np.abs(np.array([d for d in diff if d < 0]))

    def _magnitude_norm(magnitudes):
        # min()/max() raise on an empty array, so a sign that never occurs
        # gets a fixed 0..1 range instead of crashing the whole plot.
        if magnitudes.size == 0:
            return mcolors.Normalize(vmin=0.0, vmax=1.0)
        return mcolors.Normalize(vmin=magnitudes.min(), vmax=magnitudes.max())

    norm_red = _magnitude_norm(magnitude_pos)
    norm_blue = _magnitude_norm(magnitude_neg)

    colors = [
        cmap_pos(norm_red(d)) if d >= 0 else cmap_neg(norm_blue(abs(d)))
        for d in diff
    ]
    plt.bar(range(len(diff)), diff, color=colors)

def repeatAverageValues(data, period):
    """Repeat each element of ``data`` ``period`` times, preserving order.

    Parameters
    ----------
    data : iterable
        Values to repeat.
    period : int
        Number of consecutive copies of each value.

    Returns
    -------
    list
        ``[data[0]] * period + [data[1]] * period + ...``.
    """
    return [value for value in data for _ in range(period)]

In [6]:
# Analysis window and the number of years in each averaging block.
starting_year = 1750
ending_year = 2015
period = 5

# The monthly series starts in January 1750, so every 12th row from row 6 is a July value.
temp = global_temp_filled['LandAverageTemperature']
july_temp = retrieveJulyData(temp)

# Mean July temperature of each consecutive `period`-year block.
average_temp, num_period = calculateMovingAverage(july_temp, period)

# Year-over-year change of the July values.
diff = calculateDifference(july_temp)

# Block-over-block change of the averages.
average_diff = calculateDifference(average_temp)

# Repeat each block value `period` times so there is one entry per year.
repeated_average_temp = repeatAverageValues(average_temp, period)
repeated_average_diff = repeatAverageValues(average_diff, period)

# Labels and member years are stepped by `period` (not a literal 5) so they
# stay aligned with the averages if the block length is changed.
half_decades = [f"{start}-{start+period}" for start in range(starting_year, ending_year, period)]
years = [list(range(start, start+period)) for start in range(starting_year, ending_year, period)]

## Reformatting for Tableau

In [None]:
# Yearly July series exported for Tableau. The year range reuses the analysis
# constants instead of repeating the literals. range() stops before
# ending_year, so the last July value is dropped to keep both columns the
# same length.
df3 = pd.DataFrame({
    'Year': range(starting_year, ending_year),
    'Temperature': july_temp[:-1],
})
df3.to_csv('Dataset/World_temp_july.csv')

In [177]:
# Display the frame that was written to the CSV file.
df3

Unnamed: 0,Year,Temperature
0,1750,15.868
1,1751,13.827
2,1752,8.265
3,1753,15.092
4,1754,14.681
...,...,...
260,2010,15.213
261,2011,15.482
262,2012,15.076
263,2013,15.003
