# Predicting Daily Land Average Temperature

##  Load the Data
Load the daily land-surface average temperature anomaly data provided by Berkeley Earth, covering 1880 to 2022.

In [None]:
import pandas as pd

# Source file: Berkeley Earth daily land-average temperature anomalies (plain text).
url = "https://berkeley-earth-temperature.s3.us-west-1.amazonaws.com/Global/Complete_TAVG_daily.txt"

# The file's own header lives inside its '%' comment lines, so the
# column names are assigned manually.
column_names = ["Date Number", "Year", "Month", "Day", "Day of Year", "Anomaly"]

# Read the whitespace-delimited data straight from the URL:
#   - comment="%" skips every comment line starting with '%'
#   - header=None because the header is part of the skipped comments
df = pd.read_csv(
    url,
    sep=r"\s+",
    comment="%",
    header=None,
)
df.columns = column_names

# df.to_csv("../data/raw.csv", index=False)

In [None]:
# Preview the loaded frame (the notebook renders a cell's last expression).
df

## Data Preprocessing

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# df = df.drop(columns=['Date Number'])

In [None]:
BASELINE_TEMP = 8.59  # Jan 1951–Dec 1980 land-average temperature in Celsius

# Turn each anomaly into an absolute temperature by adding the baseline.
df['Temperature'] = df['Anomaly'].add(BASELINE_TEMP)

In [None]:
# Month number (1-12) -> English month name, in calendar order.
month_dict = dict(
    enumerate(
        (
            'January',
            'February',
            'March',
            'April',
            'May',
            'June',
            'July',
            'August',
            'September',
            'October',
            'November',
            'December',
        ),
        start=1,
    )
)

# Add a readable month label by mapping the numeric Month column through month_dict.
df['Month_Name'] = df['Month'].map(month_dict)

In [None]:
# Display the frame again, now including the Temperature and Month_Name columns.
df

## Exploratory Data Analysis (EDA)

In [None]:
# Print the frame summary produced by DataFrame.info().
df.info()

In [None]:
# Show descriptive statistics for the frame's columns.
df.describe()

In [None]:
# Flag, per column, whether any value is missing.
df.isna().any()

In [None]:
# Month names in calendar order, taken from the values of month_dict.
month_order = [*month_dict.values()]
month_order

In [None]:
# Convert Month_Name to an ordered categorical whose categories follow month_order.
df['Month_Name'] = df['Month_Name'].astype(
    pd.CategoricalDtype(categories=month_order, ordered=True)
)
df['Month_Name']

In [None]:
# Train/test split by year: rows from `cutoff` onward form the test set,
# which is held back to score the model on examples it has not seen.
cutoff = 2018
test_df = df.loc[df['Year'].ge(cutoff)]
test_df

In [None]:
# Everything before the cutoff year is training data.
train_df = df.loc[df['Year'].lt(cutoff)]
train_df

In [None]:
import altair as alt

# Enable the VegaFusion data transformer to simplify working with large datasets.
alt.data_transformers.enable('vegafusion')

# Ignore all warnings coming from the altair package so the PDF rendering stays clean
# (the warnings were validated before being suppressed).
# Resource for the implementation:
# https://stackoverflow.com/questions/3920502/how-to-suppress-a-third-party-warning-using-warnings-filterwarnings
import warnings
warnings.filterwarnings('ignore', module='altair')

# Shared chart dimensions, defined once and unpacked into .properties() (D.R.Y.)
small_plot_size = dict(width=500, height=300)
facet_plot_size = dict(width=250, height=200)

In [None]:
# Line chart of every daily temperature against its year.
(
    alt.Chart(train_df)
    .mark_line()
    .encode(alt.X('Year:T'), alt.Y('Temperature:Q'))
    .properties(**small_plot_size)
)

# The chart suffers from overplotting (many daily values per year);
# taking the mean of the temperature should read better.

In [None]:
# Same daily temperatures, plotted against the "Date Number" column instead of Year.
temp_plot = (
    alt.Chart(train_df)
    .mark_line()
    .encode(alt.X('Date Number:Q'), alt.Y('Temperature:Q'))
    .properties(**small_plot_size)
)

temp_plot

In [None]:
# Mean temperature per year, aggregated by Altair, to avoid the overplotting above.
temp_plot = (
    alt.Chart(train_df)
    .mark_line()
    .encode(alt.X('Year:T'), alt.Y('mean(Temperature)'))
    .properties(**small_plot_size)
)

temp_plot

In [None]:
# Daily temperature by year, one panel per month, laid out two panels per row.
temp_plot = (
    alt.Chart(train_df)
    .mark_line()
    .encode(alt.X('Year:T'), alt.Y('Temperature:Q'))
    .properties(**facet_plot_size)
    .facet(facet='Month_Name', columns=2)
)

temp_plot

# Still suffering from overplotting; the mean temperature per month
# should be plotted instead.

In [None]:
# Mean temperature for every (Year, Month_Name) pair in the training data.
# Month_Name is an ordered categorical, so observed=True keeps only the
# year/month combinations that actually occur; it also avoids the pandas
# FutureWarning about the changing default of `observed`.
mean_per_month = (
    train_df.groupby(['Year', 'Month_Name'], observed=True)['Temperature']
    .mean()
    .reset_index()
)

# One panel per month (two per row) showing the monthly mean over the years.
temp_plot = alt.Chart(mean_per_month).mark_line().encode(
    x = 'Year:T',
    y = 'Temperature'
).properties(**facet_plot_size).facet('Month_Name', columns=2)

temp_plot

In [None]:
# Mean temperature per month as rect marks; the y scale is not forced to include zero.
temp_plot = (
    alt.Chart(train_df)
    .mark_rect()
    .encode(
        alt.X('Month_Name'),
        alt.Y('mean(Temperature)', scale=alt.Scale(zero=False)),
    )
    .properties(**small_plot_size)
)

temp_plot

In [None]:
# Scatter of all daily temperatures by year; small, semi-transparent points
# to reduce overplotting. Later cells reuse this chart as a base layer.
temp_points = (
    alt.Chart(train_df)
    .mark_point(opacity=0.5, size=1)
    .encode(x='Year:T', y='Temperature:Q')
    .properties(**small_plot_size)
)

temp_points

In [None]:
# One point per year showing the mean temperature (aggregated by Altair).
temp_points_mean = (
    alt.Chart(train_df)
    .mark_point(opacity=1, size=1)
    .encode(x='Year:T', y='mean(Temperature)')
    .properties(**small_plot_size)
)

temp_points_mean

In [None]:
# Daily points with a red LOESS-smoothed line (Temperature on Year) layered on top.
alt.layer(
    temp_points,
    temp_points.mark_line(size=3, color='red')
    .transform_loess(on='Year', loess='Temperature')
    .properties(**small_plot_size),
)

In [None]:
# Daily points with a red regression line (Temperature on Year) layered on top.
alt.layer(
    temp_points,
    temp_points.mark_line(size=3, color='red')
    .transform_regression(on='Year', regression='Temperature')
    .properties(**small_plot_size),
)

In [None]:
# Yearly mean temperature, computed in pandas (one row per year).
mean_per_year = (
    train_df.groupby('Year')['Temperature']
    .mean()
    .reset_index()
)

# Scatter of the yearly means; the y scale is not forced to include zero.
temp_points_avg = (
    alt.Chart(mean_per_year)
    .mark_point(size=2)
    .encode(
        x='Year:T',
        y=alt.Y('Temperature:Q', scale=alt.Scale(zero=False)),
    )
    .properties(**small_plot_size)
)
temp_points_avg



In [None]:
# Yearly means with a red regression line (Temperature on Year) layered on top.
reg = alt.layer(
    temp_points_avg,
    temp_points_avg.mark_line(size=2, color='red').transform_regression(
        on='Year',
        regression='Temperature',
    ),
)
reg

In [None]:
# Rolling-average overlay, adapted from the 5331 Lecture 5 notes and the
# Altair gallery example (which uses a 30-day window):
# https://altair-viz.github.io/gallery/scatter_with_rolling_mean.html

mean_per_year = (
    train_df.groupby('Year')['Temperature']
    .mean()
    .reset_index()
)

# frame=[-1, 1] averages the previous, current and next yearly mean,
# i.e. a three-year moving average.
roll_line = (
    alt.Chart(mean_per_year)
    .mark_line(color='red', size=2)
    .transform_window(
        rolling_mean='mean(Temperature)',
        frame=[-1, 1],
    )
    .encode(alt.X('Year:T'), alt.Y('rolling_mean:Q'))
    .properties(**small_plot_size)
)

# A day-based moving average is not easy to plot with this data because
# "Day of Year" and "Day" reset every new year or month. Daily averages may
# be too noisy anyway; the ten-year rolling average in the next cell is smoother.
alt.layer(temp_points, roll_line)

In [None]:
mean_per_year = (
    train_df.groupby('Year')['Temperature']
    .mean()
    .reset_index()
)

# frame=[-5, 4] averages ten consecutive yearly means (five before the
# current year through four after), i.e. a ten-year moving average.
roll_line = (
    alt.Chart(mean_per_year)
    .mark_line(color='red', size=2)
    .transform_window(
        rolling_mean='mean(Temperature)',
        frame=[-5, 4],
    )
    .encode(alt.X('Year:T'), alt.Y('rolling_mean:Q'))
    .properties(**small_plot_size)
)

# A day-based moving average is not easy to plot with this data because
# "Day of Year" and "Day" reset every new year or month.
alt.layer(temp_points, roll_line)

In [None]:
# Maximum temperature per month (orange rect marks).
temp_max = (
    alt.Chart(train_df)
    .mark_rect(color='orange', opacity=1)
    .encode(alt.X('Month_Name'), alt.Y('max(Temperature)'))
    .properties(**small_plot_size)
)

# Minimum temperature per month (default mark colour).
temp_min = (
    alt.Chart(train_df)
    .mark_rect()
    .encode(alt.X('Month_Name'), alt.Y('min(Temperature)'))
)

# Layer the two so the monthly extremes share one set of axes.
alt.layer(temp_max, temp_min)

In [None]:
# Yearly maximum temperature (orange line).
temp_max = alt.Chart(train_df).mark_line(color='orange').encode(
    x = 'Year:T',
    y = 'max(Temperature)'
).properties(**small_plot_size)

# Yearly minimum temperature (default line colour).
temp_min = alt.Chart(train_df).mark_line().encode(
    x = 'Year:T',
    y = 'min(Temperature)'
)

# Yearly mean temperature (green line).
temp_mean = alt.Chart(train_df).mark_line(color='green').encode(
    x = 'Year:T',
    y = 'mean(Temperature)'
)


# Layer min, mean and max on shared axes. The result is deliberately not
# named `all`, which would shadow Python's builtin all() for the rest of
# the notebook session.
temp_summary = temp_min+temp_mean+temp_max
temp_summary

In [None]:
# A handful of years spread across the training range (the last one is the final training year).
years_selection = [1880, 1920, 1960, 2000, cutoff-1]

# Temperature across the days of the year, one coloured line per selected year.
(
    alt.Chart(train_df.loc[train_df['Year'].isin(years_selection)])
    .mark_line(size=1)
    .encode(
        alt.X('Day of Year'),
        alt.Y('Temperature', scale=alt.Scale(zero=False)),
        alt.Color('Year:N'),
    )
    .properties(**small_plot_size)
)

In [None]:
# A handful of years spread across the training range (the last one is the final training year).
years_selection = [1880, 1920, 1960, 2000, cutoff-1]

# Filter the training rows once and share the subset between both layers,
# instead of repeating the same boolean-mask selection for each chart.
selected_years_df = train_df[train_df['Year'].isin(years_selection)]

# Box plot of the daily temperatures for each selected year.
box = alt.Chart(selected_years_df).mark_boxplot().encode(
    x = 'Year:N',
    y = alt.Y('Temperature').scale(zero=False),
    color = 'Year:N'
)
# Black point marking each selected year's mean temperature.
box_point = alt.Chart(selected_years_df).mark_point(size=2).encode(
    x = 'Year:N',
    y = alt.Y('mean(Temperature)').scale(zero=False),
    color = alt.value('black')
)

comb = box+box_point
comb.properties(**small_plot_size)

In [None]:
# Binned grid of Day of Year against Temperature (up to 100 bins per axis),
# with rects coloured by Year.
(
    alt.Chart(train_df)
    .mark_rect()
    .encode(
        x=alt.X('Day of Year', bin=alt.Bin(maxbins=100)),
        y=alt.Y('Temperature', bin=alt.Bin(maxbins=100)),
        color='Year',
    )
    .properties(width=400, height=300)
)

In [None]:
# Density estimate of temperature for each selected year, drawn as
# overlapping (unstacked) semi-transparent areas.
temp_density = (
    alt.Chart(train_df[train_df['Year'].isin(years_selection)])
    .transform_density(
        'Temperature',
        groupby=['Year'],
        as_=['Temperature', 'density'],
    )
    .mark_area(opacity=0.6)
    .encode(
        x=alt.X('Temperature', axis=alt.Axis(format='~s')),
        y=alt.Y('density:Q', stack=False),
        color=alt.Color(
            'Year:N',
            legend=alt.Legend(
                orient='right',
                title='Year',
                direction='vertical',
            ),
        ),
    )
    .properties(
        **small_plot_size,
        title = 'Annual Global Average Temperature Distributions',
    )
)
temp_density