In [87]:
import sklearn.model_selection
%reload_ext autoreload
%autoreload 2
import sys
sys.path.append("../")

In [88]:
import IMLearn.learners.regressors.linear_regression
from IMLearn.learners.regressors import PolynomialFitting
from IMLearn.utils import split_train_test

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
# Make plotly's "simple_white" template the default for figures created from here on.
pio.templates.default = "simple_white"

In [89]:
# Load the city temperature records and clean them in a single chain:
#   1. parse 'Date' as datetime, drop rows with missing values and exact duplicates,
#   2. keep only rows whose temperature is above -50,
#   3. cast 'Year' to str and derive the day-of-year from 'Date'.
df = (
    pd.read_csv("../datasets/city_temperature.csv", parse_dates=['Date'])
    .dropna()
    .drop_duplicates()
    .loc[lambda frame: frame['Temp'] > -50]  # remove illegal or absurd temperature measurements
    .assign(
        # str years make plotly color by discrete category instead of a continuous scale
        Year=lambda frame: frame['Year'].astype(str),
        DayOfYear=lambda frame: frame['Date'].dt.dayofyear,
    )
)

In [90]:
# Sanity check of the cleaned frame: 'Date' should be datetime64 and 'Year' a string (object) column.
df.dtypes

Country              object
City                 object
Date         datetime64[ns]
Year                 object
Month                 int64
Day                   int64
Temp                float64
DayOfYear             int64
dtype: object

In [100]:
# Keep only the Israeli samples; later cells reuse `israel_df`.
israel_df = df.loc[df['Country'] == "Israel"]

# One point per sample, colored by year ('Year' is a str column, so the colors are discrete).
fig_1 = px.scatter(
    israel_df,
    x='DayOfYear',
    y='Temp',
    color='Year',
    title='Average Daily Temperature as a Function of Day of the Year',
)
fig_1.update_traces(marker_size=5)
# Static export is left disabled:
# pio.write_image(fig=fig_1, engine='orca', file='./daily_avg_temp_to_day_of_year_israel.png')
fig_1.show()

In [122]:
# Standard deviation of the Israeli temperatures within each calendar month.
month_std_df = (
    israel_df
    .groupby('Month', as_index=False)
    .agg(std=('Temp', 'std'))
)

fig_2 = px.bar(
    month_std_df,
    x='Month',
    y='std',
    title="Standard deviation of temperature per month",
)
fig_2.show()
# Static export is left disabled:
# pio.write_image(fig=fig_2, engine='orca', file='./israel_temp_std_per_month.png')

In [135]:
# Mean and standard deviation of the temperature for every (country, month) pair.
month_temp_df = (
    df
    .groupby(['Country', 'Month'], as_index=False)
    .agg(
        avg=('Temp', 'mean'),
        std=('Temp', 'std'),
    )
)

# One line per country; the monthly std is drawn as vertical error bars.
line_options = dict(
    x='Month',
    y='avg',
    color='Country',
    error_y='std',
    title='Average Temperature per Month',
)
fig_3 = px.line(month_temp_df, **line_options)
fig_3.update_layout(yaxis_title='Average Temperature')
fig_3.show()
pio.write_image(fig=fig_3, engine='orca', file='./avg_temp_by_country.png')

"""
Based on this graph, not all countries share the same seasonal pattern. The easiest to spot is South Africa, whose warm and cold
months are almost exactly "opposite" to those of the other countries (this makes sense: South Africa lies in the Southern Hemisphere, so its seasons are reversed relative to Israel's).
A model fitted on Israel has the best chance of working well on Jordan (not surprising, as it has a similar climate to Israel),
since their curves are very close to each other.
It might do an OK job on The Netherlands - the shape of the curve is very similar, but it is offset by a roughly constant amount (about 10 degrees in every month).
And of course, it is easy to see that it will predict the temperature in South Africa very poorly, for the reason mentioned above.
"""

In [171]:
# Split the Israeli samples into train and test parts.
# NOTE(review): 0.75 is presumably the train proportion, and if split_train_test shuffles
# without a fixed seed the table below will differ between runs -- confirm against IMLearn.utils.
train_X, train_y, test_X, test_y = split_train_test(israel_df.DayOfYear, israel_df.Temp, 0.75)

# The learner is fed NumPy arrays; convert once instead of on every loop iteration.
train_days, train_temps = train_X.to_numpy(), train_y.to_numpy()
test_days, test_temps = test_X.to_numpy(), test_y.to_numpy()

# Fit one polynomial per degree k = 1..10 and record its loss on the test part,
# rounded to 2 decimals.
loss_records = []
for k in range(1, 11):
    fitted = PolynomialFitting(k).fit(train_days, train_temps)
    loss_records.append((k, np.round(fitted.loss(test_days, test_temps), 2)))

loss_arr = pd.DataFrame.from_records(loss_records, columns=['k', 'MSE loss'])
print(loss_arr)

fig_4 = px.bar(loss_arr,
               x='k', y='MSE loss',
               title='MSE loss over ranging k (highest deg.) of polynomial fitting')
# Static export is left disabled:
# pio.write_image(fig=fig_4, engine='orca', file='./israel_k_loss.png')
fig_4.show()

    k  MSE loss
0   1     21.03
1   2      7.32
2   3      3.93
3   4     31.30
4   5     54.56
5   6     84.92
6   7    119.00
7   8    242.53
8   9    241.04
9  10    240.94


In [218]:
# Fit a degree-3 polynomial on *all* Israeli samples; k=3 gave the lowest test loss in the
# degree sweep recorded in this notebook. NumPy arrays are passed, as in that sweep.
israel_fitted = PolynomialFitting(k=3).fit(
    israel_df['DayOfYear'].to_numpy(),
    israel_df['Temp'].to_numpy(),
)
countries = ['South Africa', 'The Netherlands', 'Jordan']

# Loss of the Israel-fitted model on every country's samples, rounded to 2 decimals.
# Iterating over the groups (instead of groupby.apply with a lambda) avoids handing the
# grouping column to the callback, which recent pandas versions warn about.
all_losses = pd.Series(
    {
        country: np.round(
            israel_fitted.loss(
                country_df['DayOfYear'].to_numpy(),
                country_df['Temp'].to_numpy(),
            ),
            2,
        )
        for country, country_df in df.groupby('Country')
    }
).rename_axis('Country')

# Keep only the countries under comparison, as a two-column frame (Country, Loss).
countries_losses = (
    all_losses[all_losses.index.isin(countries)]
    .rename('Loss')
    .reset_index()
)

fig_5 = px.bar(countries_losses,
               x='Country', y='Loss',
               text='Loss',
               title='Loss of Other Countries over Israel-Fitted model',
               color='Country')
fig_5.show()
pio.write_image(fig=fig_5, engine='orca', file='other_countries_loss_using_israel_fitted.png')