# Time Series End-to-End Exercise
***

## Forecasting Tokyo Temperatures

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters

from sklearn.metrics import mean_squared_error
from math import sqrt

import statsmodels.api as sm
from statsmodels.tsa.api import Holt

import warnings
# Silence every warning for the rest of the session: no category or module
# filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides deprecation and SettingWithCopy-style warnings
# from the libraries imported above — confirm the blanket filter is intended.
warnings.filterwarnings('ignore')

### Acquire Data

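The data was downloaded from Kaggle's [Climate Change: Earth Surface Temperature Data](https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data) dataset and saved locally as `GlobalLandTemperaturesByCity.csv`.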

In [2]:
# Load the full city-level temperature file into memory.
temps = pd.read_csv(filepath_or_buffer='GlobalLandTemperaturesByCity.csv')

In [3]:
# Preview the first five rows of the full dataset.
temps.head(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [4]:
# Summarise the full dataset: row count, column dtypes and memory footprint.
temps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   dt                             object 
 1   AverageTemperature             float64
 2   AverageTemperatureUncertainty  float64
 3   City                           object 
 4   Country                        object 
 5   Latitude                       object 
 6   Longitude                      object 
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [6]:
# Keep only the Tokyo rows. The explicit .copy() makes `tokyo` an independent
# DataFrame, so filling in the missing temperatures later modifies this frame
# directly rather than a slice of `temps` (the chained-assignment case pandas
# flags with SettingWithCopyWarning, which this notebook silences).
# The original row labels are preserved.
tokyo = temps.loc[temps['City'] == 'Tokyo'].copy()

In [9]:
# Cache the Tokyo subset on disk for easy access later.
# The row index is written as well (pandas default), as an unnamed first column.
tokyo.to_csv(path_or_buf='tokyo_temps.csv')

In [7]:
# Preview the first five Tokyo rows.
tokyo.head(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
7659221,1845-01-01,2.377,2.006,Tokyo,Japan,36.17N,139.23E
7659222,1845-02-01,1.312,2.968,Tokyo,Japan,36.17N,139.23E
7659223,1845-03-01,5.276,1.506,Tokyo,Japan,36.17N,139.23E
7659224,1845-04-01,10.387,1.508,Tokyo,Japan,36.17N,139.23E
7659225,1845-05-01,14.923,1.448,Tokyo,Japan,36.17N,139.23E


In [8]:
# Summarise the Tokyo subset: row count, non-null counts per column and dtypes.
tokyo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2025 entries, 7659221 to 7661245
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   dt                             2025 non-null   object 
 1   AverageTemperature             2020 non-null   float64
 2   AverageTemperatureUncertainty  2020 non-null   float64
 3   City                           2025 non-null   object 
 4   Country                        2025 non-null   object 
 5   Latitude                       2025 non-null   object 
 6   Longitude                      2025 non-null   object 
dtypes: float64(2), object(5)
memory usage: 126.6+ KB


### Prepare Data

In [10]:
# Count the missing values in each column of the Tokyo subset.
tokyo.isna().sum()

dt                               0
AverageTemperature               5
AverageTemperatureUncertainty    5
City                             0
Country                          0
Latitude                         0
Longitude                        0
dtype: int64

There are 5 null values in each of the `AverageTemperature` and `AverageTemperatureUncertainty` columns. Because there are so few, and I don't want any null or missing values in my dataset, I will simply fill them in with the average temperatures for those months.

In [14]:
# Show the rows whose AverageTemperature is missing.
tokyo.loc[tokyo['AverageTemperature'].isna()]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
7659521,1870-01-01,,,Tokyo,Japan,36.17N,139.23E
7659522,1870-02-01,,,Tokyo,Japan,36.17N,139.23E
7659523,1870-03-01,,,Tokyo,Japan,36.17N,139.23E
7659524,1870-04-01,,,Tokyo,Japan,36.17N,139.23E
7661245,2013-09-01,,,Tokyo,Japan,36.17N,139.23E
