In [1]:
import os 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the cleaned city-temperature table; the first CSV column becomes the row index.
df = pd.read_csv(
    "data/city_temps_cleaned.csv",
    index_col=0,
)

In [5]:
# Preview the first five rows.
# NOTE(review): this cell's execution count (5) is later than that of the
# feature-engineering cell placed after it (4), so the stored output presumably
# shows the frame after those derived columns were added.
df.head(5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,Latitude-Direction,Longitude-Direction,Latitude-Value,...,11,12,2,3,4,5,6,7,8,9
1875,1900-02-01,-2.799,0.882,Århus,Denmark,57.05N,10.33E,N,E,57.05,...,0,0,1,0,0,0,0,0,0,0
1876,1900-03-01,0.592,0.429,Århus,Denmark,57.05N,10.33E,N,E,57.05,...,0,0,0,1,0,0,0,0,0,0
1877,1900-04-01,4.63,0.417,Århus,Denmark,57.05N,10.33E,N,E,57.05,...,0,0,0,0,1,0,0,0,0,0
1878,1900-05-01,9.576,0.521,Århus,Denmark,57.05N,10.33E,N,E,57.05,...,0,0,0,0,0,1,0,0,0,0
1879,1900-06-01,15.888,0.592,Århus,Denmark,57.05N,10.33E,N,E,57.05,...,0,0,0,0,0,0,1,0,0,0


In [4]:
# Derive the model features from the raw columns:
#   * the trailing hemisphere letter and the numeric part of the 'Latitude' /
#     'Longitude' strings (values look like "57.05N" / "10.33E"),
#   * a parsed timestamp plus its calendar year and month (month kept as a string
#     so it can be one-hot encoded),
#   * one-hot indicator columns for both hemisphere letters and for the month.
df = df.assign(**{
    'Latitude-Direction': df['Latitude'].str.get(-1),
    'Longitude-Direction': df['Longitude'].str.get(-1),
    'Latitude-Value': pd.to_numeric(df['Latitude'].str.slice(0, -1)),
    'Longitude-Value': pd.to_numeric(df['Longitude'].str.slice(0, -1)),
    'dt': pd.to_datetime(df['dt']),
})
df = df.assign(
    year=df['dt'].dt.year,
    month=df['dt'].dt.month.astype(str),
)

# Indicator frames; their column labels are the category values themselves
# ('E'/'W', 'N'/'S', and the month numbers as strings).
long_direc = pd.get_dummies(df['Longitude-Direction'])
lat_direc = pd.get_dummies(df['Latitude-Direction'])
month = pd.get_dummies(df['month'])
df = df.join([long_direc, lat_direc, month])

# Keep only rows dated after 1900-01-01. The comparison is strict, so a row dated
# exactly 1900-01-01 is dropped as well.
# NOTE(review): the same strict filter is applied when the CSV is reloaded later in
# the notebook; the two must stay identical so the prediction array lines up with
# that frame row for row.
df = df.loc[df['dt'] > '1900-01-01']

In [6]:
# Modelling table: the target ('AverageTemperature') comes first, followed by the
# numeric coordinates and year, the hemisphere indicators, and the twelve month
# indicators. Later cells rely on the target being column 0.
final = df.loc[
    :,
    ['AverageTemperature', 'Latitude-Value', 'Longitude-Value', 'year']
    + ['E', 'N', 'S', 'W']
    + [str(month_number) for month_number in range(1, 13)],
].copy()

In [8]:
# Drop the reference to the full feature frame; the modelling cells that follow
# work from the `final` copy instead.
del df 

In [9]:
# Preview the first five rows of the modelling table (target first, then features).
final.head(5)

Unnamed: 0,AverageTemperature,Latitude-Value,Longitude-Value,year,E,N,S,W,1,2,3,4,5,6,7,8,9,10,11,12
1875,-2.799,57.05,10.33,1900,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1876,0.592,57.05,10.33,1900,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1877,4.63,57.05,10.33,1900,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1878,9.576,57.05,10.33,1900,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1879,15.888,57.05,10.33,1900,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [10]:
# Hold out 20% of the rows for evaluation. Column 0 of `final` is the target
# ('AverageTemperature'); every remaining column is a feature.
# `random_state` is fixed so the shuffle, and therefore the reported error, is
# reproducible across kernel restarts.
# NOTE(review): rows are shuffled across all years, so the held-out error measures
# interpolation within the observed period rather than forecasting skill — confirm
# this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    final.iloc[:, 1:],
    final.iloc[:, 0],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [11]:
# Fit a random forest on the training split and score it on the held-out rows.
# The estimator is seeded so that refitting on the same data yields the same model
# and the same predictions.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
predictYrf = rf.predict(X_test)

# `mean_squared_error` is called with its defaults, so this value is the mean
# squared error (in squared target units), not its square root.
MSESqrf = mean_squared_error(y_test, predictYrf)
print(MSESqrf)

0.30943034196811564


#### Use the trained model to predict the temperatures for roughly the next 100 years (each historical year shifted forward by 114).

In [12]:
# The feature frame only encodes month, geographical location and year, so the
# inputs for future predictions are the historical feature rows with `year` moved
# forward by 114.
# The shift is applied to a derived frame instead of being written back into
# `final`, so re-running this cell cannot add the offset a second time and the
# training table stays untouched.
# NOTE(review): tree-based models generally cannot extrapolate beyond the range of
# `year` seen during training — confirm the predictions for the shifted years are
# meaningful.
future_test = final.iloc[:, 1:].assign(year=lambda features: features['year'] + 114)

In [14]:
# Preview the first five rows of the year-shifted feature frame.
future_test.head(5)

Unnamed: 0,Latitude-Value,Longitude-Value,year,E,N,S,W,1,2,3,4,5,6,7,8,9,10,11,12
1875,57.05,10.33,2014,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1876,57.05,10.33,2014,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1877,57.05,10.33,2014,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1878,57.05,10.33,2014,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1879,57.05,10.33,2014,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [15]:
# Predict one temperature per row of the year-shifted feature frame. The result is
# later assigned back as a column, so its order must match the row order of the
# frame it is attached to.
future_pred = rf.predict(future_test)

In [16]:
# Drop the references to the modelling table and the shifted feature frame; only
# the prediction array is needed from here on.
del final, future_test

In [17]:
# Reload the cleaned table and keep only the descriptive columns, applying the same
# strict "after 1900-01-01" filter as the feature-engineering step. The result is
# written out as the "original" observations file.
useful_cols = ['dt', 'AverageTemperature', 'City', 'Country', 'Latitude', 'Longitude']
df = pd.read_csv("data/city_temps_cleaned.csv", index_col=0).loc[:, useful_cols]
df = df.assign(dt=pd.to_datetime(df['dt']))
df = df.loc[df['dt'] > '1900-01-01']

# The row index is not written to the file; the header row is.
df.to_csv(
    "data/original.csv",
    header=True,
    index=False,
)

In [19]:
# Preview the first five rows of the slimmed-down observations frame.
df.head(5)

Unnamed: 0,dt,AverageTemperature,City,Country,Latitude,Longitude
1875,1900-02-01,-2.799,Århus,Denmark,57.05N,10.33E
1876,1900-03-01,0.592,Århus,Denmark,57.05N,10.33E
1877,1900-04-01,4.63,Århus,Denmark,57.05N,10.33E
1878,1900-05-01,9.576,Århus,Denmark,57.05N,10.33E
1879,1900-06-01,15.888,Århus,Denmark,57.05N,10.33E


In [20]:
# Move every timestamp forward by 114 calendar years, matching the shift applied to
# the `year` feature used for prediction.
# NOTE(review): running this cell more than once shifts the dates again.
df = df.assign(dt=df['dt'] + pd.DateOffset(years=114))

In [21]:
# Replace the observed temperatures with the model's predictions. The array is
# matched to rows purely by position, so this frame must have the same rows in the
# same order as the feature frame the predictions were made from.
df = df.assign(AverageTemperature=future_pred)

In [22]:
# Preview the first five rows after the date shift and the prediction overwrite.
df.head(5)

Unnamed: 0,dt,AverageTemperature,City,Country,Latitude,Longitude
1875,2014-02-01,-0.36499,Århus,Denmark,57.05N,10.33E
1876,2014-03-01,-0.35974,Århus,Denmark,57.05N,10.33E
1877,2014-04-01,5.52227,Århus,Denmark,57.05N,10.33E
1878,2014-05-01,12.7988,Århus,Denmark,57.05N,10.33E
1879,2014-06-01,15.20946,Århus,Denmark,57.05N,10.33E


In [23]:
# Persist the predicted frame; the row index is not written, the header row is.
df.to_csv(
    "data/predictions.csv",
    header=True,
    index=False,
)

#### Read back the original and predicted data, merge them, and break the result into one file per year

In [2]:
# Read the two files written earlier: the observations and the model predictions.
original, prediction = (
    pd.read_csv(csv_path)
    for csv_path in ("data/original.csv", "data/predictions.csv")
)

In [3]:
# Stack the observed rows on top of the predicted rows. Row labels from both inputs
# are kept as they are (no `ignore_index`), so labels repeat in the result.
df = pd.concat([original, prediction], axis="index")

In [7]:
# 'dt' was read back from CSV, so parse it into timestamps again, then derive the
# calendar year and the month (month stored as a string).
df = df.assign(dt=pd.to_datetime(df['dt']))
df = df.assign(
    year=df['dt'].dt.year,
    month=df['dt'].dt.month.astype(str),
)

In [8]:
# Preview the first five rows of the combined frame, including the new year/month columns.
df.head(5)

Unnamed: 0,dt,AverageTemperature,City,Country,Latitude,Longitude,year,month
0,1900-02-01,-2.799,Århus,Denmark,57.05N,10.33E,1900,2
1,1900-03-01,0.592,Århus,Denmark,57.05N,10.33E,1900,3
2,1900-04-01,4.63,Århus,Denmark,57.05N,10.33E,1900,4
3,1900-05-01,9.576,Århus,Denmark,57.05N,10.33E,1900,5
4,1900-06-01,15.888,Århus,Denmark,57.05N,10.33E,1900,6


In [9]:
# Persist the combined (observed + predicted) frame; the row index is not written.
df.to_csv(
    "data/orig+pred.csv",
    header=True,
    index=False,
)

#### Break the complete data (original + predictions) from 1900–2127 into multiple CSV files for easier loading.

In [10]:
# Write one CSV per calendar year under the current working directory's data/
# folder; each file holds that year's rows, without the row index.
output_root = os.getcwd()
for year_value, year_rows in df.groupby('year'):
    target_path = os.path.join(output_root, "data/data_{}.csv".format(year_value))
    year_rows.to_csv(target_path, index=False)