In [23]:
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm

In [26]:
# Load the climate records. The two CFC columns are renamed because a
# hyphenated name such as 'CFC-11' would be read as "CFC minus 11" inside the
# formula strings passed to sm.ols in the modelling cells.
renamed = {'CFC-11': 'CFC11', 'CFC-12': 'CFC12'}
climate = pd.read_csv('./data/climate_change.csv').rename(columns=renamed)

In [27]:
# Split on the Year column: rows up to and including 2006 are used for
# fitting, rows after 2006 are held out for evaluation.
climate_train = climate[climate['Year'] <= 2006]
climate_test = climate[climate['Year'] > 2006]

In [28]:
# Full model: ordinary least squares of Temp on every candidate predictor,
# fitted on the training rows only.
full_formula = 'Temp ~ MEI + CO2 + CH4 + N2O + CFC11 + CFC12 + TSI + Aerosols'
climate_model = sm.ols(full_formula, data=climate_train)
climate_fitting = climate_model.fit()
climate_fitting.summary()

0,1,2,3
Dep. Variable:,Temp,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.744
Method:,Least Squares,F-statistic:,103.6
Date:,"Thu, 30 Jun 2016",Prob (F-statistic):,1.94e-78
Time:,08:08:08,Log-Likelihood:,280.1
No. Observations:,284,AIC:,-542.2
Df Residuals:,275,BIC:,-509.4
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-124.5943,19.887,-6.265,0.000,-163.744 -85.445
MEI,0.0642,0.006,9.923,0.000,0.051 0.077
CO2,0.0065,0.002,2.826,0.005,0.002 0.011
CH4,0.0001,0.001,0.240,0.810,-0.001 0.001
N2O,-0.0165,0.009,-1.930,0.055,-0.033 0.000
CFC11,-0.0066,0.002,-4.078,0.000,-0.010 -0.003
CFC12,0.0038,0.001,3.757,0.000,0.002 0.006
TSI,0.0931,0.015,6.313,0.000,0.064 0.122
Aerosols,-1.5376,0.213,-7.210,0.000,-1.957 -1.118

0,1,2,3
Omnibus:,8.74,Durbin-Watson:,0.956
Prob(Omnibus):,0.013,Jarque-Bera (JB):,10.327
Skew:,0.289,Prob(JB):,0.00572
Kurtosis:,3.733,Cond. No.,8530000.0


In [29]:
# We can see that the R2 of the full model is 0.75.
# CH4 is not significant (p = 0.810), and N2O is borderline (p = 0.055), so we could put it in the same bag.
# The CFC11 coefficient is negative, which would suggest that temperature decreases as CFC11 increases...
# Something is distorting this. Maybe there is strong correlation between the predictor variables.

In [30]:
# Correlation check: which of the variables are highly correlated with N2O?
# Take the N2O column of the pairwise correlation matrix of the training data.
climate_train.corr().loc[:, 'N2O']

Year        0.993845
Month       0.013632
MEI        -0.050820
CO2         0.976720
CH4         0.899839
N2O         1.000000
CFC11       0.522477
CFC12       0.867931
TSI         0.199757
Aerosols   -0.337055
Temp        0.778639
Name: N2O, dtype: float64

In [31]:
# From the N2O correlations: CO2, CH4 and CFC12 are strongly correlated with N2O.

# Same check for CFC11: which of the variables are highly correlated with it?
climate_train.corr().loc[:, 'CFC11']

Year        0.569106
Month      -0.013111
MEI         0.069000
CO2         0.514060
CH4         0.779904
N2O         0.522477
CFC11       1.000000
CFC12       0.868985
TSI         0.272046
Aerosols   -0.043921
Temp        0.407710
Name: CFC11, dtype: float64

In [32]:
# From the CFC11 correlations: CH4 and CFC12 are the highly correlated ones.

# Simplified model: keep only MEI, N2O, TSI and Aerosols as predictors.
simplified_formula = 'Temp ~ MEI + N2O + TSI + Aerosols'
climate_model = sm.ols(simplified_formula, data=climate_train)
climate_fitting = climate_model.fit()
climate_fitting.summary()

0,1,2,3
Dep. Variable:,Temp,R-squared:,0.726
Model:,OLS,Adj. R-squared:,0.722
Method:,Least Squares,F-statistic:,184.9
Date:,"Thu, 30 Jun 2016",Prob (F-statistic):,3.52e-77
Time:,08:21:08,Log-Likelihood:,266.64
No. Observations:,284,AIC:,-523.3
Df Residuals:,279,BIC:,-505.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-116.2269,20.223,-5.747,0.000,-156.036 -76.418
MEI,0.0642,0.007,9.649,0.000,0.051 0.077
N2O,0.0253,0.001,19.307,0.000,0.023 0.028
TSI,0.0795,0.015,5.344,0.000,0.050 0.109
Aerosols,-1.7017,0.218,-7.806,0.000,-2.131 -1.273

0,1,2,3
Omnibus:,10.908,Durbin-Watson:,0.842
Prob(Omnibus):,0.004,Jarque-Bera (JB):,15.097
Skew:,0.289,Prob(JB):,0.000527
Kurtosis:,3.971,Cond. No.,5000000.0


In [37]:
# In the simplified model the sign of the N2O coefficient switches
# (-0.0165 -> 0.0253) and the model keeps most of its explanatory power:
# R2 only drops from 0.751 to 0.726, which is acceptable.

# Refit with every predictor of the full model except CH4.
no_ch4_formula = 'Temp ~ MEI + CO2 + N2O + CFC11 + CFC12 + TSI + Aerosols'
climate_model = sm.ols(no_ch4_formula, data=climate_train)
climate_fitting = climate_model.fit()

# Test the model on the held-out rows.
predictions = climate_fitting.predict(climate_test)

# Out-of-sample R2 = 1 - SSE/SST. The SST baseline deliberately uses the
# mean temperature of the *training* set as the naive prediction for every
# test row.
actual_temp = climate_test['Temp']
baseline_temp = climate_train['Temp'].mean()
SSE = sum((predictions - actual_temp) ** 2)
SST = sum((baseline_temp - actual_temp) ** 2)
R2 = 1 - SSE / SST
R2

0.62860512251000555