In [82]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.formula.api as sm
from math import sqrt

In [83]:
# Load the training set and the held-out test set from the local data folder.
wine, wine_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/wine.csv', './data/wine_test.csv')
)

In [84]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
# of the training data.
wine.describe()

Unnamed: 0,Year,Price,WinterRain,AGST,HarvestRain,Age,FrancePop
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,1965.8,7.067224,605.28,16.509336,148.56,17.2,49694.43676
std,7.691987,0.650341,132.277965,0.675397,74.419464,7.691987,3665.270243
min,1952.0,6.2049,376.0,14.9833,38.0,5.0,43183.569
25%,1960.0,6.5188,536.0,16.2,89.0,11.0,46583.995
50%,1966.0,7.1211,600.0,16.5333,130.0,17.0,50254.966
75%,1972.0,7.495,697.0,17.0667,187.0,23.0,52894.183
max,1978.0,8.4937,830.0,17.65,292.0,31.0,54602.193


In [85]:
# Model 1: simple linear regression of Price on a single predictor, AGST.
lm1 = LinearRegression()
lm1.fit(X=wine[['AGST']], y=wine['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [86]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
prediction = lm1.predict(X=wine[['AGST']])

# Training-set coefficient of determination.
print("R2: ", r2_score(y_true=wine['Price'], y_pred=prediction))

# Fit the same model with statsmodels to get the full OLS summary
# (coefficients, standard errors, t statistics, p-values).
model_1 = sm.ols(
    formula='Price ~ AGST',
    data=wine,
)
fitted_1 = model_1.fit()
fitted_1.summary()

R2:  0.4350231678


0,1,2,3
Dep. Variable:,Price,R-squared:,0.435
Model:,OLS,Adj. R-squared:,0.41
Method:,Least Squares,F-statistic:,17.71
Date:,"Wed, 29 Jun 2016",Prob (F-statistic):,0.000335
Time:,16:12:38,Log-Likelihood:,-17.07
No. Observations:,25,AIC:,38.14
Df Residuals:,23,BIC:,40.58
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-3.4178,2.494,-1.371,0.184,-8.576 1.740
AGST,0.6351,0.151,4.208,0.000,0.323 0.947

0,1,2,3
Omnibus:,0.986,Durbin-Watson:,1.758
Prob(Omnibus):,0.611,Jarque-Bera (JB):,0.79
Skew:,0.055,Prob(JB):,0.674
Kurtosis:,2.136,Cond. No.,414.0


In [87]:
# Model 2: regress Price on two predictors, AGST and HarvestRain.
lm2 = LinearRegression()
lm2.fit(X=wine[['AGST', 'HarvestRain']], y=wine['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [88]:
# In-sample predictions from the two-variable model.
prediction = lm2.predict(X=wine[['AGST', 'HarvestRain']])

# Training-set coefficient of determination.
print("R2: ", r2_score(y_true=wine['Price'], y_pred=prediction))

# Same model in statsmodels, for the per-coefficient statistics.
model_2 = sm.ols(
    formula='Price ~ AGST + HarvestRain',
    data=wine,
)
fitted_2 = model_2.fit()
fitted_2.summary()

R2:  0.707370766205


0,1,2,3
Dep. Variable:,Price,R-squared:,0.707
Model:,OLS,Adj. R-squared:,0.681
Method:,Least Squares,F-statistic:,26.59
Date:,"Wed, 29 Jun 2016",Prob (F-statistic):,1.35e-06
Time:,16:12:39,Log-Likelihood:,-8.8461
No. Observations:,25,AIC:,23.69
Df Residuals:,22,BIC:,27.35
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-2.2027,1.854,-1.188,0.248,-6.048 1.643
AGST,0.6026,0.111,5.415,0.000,0.372 0.833
HarvestRain,-0.0046,0.001,-4.525,0.000,-0.007 -0.002

0,1,2,3
Omnibus:,1.897,Durbin-Watson:,1.775
Prob(Omnibus):,0.387,Jarque-Bera (JB):,1.096
Skew:,-0.511,Prob(JB):,0.578
Kurtosis:,3.08,Cond. No.,4200.0


In [89]:
# Model 3: regress Price on all five explanatory variables used in this
# notebook (the Year column is left out).
variables = [
    'AGST',
    'HarvestRain',
    'WinterRain',
    'Age',
    'FrancePop',
]

lm3 = LinearRegression()
lm3.fit(X=wine[variables], y=wine['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [90]:
# In-sample predictions from the five-variable model.
prediction = lm3.predict(X=wine[variables])

# Training-set coefficient of determination.
print("R2: ", r2_score(y_true=wine['Price'], y_pred=prediction))

# Build the right-hand side of the formula from the current feature list,
# then fit the same model with statsmodels for the OLS summary.
v = ' + '.join(variables)
model_3 = sm.ols(
    formula='Price ~ {}'.format(v),
    data=wine,
)
fitted_3 = model_3.fit()
fitted_3.summary()

R2:  0.82935922233


0,1,2,3
Dep. Variable:,Price,R-squared:,0.829
Model:,OLS,Adj. R-squared:,0.784
Method:,Least Squares,F-statistic:,18.47
Date:,"Wed, 29 Jun 2016",Prob (F-statistic):,1.04e-06
Time:,16:12:39,Log-Likelihood:,-2.1043
No. Observations:,25,AIC:,16.21
Df Residuals:,19,BIC:,23.52
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.4504,10.189,-0.044,0.965,-21.776 20.875
AGST,0.6012,0.103,5.836,0.000,0.386 0.817
HarvestRain,-0.0040,0.001,-4.523,0.000,-0.006 -0.002
WinterRain,0.0010,0.001,1.963,0.064,-6.89e-05 0.002
Age,0.0006,0.079,0.007,0.994,-0.165 0.166
FrancePop,-4.953e-05,0.000,-0.297,0.770,-0.000 0.000

0,1,2,3
Omnibus:,1.769,Durbin-Watson:,2.792
Prob(Omnibus):,0.413,Jarque-Bera (JB):,1.026
Skew:,-0.005,Prob(JB):,0.599
Kurtosis:,2.008,Cond. No.,8410000.0


In [91]:
# Is it worth keeping all of those variables?
# In the model 3 summary FrancePop has a large P>|t| value (0.770; Age is
# 0.994), i.e. it is not statistically significant, so we try dropping it.

# Model 4: every variable from model 3 except FrancePop.
variables = [
    'AGST',
    'HarvestRain',
    'WinterRain',
    'Age',
]

lm4 = LinearRegression()
lm4.fit(X=wine[variables], y=wine['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [92]:
# In-sample predictions from the four-variable model.
prediction = lm4.predict(X=wine[variables])

# Training-set coefficient of determination.
print("R2: ", r2_score(y_true=wine['Price'], y_pred=prediction))

# Build the right-hand side of the formula from the current feature list,
# then fit the same model with statsmodels for the OLS summary.
v = ' + '.join(variables)
model_4 = sm.ols(
    formula='Price ~ {}'.format(v),
    data=wine,
)
fitted_4 = model_4.fit()
fitted_4.summary()

R2:  0.828566219342


0,1,2,3
Dep. Variable:,Price,R-squared:,0.829
Model:,OLS,Adj. R-squared:,0.794
Method:,Least Squares,F-statistic:,24.17
Date:,"Wed, 29 Jun 2016",Prob (F-statistic):,2.04e-07
Time:,16:12:39,Log-Likelihood:,-2.1622
No. Observations:,25,AIC:,14.32
Df Residuals:,20,BIC:,20.42
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-3.4300,1.766,-1.942,0.066,-7.114 0.254
AGST,0.6072,0.099,6.152,0.000,0.401 0.813
HarvestRain,-0.0040,0.001,-4.652,0.000,-0.006 -0.002
WinterRain,0.0011,0.001,2.120,0.047,1.73e-05 0.002
Age,0.0239,0.008,2.956,0.008,0.007 0.041

0,1,2,3
Omnibus:,1.814,Durbin-Watson:,2.797
Prob(Omnibus):,0.404,Jarque-Bera (JB):,1.041
Skew:,0.034,Prob(JB):,0.594
Kurtosis:,2.003,Cond. No.,19100.0


## Correlation

In [93]:
# Let's check the correlation between WinterRain and Price.
# Read the value from the correlation matrix with a single .loc[row, column]
# lookup instead of chained [...][...] indexing.
wine.corr().loc['Price', 'WinterRain']

0.13665054738762955

In [94]:
# Correlation between Age and FrancePop.
# Read the value from the correlation matrix with a single .loc[row, column]
# lookup instead of chained [...][...] indexing.
wine.corr().loc['FrancePop', 'Age']

-0.99448509711145461

In [95]:
# Pairwise correlation matrix for every column of the training data
wine.corr()

Unnamed: 0,Year,Price,WinterRain,AGST,HarvestRain,Age,FrancePop
Year,1.0,-0.447768,0.01697,-0.246916,0.028009,-1.0,0.994485
Price,-0.447768,1.0,0.136651,0.659563,-0.563322,0.447768,-0.466862
WinterRain,0.01697,0.136651,1.0,-0.321091,-0.275441,-0.01697,-0.001622
AGST,-0.246916,0.659563,-0.321091,1.0,-0.064496,0.246916,-0.259162
HarvestRain,0.028009,-0.563322,-0.275441,-0.064496,1.0,-0.028009,0.041264
Age,-1.0,0.447768,-0.01697,0.246916,-0.028009,1.0,-0.994485
FrancePop,0.994485,-0.466862,-0.001622,-0.259162,0.041264,-0.994485,1.0


In [96]:
# Model 5: drop both Age and FrancePop, keeping only the three weather
# variables.
variables = [
    'AGST',
    'HarvestRain',
    'WinterRain',
]

lm5 = LinearRegression()
lm5.fit(X=wine[variables], y=wine['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [100]:
# In-sample predictions from the three-variable model.
prediction = lm5.predict(X=wine[variables])

# Training-set coefficient of determination.
print("R2: ", r2_score(y_true=wine['Price'], y_pred=prediction))

# Build the right-hand side of the formula from the current feature list,
# then fit the same model with statsmodels for the OLS summary.
v = ' + '.join(variables)
model_5 = sm.ols(
    formula='Price ~ {}'.format(v),
    data=wine,
)
fitted_5 = model_5.fit()
fitted_5.summary()

R2:  0.753689442638


0,1,2,3
Dep. Variable:,Price,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.719
Method:,Least Squares,F-statistic:,21.42
Date:,"Wed, 29 Jun 2016",Prob (F-statistic):,1.36e-06
Time:,16:27:09,Log-Likelihood:,-6.6922
No. Observations:,25,AIC:,21.38
Df Residuals:,21,BIC:,26.26
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-4.3016,2.037,-2.112,0.047,-8.537 -0.066
AGST,0.6810,0.112,6.097,0.000,0.449 0.913
HarvestRain,-0.0039,0.001,-3.953,0.001,-0.006 -0.002
WinterRain,0.0012,0.001,1.987,0.060,-5.47e-05 0.002

0,1,2,3
Omnibus:,0.621,Durbin-Watson:,2.022
Prob(Omnibus):,0.733,Jarque-Bera (JB):,0.419
Skew:,-0.303,Prob(JB):,0.811
Kurtosis:,2.81,Cond. No.,18800.0


In [99]:
# By removing Age and FrancePop we have decreased R2. So, model 4 wins! \o/

## Making Predictions

In [102]:
# So far every model has been scored on the same data it was fitted on.
# To assess a model properly we need data it has not seen, so we use the
# separate test set. (Ideally we would use a validation set as well.)
wine_test


Unnamed: 0,Year,Price,WinterRain,AGST,HarvestRain,Age,FrancePop
0,1979,6.9541,717,16.1667,122,4,54835.832
1,1980,6.4979,578,16.0,74,3,55110.236


In [104]:
# Reset the feature list to the one model 4 was fitted on; the model 5 cell
# overwrote `variables` with a three-feature list.
variables = [
    'AGST',
    'HarvestRain',
    'WinterRain',
    'Age',
]

# Predict prices for the held-out test rows with model 4.
prediction = lm4.predict(X=wine_test[variables])
prediction

array([ 6.76892463,  6.6849104 ])

In [112]:
# Out-of-sample R2 for model 4, computed by hand.
#
# The baseline below is the mean Price of the *training* data, so this is not
# the same quantity as r2_score(wine_test.Price, prediction), which measures
# against the mean of the test prices themselves.
residuals = wine_test['Price'] - prediction
baseline_errors = wine_test['Price'] - np.mean(wine['Price'])

# Sum of squared errors of the model on the test set (SSE)
SSE = sum(residuals ** 2)

# Total sum of squares around the training-mean baseline (SST)
SST = sum(baseline_errors ** 2)

# R2 = 1 - SSE / SST
# Keep in mind that our test set is small (2 data points)
1 - SSE / SST

0.79442776026330031

In [None]:
# TODO: plot the R2 of each model on the train and test data to illustrate the differences between them