
# Competition 2 - Ridge Regression 

## Name: OyWithThePoodles 
===========================================

**MSE train set**: 1.03

**R2 train set**: 20% 

-------------------------------------------
**MSE test set**: 0.80

**R2 test set**: 26% 

-------> **Best alpha found**: 0.99

*comparison with Lasso Regression* 

|RegressionType|Dataset|MSE|R2|
|--------------|-------|-----|-----|
|Lasso|Train|0.79|25%|
|Lasso|Test|0.65|29%|
|Ridge|Train|1.03|20%|
|Ridge|Test|0.80|26%|

## overall, Lasso worked better. 

In [42]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.linear_model import Ridge 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

ds = pd.read_csv('trainset.csv')

xs = ['x']
xs.extend(['x_%d' % i for i in range(1,14)])

y = ds['LOGVALUE']
x = ds.iloc[:,1:] 

print(type(y))
print(type(x))

y = np.array(y)

print(y[:5])
print(x.head())

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
[12.89921983 11.22524339 11.56171563 13.12236338 11.8913619 ]
   BATHS  BEDRMS  BUILT  UNITSF      LOT  ROOMS  REGION  KITCHEN  FLOORS  \
0      2       3   2001  1531.0   2712.0      6       4        1       2   
1      1       3   1975  1125.0  11000.0      5       3        1       1   
2      2       3   1995  1300.0  44000.0      5       3        1       1   
3      4       5   2001  8299.0  11000.0     11       4        1       2   
4      2       4   1990  1200.0  11000.0      7       2        1       4   

   LAUNDY  RECRM  METRO  METRO3  
0       0      0      7       2  
1       0      0      1       1  
2       0      0      7       2  
3       0      0      7       2  
4       0      0      7       2  


In [43]:
type(x.iloc[4,2])

numpy.int64

In [44]:
#ok, now I'll build the model using all the columns - no normalizations 

#I have 1 categorical variable, so I'll have to treat that differently. 

#encoding year built - if it's before 1970s, it's old (0), if it's after it's ok (1)

x.loc[x['BUILT'] >= 1970, 'BUILT'] = 1
x.loc[x['BUILT'] < 1970, 'BUILT'] = 0

x.head()

Unnamed: 0,BATHS,BEDRMS,BUILT,UNITSF,LOT,ROOMS,REGION,KITCHEN,FLOORS,LAUNDY,RECRM,METRO,METRO3
0,2,3,0,1531.0,2712.0,6,4,1,2,0,0,7,2
1,1,3,0,1125.0,11000.0,5,3,1,1,0,0,1,1
2,2,3,0,1300.0,44000.0,5,3,1,1,0,0,7,2
3,4,5,0,8299.0,11000.0,11,4,1,2,0,0,7,2
4,2,4,0,1200.0,11000.0,7,2,1,4,0,0,7,2


In [46]:
sum(x['BUILT'])

2909690

In [47]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

#fit model 

#training and validation set 
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.3, random_state = 0)

def find_alpha(x, y):
    """
    Find the best alpha value based on training set.
    """
    scores = []
    model = Ridge()
    cv = RepeatedKFold(n_splits = 10, n_repeats = 5, random_state = 1)
    grid = dict()
    grid['alpha'] = np.arange(0, 1, 0.01) #possible alpha values 
    search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv = cv, n_jobs = -1)
    results = search.fit(x, y)
    
    print('MSE: ', results.best_score_)
    print('alpha: ', results.best_params_)

find_alpha(x_train, y_train)

MSE:  -0.8769061927749798
alpha:  {'alpha': 0.99}


In [59]:
#modeling 
ridge_model = Ridge(alpha = 0.99)

ridge_fit = ridge_model.fit(x_train, y_train)
ridge_pred = ridge_fit.predict(x_val) 

#how well does this do? 
mse = mean_squared_error(y_val, ridge_pred)
r2 = r2_score(y_val,ridge_pred)

print('MSE: ', mse)
print('R2: ', r2)

MSE:  1.0345934916059325
R2:  0.19917720021292884


In [60]:
#lets see how well it does on test set 

testset = pd.read_csv('testset.csv')
testset.loc[x['BUILT'] >= 1970, 'BUILT'] = 1
testset.loc[x['BUILT'] < 1970, 'BUILT'] = 0

ytest = testset['LOGVALUE']
xtest = testset.iloc[:,1:]

test_pred = ridge_fit.predict(xtest)
test_mse = mean_squared_error(ytest, test_pred)
test_r2 = r2_score(ytest, test_pred)

print("test MSE:", test_mse)
print("test R2:", test_r2)

test MSE: 0.7909599117538261
test R2: 0.2589496445468772


In [61]:
#well, doesn't make much sense that my test set works so much better! 