# Linear Modeling

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training features, the test features and the training target
# (in that order) from CSV files in the working directory.
housing_train, housing_test, y_housing = (
    pd.read_csv(csv_path)
    for csv_path in ('housing_train.csv', 'housing_test.csv', 'y_housing.csv')
)

In [None]:
# Report the shape of each loaded table.
for caption, table in (
    ('Full Train dimension:', housing_train),
    ('Y train dimension:', y_housing),
    ('Test data dimension:', housing_test),
):
    print(caption, table.shape)



In [None]:
from sklearn import linear_model
from sklearn.linear_model import ElasticNet, Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import  train_test_split
from sklearn.metrics import mean_squared_error, log_loss

import sklearn.model_selection as ms

from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
from sklearn.svm import SVR

In [None]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    housing_train,
    y_housing,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Candidate regressors compared in the cells below.
#
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so the three penalised linear models only construct on older
# releases -- confirm the target version before upgrading.
ridge = Ridge(alpha=1, normalize=True)
# NOTE(review): alpha=0 switches the L1 penalty off entirely; the scikit-learn
# docs advise against alpha=0 with Lasso for numerical reasons -- confirm this
# is meant as an unpenalised baseline.
lasso = Lasso(alpha=0, normalize=True)
elasticnet = ElasticNet(alpha=0.01, l1_ratio=0.5, normalize=False)
clf = SGDRegressor(max_iter=1000, tol=1e-3)

# Tree ensembles: gradient boosting with library defaults and a random forest
# with hand-picked settings.
gbm = GradientBoostingRegressor()
rf = RandomForestRegressor(
    n_estimators=800,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=92,
    bootstrap=False,
    random_state=42,
)



In [None]:
# Notebook display: shape of the full training feature table.
housing_train.shape

In [None]:
# Notebook display: shape of the training target table.
y_housing.shape

In [None]:
# Collect the estimators in a labelled Series so they can be fitted and scored
# together with Series.apply.
modelList = [ridge, lasso, elasticnet, clf, gbm, rf]
modelSeries = pd.Series(
    modelList,
    # `gbm` is scikit-learn's GradientBoostingRegressor, not an XGBoost model,
    # so it is labelled accordingly.
    index=['Ridge', 'Lasso', 'Elasticnet', 'SGD', 'Gradient Boosting', 'Random Forest'],
)



In [None]:
# Fit every estimator on the training split. `fit` returns the estimator, so
# the notebook displays the Series of fitted models.
modelSeries.apply(
    lambda estimator: estimator.fit(X_train, y_train)
)

In [None]:
# One row per model: the estimator's own `score` on the training split and on
# the hold-out split, plus the hold-out root-mean-squared error.
ans = pd.DataFrame({
    'train score': modelSeries.apply(lambda model: model.score(X_train, y_train)),
    'test score': modelSeries.apply(lambda model: model.score(X_test, y_test)),
    # mean_squared_error is documented as (y_true, y_pred); the value is the
    # same either way, but the call now matches the API.
    'rmse': modelSeries.apply(
        lambda model: np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    ),
})
ans

In [None]:
###################################################################################################################

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import utils

# Label-encode the hold-out target and print the encoded values.
lab_enc = LabelEncoder()
y_test_encoded = lab_enc.fit_transform(y_test)
print(y_test_encoded)

# Print how scikit-learn classifies the target in three forms: as loaded,
# cast to int, and label-encoded.
target_kind = utils.multiclass.type_of_target
print(target_kind(y_test))
print(target_kind(y_test.astype('int')))
print(target_kind(y_test_encoded))

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from itertools import combinations

# Per-tree diagnostics for the fitted forest `rf` on the hold-out split:
# the average confusion matrix of a single tree against the true target, and
# the average confusion matrix between two different trees.
#
# NOTE(review): confusion_matrix is a classification metric, while `rf` is
# constructed as a RandomForestRegressor. With continuous predictions this
# cell is expected to raise -- confirm the analysis is meant for this model.
y_true = np.ravel(y_test)

# Predict once per tree. The previous version re-ran tree.predict inside the
# pair loop, i.e. on the order of n_estimators**2 predict calls.
tree_preds = [tree.predict(X_test) for tree in rf.estimators_]

# A shared label set gives every confusion matrix the same shape, so the
# matrices can be summed.
labels = np.unique(np.concatenate([y_true, *tree_preds]))

# Sum of truth-vs-tree matrices; confusion_matrix expects (y_true, y_pred),
# so rows are true values and columns are a tree's predictions.
mistakes = 0
for pred in tree_preds:
    mistakes += confusion_matrix(y_true, pred, labels=labels)

# Each unordered pair of distinct trees is counted exactly once, matching the
# n_pairs denominator. The previous loop visited every ordered pair, which
# made the reported average twice as large as intended.
n_pairs = 0.5 * rf.n_estimators * (rf.n_estimators - 1)
confusion_pair = 0
for pred_a, pred_b in combinations(tree_preds, 2):
    confusion_pair += confusion_matrix(pred_a, pred_b, labels=labels)

print("Average per Tree Confusion:")
print(mistakes / rf.n_estimators)
print("Aggregate Pairwise Confusion:")
print(confusion_pair / n_pairs)

In [None]:
# Hold-out RMSE of the random forest `rf`.
# NOTE(review): the `en_` prefix does not match the model being scored; the
# names are kept unchanged in case other cells read them.
en_prediction = rf.predict(X_test)
# mean_squared_error is documented as (y_true, y_pred).
en_mse = mean_squared_error(y_test, en_prediction)
en_rsme = np.sqrt(en_mse)
print('RF RMSE: %.2f' % en_rsme)

## Hyperparameter tuning RF

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Show the current settings of the `rf` estimator as a reference point before tuning.
print('Parameters currently in use:\n')
pprint(rf.get_params())

In [None]:
###### Number of trees in the random forest: 10 evenly spaced values from 200 to 2000
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()

In [None]:
##### Number of features to consider at every split
# 1.0 means "use all features", which is what 'auto' meant for a forest
# regressor. 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
# the float form is accepted by old and new releases alike.
max_features = [1.0, 'sqrt']

In [None]:
#### Maximum number of levels in a tree: 11 evenly spaced depths from 20 to 110,
#### plus None as an extra candidate
max_depth = np.linspace(20, 110, num=11).astype(int).tolist() + [None]

In [None]:
###### Minimum number of samples required to split an internal node
min_samples_split = [2, 5, 10]

In [None]:
#### Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

In [None]:
#### Whether each tree is trained on a bootstrap sample (candidate values for the `bootstrap` option)
bootstrap = [True, False]

In [None]:
#### Assemble the candidate values into the search space, keyed by the
#### RandomForestRegressor argument each list applies to
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)

In [None]:
#### Randomized search over `random_grid`, starting from the existing `rf` estimator:
#### 100 sampled settings, each scored with 3-fold cross-validation, using all
#### available cores; the seed fixes which settings are sampled
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
#### Value error fix for refit model into rf_random

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import utils

# Label-encode the training target and print the encoded values.
# NOTE(review): LabelEncoder replaces each distinct value with its index among
# the sorted distinct values, so the encoded target is no longer on the
# original scale -- confirm before using it as a regression target.
lab_enc = LabelEncoder()
y_train_encoded = lab_enc.fit_transform(y_train)
print(y_train_encoded)

# Print how scikit-learn classifies the target in three forms: as loaded,
# cast to int, and label-encoded.
target_kind = utils.multiclass.type_of_target
print(target_kind(y_train))
print(target_kind(y_train.astype('int')))
print(target_kind(y_train_encoded))

### New Model Fitting

In [None]:
# Run the search against the real training target, flattened to 1-D.
# The label-encoded target replaces every value with its rank index, so a
# search fitted on it tunes the forest for a different quantity than the one
# all the other models in this notebook are fitted and scored on.
rf_random.fit(X_train, np.ravel(y_train))

In [None]:
#### Hyperparameter combination that scored best in the randomized search
rf_random.best_params_

### Ridge hyperparameters

In [None]:
from scipy.stats import uniform as sp_rand
# Show the current settings of the `ridge` estimator as a reference point before tuning.
print('Parameters currently in use:\n')
pprint(ridge.get_params())

In [None]:
# Search Ridge's regularisation strength. `sp_rand()` is scipy's uniform
# distribution with default arguments, i.e. alpha is sampled from [0, 1].
param_grid = {'alpha': sp_rand()}
model = Ridge()
research = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=100,
    # Same seed as the train/test split and the forest search, so the sampled
    # alphas (and therefore the reported best score) are reproducible.
    random_state=42,
)

In [None]:
# Run the randomized alpha search on the training split, then print the search object.
research.fit(X_train, y_train)
print(research)

In [None]:
# Best cross-validated score found by the search, followed by the alpha of the best estimator.
print(research.best_score_)
print(research.best_estimator_.alpha)