In [10]:
import random
from random import sample, seed

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import load_boston
from sklearn.datasets import make_classification
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer 
from sklearn.decomposition import PCA

%matplotlib inline
%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=5, suppress=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the Boston housing data and derive the objects the later cells use.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the environment pins an older release before re-running.
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target  # response appended as the last column

# (A stray bare `dataset` expression used to sit here; mid-cell expressions are
# never displayed, so it was dead code and has been dropped.)
observations = len(dataset)        # number of rows
variables = dataset.columns[:-1]   # every column except the trailing 'target'
X = dataset.iloc[:, :-1]           # predictors: all columns but the last
y = dataset['target'].values       # response as a NumPy array
labels = boston.feature_names

In [6]:
# Two-way split: 30% of the rows are held out as the test set (seed 101).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)
print("Train dataset sample size: {}".format(len(X_train)))
print("Test dataset sample size: {}".format(len(X_test)))

Train dataset sample size: 354
Test dataset sample size: 152


In [8]:
# Three-way split: 40% of the rows are first held out, then that held-out
# portion is halved into validation and test sets. Both calls use seed 101.
X_train, X_out_sample, y_train, y_out_sample = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=101,
)
X_validation, X_test, y_validation, y_test = train_test_split(
    X_out_sample,
    y_out_sample,
    test_size=0.50,
    random_state=101,
)
print("Train dataset sample size: {}".format(len(X_train)))
print("Validation dataset sample size: {}".format(len(X_validation)))
print("Test dataset sample size: {}".format(len(X_test)))

Train dataset sample size: 303
Validation dataset sample size: 101
Test dataset sample size: 102


In [9]:
def RMSE(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must broadcast against ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.

    Notes
    -----
    The previous body returned ``sum((y_true - y_pred) ** 2)``, i.e. the sum
    of squared errors, which grows with sample size and is not in the units
    of the target. Lower is still better, so the ordering of models scored on
    the same data is unchanged.
    """
    # asarray lets plain lists/tuples be passed as well as ndarrays.
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))

In [16]:
# Ordinary least-squares estimator and a shuffled, seeded 10-fold splitter.
lm = LinearRegression()
cv_iterator = KFold(10, random_state=101, shuffle=True)
cv_iterator

KFold(n_splits=10, random_state=101, shuffle=True)

In [17]:
# Boundaries of 5 equal-width bins spanning the range of the target `y`
# (the same edges np.histogram would report, without computing the counts).
edges = np.histogram_bin_edges(y, bins=5)
edges

array([ 5., 14., 23., 32., 41., 50.])

In [18]:
# Assign each target value the index of the bin it falls in.
# With right=False a value equal to the last edge lands one past the final
# bin, so the maximum target gets label 6 even though only 5 bins were cut.
binning = np.digitize(y, bins=edges, right=False)
binning

array([3, 2, 4, 4, 4, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 3, 3, 4, 3, 3, 3,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 4, 3, 3, 3, 2, 2, 2, 2, 3, 4, 3,
       2, 2, 2, 2, 3, 2, 2, 3, 3, 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 2,
       3, 3, 2, 2, 2, 3, 2, 3, 2, 4, 5, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       3, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 5, 3, 3, 3, 6, 6, 6, 2, 3, 6, 3, 3, 2, 2, 2, 3, 3, 2, 3,
       3, 3, 3, 4, 4, 4, 4, 4, 3, 3, 6, 4, 3, 4, 4, 3, 4, 3, 3, 6, 4, 3,
       4, 4, 4, 3, 5, 5, 6, 2, 3, 2, 3, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 3,
       3, 2, 3, 3, 5, 6, 4, 3, 5, 3, 3, 3, 5, 5, 3, 3, 3, 3, 3, 3, 2, 2,
       2, 3, 2, 2, 3, 2, 3, 3, 3, 3, 3, 5, 2, 2, 5, 6, 4, 3, 4, 5, 5, 3,
       4, 2, 3, 6, 5, 2, 2, 3, 3, 4, 4, 4, 4, 4, 3, 4, 5, 4, 5, 6, 4, 2,
       2, 3, 2, 3, 3, 4, 3, 3, 2, 3, 3, 2, 2, 3, 3,

In [21]:
# Shuffled, seeded 10-fold splitter that stratifies on class labels
# (presumably the `binning` labels computed earlier -- verify at the call site).
stratified_cv_iterator = StratifiedKFold(10, random_state=101, shuffle=True)

In [22]:
# Echo the splitter so its configuration shows in the cell output.
stratified_cv_iterator

StratifiedKFold(n_splits=10, random_state=101, shuffle=True)

In [23]:
# Degree-2 polynomial expansion; interaction_only=False, so it is not limited
# to interaction terms.
second_order = PolynomialFeatures(degree=2, interaction_only=False)
# Degree-3 expansion restricted to interaction terms (interaction_only=True).
third_order = PolynomialFeatures(degree=3, interaction_only=True)