## End-to-end Machine Learning project

Notebook adapted from and inspired by: https://github.com/ageron/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb

In [None]:
# some imports
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.22 is required: the cross-validation cells below use
# scoring="neg_root_mean_squared_error", which older releases do not provide.
import re
import sklearn
# Compare (major, minor) as integers. Comparing the raw version strings is
# lexicographic and would, for example, rank "0.3" above "0.20".
_sklearn_major_minor = tuple(
    int(number) for number in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_major_minor >= (0, 22)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.rc('font', size=12) 
plt.rc('figure', figsize = (12, 5))

# Settings for the visualizations
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2,'font.family': [u'times']})

import pandas as pd
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# create output folder
# makedirs creates the intermediate 'output' directory as well, and
# exist_ok=True makes the cell safe to re-run (no check-then-create race).
os.makedirs('output/session1', exist_ok=True)

In [None]:
# to make this notebook's output identical at every run
np.random.seed(42)

## Get the data

In [None]:
housing = pd.read_csv('dataset/housing-snapshot/train_set.csv',index_col=0)

In [None]:
housing.shape

In [None]:
housing.head(10)

In [None]:
housing.info()

In [None]:
housing["Type"].value_counts()

In [None]:
housing.describe()

In [None]:
## Postcode is a label, not a quantity: store it as a categorical column
## so that describe() and the numeric selections below leave it out.
housing['Postcode'] = housing['Postcode'].astype('category')
housing.describe()

In [None]:
housing.hist(bins=10, figsize=(20,15))
plt.show()

## Divide into train - validation

There are several ways to divide your training data. Sometimes it is important to create a stratified sampling.

In [None]:
# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    """Randomly partition the rows of `data` into a train and a test part.

    The rows are shuffled with NumPy's global random generator, so the
    result is only reproducible if the seed was set beforehand.

    Parameters
    ----------
    data : DataFrame to split (rows are selected by position).
    test_ratio : fraction of the rows assigned to the test part; the test
        size is int(len(data) * test_ratio).

    Returns
    -------
    (train, test) : tuple of DataFrames.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    # The first n_test shuffled positions form the test part, the rest the train part.
    test_part = data.iloc[order[:n_test]]
    train_part = data.iloc[order[n_test:]]
    return train_part, test_part

In [None]:
train_set, test_set = split_train_test(housing, 0.2)
len(train_set),len(test_set)

In [None]:
## Create a function that divides the data with an id
## checks that id is not train and test set
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split `data` into (train, test) using a checksum of each row's id.

    Every value of `data[id_column]` is passed to test_set_check; rows for
    which it returns True go to the test part, all others to the train part.
    """
    test_mask = data[id_column].apply(test_set_check, args=(test_ratio,))
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
housing_with_id = housing.reset_index()   # adds an `index` column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

## create an id based on latitude and longitude
# Build the id from housing_with_id's own columns. `housing` keeps the index read
# from the CSV while housing_with_id has a fresh 0..n-1 index, and pandas aligns
# on index labels when assigning, so mixing the two frames can misalign rows
# and leave NaN ids.
housing_with_id["id"] = housing_with_id["Longtitude"] * 1000 + housing_with_id["Lattitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

In [None]:
test_set.head()

In [None]:
## divide using the scikit learn function
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
housing["Price"].hist()

In [None]:
housing["price_cat"] = pd.cut(housing["Price"],
                               bins=[0., 500000, 1000000, 1500000, 2000000., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [None]:
housing["price_cat"].value_counts()

In [None]:
housing["price_cat"].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 partition on price_cat, reproducible via random_state.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["price_cat"]):
    # split() yields integer *positions*, so the rows must be selected with .iloc.
    # .loc would treat the numbers as index labels, and `housing` is indexed by
    # the first CSV column (index_col=0), which is not guaranteed to be 0..n-1.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
strat_test_set["price_cat"].value_counts() / len(strat_test_set)


In [None]:
housing["price_cat"].value_counts() / len(housing)

In [None]:
def price_cat_proportions(data):
    """Return the share of rows of `data` that fall in each `price_cat` value."""
    counts = data["price_cat"].value_counts()
    total_rows = len(data)
    return counts / total_rows

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

compare_props = pd.DataFrame({
    "Overall": price_cat_proportions(housing),
    "Stratified": price_cat_proportions(strat_test_set),
    "Random": price_cat_proportions(test_set),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100

In [None]:
## check the proportion of each category with the different approaches
compare_props

In [None]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("price_cat", axis=1, inplace=True)

## Discover and visualize the data to gain insights

In [None]:
housing = strat_train_set.copy()

In [None]:
sns.distplot(train_set["Price"])
plt.show()

In [None]:
# We can check how many different type there is in the dataset using the folliwing line
train_set["Type"].value_counts()

In [None]:
sns.countplot(y="Type", data=train_set, color="c")

In [None]:
housing.plot(kind="scatter", x="Longtitude", y="Lattitude")
plt.show()

In [None]:
housing.plot(kind="scatter", x="Longtitude", y="Lattitude", alpha=0.1)
plt.show()

In [None]:
housing.plot(kind="scatter", x="Longtitude", y="Lattitude", alpha=0.3,
             figsize=(25,10),
             c="Price", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)
plt.legend()

In [None]:
housing.plot(kind="scatter", x="Longtitude", y="Lattitude", alpha=0.3,
             s=housing["Landsize"]/25, label="Landsize", 
             figsize=(25,10),
             c="Price", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)
plt.legend()

In [None]:
housing.plot(kind="scatter", x="Longtitude", y="Lattitude", alpha=0.3,
             s=housing["BuildingArea"]/5, label="BuildingArea", 
             figsize=(25,10),
             c="Price", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)
plt.legend()

In [None]:
# Marker size encodes Distance (squared), colour encodes Price.
# The legend label names the column that drives the marker size.
housing.plot(kind="scatter", x="Longtitude", y="Lattitude", alpha=0.3,
             s=housing["Distance"]**2, label="Distance", 
             figsize=(25,10),
             c="Price", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)
plt.legend()

In [None]:
# Correlations are only defined for numeric columns. Select them explicitly:
# recent pandas versions raise on text columns instead of skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
corr_matrix["Price"].sort_values(ascending=False)

In [None]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

attributes = ["Price", "Rooms", "Bedroom2","Bathroom",
              "YearBuilt"]
scatter_matrix(housing[attributes], figsize=(16, 8))
plt.show()

In [None]:
housing.plot(kind="scatter", x="Price", y="Rooms",
             alpha=0.1)
plt.show()

In [None]:
sns.scatterplot(data=housing, x="Rooms", y="Price")
plt.show()

In [None]:
sns.regplot(data=housing, x="Rooms", y="Price")
plt.show()

In [None]:
## Plot landsize vs Price

In [None]:
sns.regplot(data=housing, x="Landsize", y="Price")
plt.show()

In [None]:
sns.regplot(data=housing, x="Landsize", y="Price")
plt.axis([-10, 1000, 0, 5e6])
plt.show()

In [None]:
## mmm there is a lot of 0 at Landsize.... 

In [None]:
## Let's see BuildingArea

In [None]:
sns.regplot(data=housing, x="BuildingArea", y="Price")
plt.show()

In [None]:
sns.regplot(data=housing, x="BuildingArea", y="Price",)
plt.axis([-10, 1000, 0, 5e6])
plt.show()

In [None]:
## Now there are no 0s but NaNs. NaNs are neither displayed nor taken into account when estimating the fit

In [None]:
housing[['Price','Landsize','BuildingArea']].head(10)

In [None]:
housing.describe()

## Prepare the data for Machine Learning algorithms

In [None]:
housing = strat_train_set.drop("Price", axis=1) # drop labels for training set
housing_labels = strat_train_set["Price"].copy()

### Data Cleaning 

What to do with data with missing values
* Option 1: Remove rows
* Option 2: Remove columns
* Option 3: Impute missing values

In [None]:
sample_incomplete_rows = housing[housing.isnull().any(axis=1)]
print(housing.shape[0],sample_incomplete_rows.shape[0])
sample_incomplete_rows.head()

In [None]:
# # option 1 : remove those rows with nans 
cols_wiht_nans = housing.columns[housing.isna().any()].tolist()
housing.dropna()    

In [None]:
# only from a subset of columns
cols_wiht_nans = housing.columns[housing.isna().any()].tolist()
housing.dropna(subset=["BuildingArea"])    

In [None]:
## option 2 : remove those columns
cols_wiht_nans = housing.columns[housing.isna().any()].tolist()
housing.drop(cols_wiht_nans, axis=1)       # option 2


In [None]:
## option 3 : fill the missing values with a statistic (here the median)
median = housing["BuildingArea"].median()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the column selection: that form is chained assignment, which recent pandas
# versions warn about and which may leave `housing` unchanged.
housing["BuildingArea"] = housing["BuildingArea"].fillna(median)
housing.head()

In [None]:
# Same median fill for YearBuilt. The result is assigned back to the column
# because fillna(..., inplace=True) on a column selection is chained assignment
# and may not update `housing` on recent pandas versions.
median = housing["YearBuilt"].median()
housing["YearBuilt"] = housing["YearBuilt"].fillna(median)
housing.head()

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

Remove the text attribute because median can only be calculated on numerical attributes:

In [None]:
housing_num = housing.select_dtypes(include=[np.number])

In [None]:
imputer.fit(housing_num)
imputer.statistics_

Check that this is the same as manually computing the median of each attribute:

In [None]:
housing_num.median().values

Transform the training set:



In [None]:
X = imputer.transform(housing_num)

In [None]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing.index)

In [None]:
housing_tr.loc[sample_incomplete_rows.index.values]


In [None]:
imputer.strategy


In [None]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)

In [None]:
housing_tr.head()

### Categorical data

Now let's preprocess the categorical input feature, CouncilArea:

In [None]:
housing_cat = housing[["CouncilArea"]]
# There is missing values in this category, we will create a new cateogry for those.
housing_cat.fillna('Unknown').head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat.fillna('Unknown'))
housing_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder(handle_unknown='ignore')
housing_cat_1hot = cat_encoder.fit_transform(housing_cat.fillna('Unknown'))
housing_cat_1hot

In [None]:
housing_cat_1hot.toarray()

Alternatively, you can set `sparse=False` (named `sparse_output=False` since scikit-learn 1.2) when creating the OneHotEncoder to get a dense array directly:

In [None]:
# Ask for a dense array instead of a sparse matrix. The keyword was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was later
# removed, so try the new spelling first and fall back for older releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')
housing_cat_1hot = cat_encoder.fit_transform(housing_cat.fillna('Unknown'))
housing_cat_1hot

In [None]:
cat_encoder.categories_

Let's create a custom transformer to add extra attributes:


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions read by CombinedAttributesAdder.transform.
# NOTE(review): these positions presumably follow the column order of
# `housing_num` — confirm against housing_num.columns before reusing.
Rooms_ix, Bedroom2_ix, Bathroom_ix, BuildingArea_ix = 0, 2, 3, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio features as extra columns.

    Added columns, in order:
      * rooms_per_building_area = X[:, Rooms_ix] / (1 + X[:, BuildingArea_ix])
      * bedrooms_per_room       = X[:, Bedroom2_ix] / (1 + X[:, Bathroom_ix])
        (only when add_bedrooms_per_room is True)

    NOTE(review): the second feature is named "bedrooms_per_room" but its
    denominator is the Bathroom_ix column, not Rooms_ix — confirm which of
    the two (name or computation) is intended.
    """
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        # Stored under the same name as the parameter so BaseEstimator's
        # get_params/set_params can find it.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self so the step chains."""
        return self  # nothing else to do

    def transform(self, X):
        """Return X with the ratio columns appended on the right.

        X may be a 2-D array or a DataFrame; it is converted to an array so
        that the positional column slicing below works in both cases.
        """
        X = np.asarray(X)
        rooms_per_building_area = X[:, Rooms_ix] / (1.0 + X[:, BuildingArea_ix])  # add 1 to avoid 0 division
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, Bedroom2_ix] / (1.0 + X[:, Bathroom_ix])  # add 1 to avoid 0 division
            return np.c_[X, rooms_per_building_area, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_building_area]
        

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)
housing_extra_attribs = attr_adder.transform(housing_num.values)

In [None]:
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing_num.columns)+["rooms_per_building_area", "bedrooms_per_room"],
    index=housing.index)
housing_extra_attribs.head()

Now let's build a pipeline for preprocessing the numerical attributes:

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer


## create a function to replace 0 by NaN
def replace_0_2_NaN(data):
    """Return a copy of `data` in which every 0 is replaced by NaN.

    The input is never modified, so the caller's DataFrame or array keeps
    its zeros. Integer inputs are converted to float, since NaN cannot be
    stored in an integer array.

    Parameters
    ----------
    data : pandas DataFrame/Series or array-like of numbers.

    Returns
    -------
    Same kind of container for pandas input, otherwise a float ndarray.
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        # mask() returns a new object with NaN wherever the condition holds.
        return data.mask(data == 0)
    values = np.array(data, dtype=float)  # np.array copies, so `data` stays intact
    values[values == 0] = np.nan
    return values


num0_pipeline = Pipeline([
        ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
        ('imputer', SimpleImputer(strategy="median")),
        ('log',FunctionTransformer(np.log1p, validate=True)),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num0_pipeline.fit_transform(housing[['BuildingArea','Landsize']])
plt.hist(housing_num_tr,bins=20)
plt.show()

In [None]:
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
housing_num_tr

In [None]:
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs0 = ['Landsize','BuildingArea']
num_attribs1 = list(housing_num)
cat_attribs = ["CouncilArea",'Type','Suburb','Postcode']


full_pipeline = ColumnTransformer([
        ("num0", num0_pipeline, num_attribs0),
        ("num1", num_pipeline, num_attribs1),
        ("cat", cat_pipeline, cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

housing_prepared

## Select and train a model


In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
# let's try the full preprocessing pipeline on a few training instances
some_data = housing.iloc[:25]
some_labels = housing_labels.iloc[:25]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Compare against the actual values:

In [None]:
print("Labels:", list(some_labels))


In [None]:
plt.scatter(lin_reg.predict(some_data_prepared),list(some_labels))

In [None]:
some_data_prepared


In [None]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

In [None]:
## K-Nearest Neighbour
from sklearn import neighbors

n_neighbors = 3
knn_reg = neighbors.KNeighborsRegressor(n_neighbors)
knn_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = knn_reg.predict(housing_prepared)
knn_mse = mean_squared_error(housing_labels, housing_predictions)
knn_rmse = np.sqrt(knn_mse)
knn_rmse


In [None]:
## Decision Tree
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

## Fine-tune your model

Find the best parameters for your model. In order to do so, we will use the 10-fold cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_root_mean_squared_error", cv=10)


In [None]:
def display_scores(scores,model_name = None):
    """Print the mean and standard deviation of an array of scores.

    When `model_name` is given (and truthy), a header line with that name
    is printed first.
    """
    if model_name:
        print("----",model_name,"----")
    mean_value, spread = scores.mean(), scores.std()
    print("Mean:", mean_value)
    print("Standard deviation:", spread)

display_scores(-scores,'Decision Tree')

In [None]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_root_mean_squared_error", cv=10)
display_scores(-lin_scores,'Linear Regression')

In [None]:
knn_scores = cross_val_score(knn_reg, housing_prepared, housing_labels,
                             scoring="neg_root_mean_squared_error", cv=10)
display_scores(-knn_scores,'Knn Regression')

In [None]:
## Let's try another model: Random Forest
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=20, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_root_mean_squared_error", cv=10)
display_scores(-forest_scores, 'Random Forest')

In [None]:
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_root_mean_squared_error", cv=10)
results = pd.Series(-scores)

In [None]:
results.describe()

In [None]:
plt.boxplot(results)
plt.show()

In [None]:
from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

# Comparamos todos los modelos

In [None]:
## let's compare all of them
# Each entry is (fitted estimator, display name). Every model is scored with
# the same 10-fold cross-validated RMSE so the numbers are comparable.
models = [
    (lin_reg, "lin_reg"),
    (tree_reg, "Decision Tree"),
    (knn_reg, "KNN-Regressor"),
    (forest_reg, 'Random Forest'),
    (svm_reg, 'SVM Regressor'),
]
for estimator, label in models:
    scores = cross_val_score(estimator, housing_prepared, housing_labels,
                             scoring="neg_root_mean_squared_error", cv=10)
    # The scorer returns negated RMSE values; flip the sign before reporting.
    display_scores(-scores, label)

In [None]:
## Random forest looks to be the best model, but this model has several parameters. Let's find the best parameters

# Encontrar los mejores parametros de RandomForest
- Falta hacer para los demás modelos

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameters worth tuning for each model family:
#   K-NN Regressor (n_neighbors)
#   Decision Trees (max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes)
#   SVM Regressor (kernel, degree, gamma, C)
#   Random Forest (n_estimators, max_features, max_depth, min_samples_split, min_samples_leaf, bootstrap)
# Only the Random Forest is searched in this cell.

param_grid = [
    # first grid: 6 x 6 x 7 = 252 combinations of hyperparameters
    {'n_estimators': [3, 10, 30,50, 200, 205], 'max_features': [2, 4, 6, 8, 10, 12], 'max_depth':[3,5,7,10,300,400,500]},
    # second grid: 1 x 2 x 3 = 6 combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds: (252+6)*5 = 1290 rounds of training, so this cell is slow;
# n_jobs=-1 spreads the fits over all available cores.
# The scorer is a negated RMSE, so best_score_ and the mean_*_score values are negative.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_root_mean_squared_error',
                           return_train_score=True, n_jobs = -1)
grid_search.fit(housing_prepared, housing_labels)

In [None]:
print("the best parameters are:")
print(grid_search.best_params_)

In [None]:
print("Best Score")
# The search was scored with 'neg_root_mean_squared_error', so best_score_ is
# already a (negated) RMSE: only the sign has to be flipped. Taking a square
# root here would report sqrt(RMSE), not the RMSE.
-grid_search.best_score_

In [None]:
print("the best trained model:")
grid_search.best_estimator_

Let's look at the score of each hyperparameter combination tested during the grid search:

In [None]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(-mean_score, params)

In [None]:
pd.DataFrame(grid_search.cv_results_)


In [None]:
grid_search = GridSearchCV(RandomForestRegressor(random_state=42,n_estimators=200,max_depth=400),
                  param_grid={'max_features': range(2, 50, 2)},
                  scoring='neg_root_mean_squared_error', return_train_score=True,n_jobs = -1)
grid_search.fit(housing_prepared, housing_labels)
results = grid_search.cv_results_

In [None]:
#plot the results
plt.figure(figsize=(13, 13))
plt.title("GridSearchCV",
          fontsize=16)

plt.xlabel("max_features")
plt.ylabel("Score")

ax = plt.gca()
ax.set_xlim(0, 50)


# Get the regular numpy array from the MaskedArray
X_axis = np.array(results['param_max_features'].data, dtype=float)


for sample, style in (('train', '--'), ('test', '-')):
    sample_score_mean = (-results['mean_%s_score' % (sample)])
    sample_score_std = (results['std_%s_score' % (sample)])
    ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                    sample_score_mean + sample_score_std,
                    alpha=0.1 if sample == 'test' else 0)
    ax.plot(X_axis, sample_score_mean, style,
            alpha=1 if sample == 'test' else 0.7,
            label="(%s)" % ( sample))

best_index = np.nonzero(results['rank_test_score' ] == 1)[0][0]
best_score =  (-results['mean_test_score' ][best_index])

# Plot a dotted vertical line at the best score for that scorer marked by x
ax.plot([X_axis[best_index], ] * 2, [best_score, best_score],
        linestyle='-.',  marker='x', markeredgewidth=3, ms=8)

# Annotate the best score for that scorer
ax.annotate("%0.2f" % best_score,
            (X_axis[best_index], best_score + 0.005))

plt.legend(loc="best")
plt.grid(False)
plt.show()

In [None]:
## Best features according to Random Forest
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
extra_attribs = ["rooms_per_building_area", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
# Name every one-hot column of every categorical feature, not only those of the
# first one: zip() below stops at the shorter sequence, so missing names would
# silently drop the importances of the remaining columns. Each name is prefixed
# with its source column because the same value can occur in several columns.
one_hot_categories = cat_encoder['one_hot_encoder'].categories_
cat_one_hot_attribs = [
    "%s_%s" % (column, category)
    for column, categories in zip(cat_attribs, one_hot_categories)
    for category in categories
]
# Same order as the ColumnTransformer output: num0 (log-transformed), num1 plus
# the two engineered ratios, then the one-hot columns.
attributes = ["log_" + t for t in num_attribs0 ] + num_attribs1 + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

 ## Final Model 
 Create the final model and evaluate it (you should do this only once)

In [None]:
housing_test = pd.read_csv('dataset/housing-snapshot/test_set.csv',index_col=0)

In [None]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("Price", axis=1)
y_test = strat_test_set["Price"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
from sklearn.metrics import r2_score

coefficient_of_dermination = r2_score(y_test, final_predictions)

In [None]:
coefficient_of_dermination

In [None]:
final_rmse

We can compute a 95% confidence interval for the test RMSE:


In [None]:
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))



In [None]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("Price", axis=1)
y_test = strat_test_set["Price"].copy()

# Predict on the held-out file, using the pipeline fitted on the training data.
X_test_prepared = full_pipeline.transform(housing_test)
final_predictions = final_model.predict(X_test_prepared)

# Key each prediction by the index of its row in test_set.csv (read with
# index_col=0). A bare DataFrame(final_predictions) would number the rows
# 0..n-1, which only matches when the file's index is exactly that range.
df_output = pd.DataFrame(final_predictions, index=housing_test.index)
df_output = df_output.reset_index()
df_output.columns = ['index','Price']

df_output.to_csv('baseline.csv',index=False)

# Conclusió:

Crec que el notebook està molt ben treballat, i no he tocat res del preprocessament de les dades, ja que el full pipeline fa tot allò que vam fer pas per pas a la pràctica 1. Però sí que he estat estudiant els diferents algorismes.

Crec que, com hem vist a la pràctica anterior, la linear regression funciona molt bé amb moltes features i el knn-regressor amb poques. Ara entren a l'equació tres models més.

1. SVM juga molt amb hiperplans, i forma com hiperplans linealment separables. Aixo amb un model amb moltes features ho veig molt complicat de millorar.

2. Decision Tree em sembla un molt bon algoritme crea (aixo es important destacar) 1 arbre per fer la regressió de les dades.

3. Random Forest. Aquest, que al notebook ja apareixia com el que dona millor resultat, funciona bé perquè l'algorisme crea de forma aleatòria diferents arbres (Decision Trees) i combina les seves prediccions (en regressió, en fa la mitjana) per obtenir la predicció final. Això permet fer una cerca/classificació més exhaustiva en regions petites, cosa que fa que el model complet millori moltíssim.

Per tant he decidit potenciar random forest fent que tingui més numeros de estimadors i més numero de profunditat que crec que son els més rellevants en l'algorisme.