# Setup

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Apply the label and tick font sizes to every figure in one call.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures. The root is relative to the notebook's working
# directory so the notebook is not tied to one user's home directory.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name of the image, without extension.
    tight_layout : bool, default True
        When true, call ``plt.tight_layout()`` before saving.
    fig_extension : str, default "png"
        File extension; also passed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    # plt.savefig does not create missing directories, so make sure the
    # target folder exists (isdir check instead of exist_ok keeps Python 2 support).
    if not os.path.isdir(IMAGES_PATH):
        os.makedirs(IMAGES_PATH)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

ModuleNotFoundError: No module named 'matplotlib'

# Load the data

In [2]:
# write here a function to load the data

# Exploring the data

In [3]:
# Call the function to load the data

# Print the first ten rows from the dataset

In [4]:
# Call the function info from the data frame housing

In [5]:
# Call the value_counts method on the ocean_proximity attribute

In [6]:
# Call the function describe from the data frame housing

In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
# Plot histograms for each attribute here
plt.show()

In [8]:
# to make this notebook's output identical at every run
np.random.seed(42)

In [9]:
from sklearn.model_selection import train_test_split

# Divide train and test here

In [10]:
# Print the size of the train and test

In [11]:
# Call the function head from the test data frame

In [12]:
# Plot the histogram from the median_income attribute

In [13]:
# Divide by 1.5 to limit the number of income categories

# Label those above 5 as 5

In [14]:
# Call the function value_counts for income_cat attribute


In [15]:
# Plot the hist from the income_cat


In [16]:
from sklearn.model_selection import StratifiedShuffleSplit

# Divide the data in train and test using stratification by column income_cat

In [17]:
# Print the percentage of samples in each income_cat category in the test data


In [18]:
# Print the percentage of samples in each income_cat category in the housing data frame

# Discover and visualize the data to gain insights

In [19]:
# Create a new df called housing from the stratified training set (strat_train_set)

In [20]:
# Plot a scatter chart using longitude and latitude attributes

In [21]:
# Plot a scatter chart using longitude and latitude attributes and alpha channel

In [22]:
# Plot a california.png image below
# ![title](california.png)

In [23]:
# Looking for Correlations

In [24]:
# ![title](correlation.png)

In [25]:
# Get a correlation matrix from the housing df

In [26]:
# Print the median_house_value column on the correlation matrix

In [27]:
# from pandas.plotting import scatter_matrix

# attributes = ["median_house_value", "median_income", "total_rooms",
#               "housing_median_age"]
# scatter_matrix(housing[attributes], figsize=(12, 8))


# Experimenting with Attribute Combinations

In [28]:
# Create three attributes called rooms_per_household, bedrooms_per_room and population_per_household


In [29]:
# And now let’s look at the correlation matrix again


In [30]:
# Call the function describe again for the housing df

# Prepare the Data for Machine Learning Algorithms

In [31]:
# Create a housing df from the stratified training set (strat_train_set) and remove the median_house_value column

# Create housing_labels from the median_house_value column of the stratified training set (strat_train_set)


# Data Cleaning

In [32]:
# Get the first ten incomplete rows (rows that contain null values)

# Print the rows

### Option 1

In [33]:
# Drop all nan rows

### Option 2

In [34]:
# Drop the column that contains nan values

### Option 3

In [35]:
# Use the median to replace nan values

# Using the Imputer Class from the Sklearn

In [36]:
# `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22 in favour of
# `sklearn.impute.SimpleImputer`. Prefer the new class, but keep the name
# `Imputer` so the following cells work on either version.
try:
    from sklearn.impute import SimpleImputer as Imputer  # scikit-learn >= 0.20
except ImportError:
    from sklearn.preprocessing import Imputer  # scikit-learn < 0.20

# Create an imputer using the median strategy

In [37]:
# Get the housing data frame and remove the ocean proximity

In [38]:
# Apply the fit method on the housing data frame

In [39]:
# Print the statistics_ attribute from the imputer

In [40]:
# Print the median values from the housing data frame 

In [41]:
# Create a training set applying the transformation on the housing data frame

In [42]:
#housing_tr = pd.DataFrame(X, columns=housing_num.columns, index = list(housing.index.values))
#housing_tr.loc[sample_incomplete_rows.index.values]

# Preprocess the categorical input feature

In [43]:
# Get the attribute ocean_proximity

# Call the function head

In [44]:
# NOTE: OrdinalEncoder is only available in scikit-learn >= 0.20. On older
# versions this import raises ImportError and scikit-learn must be upgraded.
from sklearn.preprocessing import OrdinalEncoder

ImportError: cannot import name 'OrdinalEncoder'

In [45]:
# Create an OrdinalEncoder and apply it to the housing_cat data frame
# Print the first ten rows

In [46]:
# Print the categories_ attribute

In [46]:
from sklearn.preprocessing import OneHotEncoder

In [47]:
# Create a OneHotEncoder using sparse=False and apply it to the housing_cat data frame
# Print the encoded result

In [48]:
# Print the categories_ attribute

# custom transformer

In [49]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional column indices that CombinedAttributesAdder.transform uses to slice X
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Rooms-per-household and population-per-household are always appended;
    when ``add_bedrooms_per_room`` is true, bedrooms-per-room is appended last.
    Source columns are located through the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        columns = [X, rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_[a, b, c] receives the tuple (a, b, c) as its index, so passing
        # a tuple built from the list is the same call.
        return np.c_[tuple(columns)]

#attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
#housing_extra_attribs = attr_adder.transform(housing.values)

## Let's build a pipeline for preprocessing the numerical and categorical attributes

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [51]:
# Write a numerical pipeline

In [52]:
# Write a categorical pipeline and merge it with the numerical pipeline

In [53]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
class OldDataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that indexes ``X`` by ``attribute_names`` and returns ``.values``."""

    def __init__(self, attribute_names):
        # Stored under the same name as the constructor parameter.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """No fitting is required; return the transformer itself."""
        return self

    def transform(self, X):
        """Select the configured names from ``X`` and return the selection's ``.values``."""
        selection = X[self.attribute_names]
        return selection.values

In [54]:
# Join all these components into a big pipeline that will preprocess both the numerical and the categorical features