In [4]:
from importlib import import_module
from operator import attrgetter

import pandas as pd
from sklearn.impute import SimpleImputer

# Load the raw housing table; the path is relative to the notebook's working directory.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for this path —
# confirm that data/housing.csv exists before running the cells below.
dataset = pd.read_csv('data/housing.csv')
dataset.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/housing.csv'

In [None]:
import numpy as np
np.random.seed(42) # for reproducibility
import os

# to make this notebook's output beautiful
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [None]:
# Number of rows and columns in the raw data.
dataset.shape

In [None]:
dataset.info() # to see if there are any missing values. total_bedrooms has some missing values

In [None]:
set(dataset['ocean_proximity']) # to list the distinct categories

In [None]:
# How many rows fall into each ocean_proximity category.
dataset["ocean_proximity"].value_counts()

In [None]:
# Summary statistics for the numeric columns.
dataset.describe()

In [None]:
# One 50-bin histogram per numeric column.
dataset.hist(bins=50, figsize=(20,15))

## Separate training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Purely random 80/20 split; the fixed random_state keeps it repeatable.
df_train, df_test = train_test_split(
    dataset,
    test_size=0.2,
    random_state=7,
)

In [None]:
print(f"{len(df_train)} train + {len(df_test)} test")

In [None]:
# Distribution of median income, the column the stratified split below is based on.
dataset["median_income"].hist()

In [None]:
# Bucket median income into strata: ceil(income / 1.5), then merge every
# category that is not below 5 into category 5.
dataset["income_cat"] = np.ceil(dataset["median_income"] / 1.5)
# Assign the result back instead of calling .where(..., inplace=True) on the
# selected column: that chained in-place form is deprecated and leaves `dataset`
# untouched when pandas Copy-on-Write is enabled.
dataset["income_cat"] = dataset["income_cat"].where(dataset["income_cat"] < 5, 5.0)
dataset["income_cat"].hist()

In [None]:
# Number of rows in each income category.
dataset["income_cat"].value_counts()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on income_cat so both subsets keep the same income-category proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(dataset, dataset["income_cat"]))
# split() yields positional row numbers, so select with .iloc; label-based .loc
# only gives the same rows while `dataset` has a default 0..n-1 index.
# .copy() makes both subsets independent frames that are safe to modify later.
strat_train_set = dataset.iloc[train_index].copy()
strat_test_set = dataset.iloc[test_index].copy()

In [None]:
# Share of each income category in the stratified test set.
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
# analysing proportions
# NOTE(review): this repeats the previous cell's expression unchanged.
strat_test_set["income_cat"].value_counts() / len(strat_test_set) 

In [None]:
# Same proportions for the stratified training set, for comparison.
strat_train_set["income_cat"].value_counts() / len(strat_train_set)

In [None]:
# removing auxiliary variable income_cat
# Rebind each frame to a new one rather than looping with a variable called `set`:
# that name shadowed the built-in set() used earlier in this notebook, and the
# in-place drop mutated frames that were sliced out of `dataset`.
# errors="ignore" lets the cell be re-run once the column is already gone.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

## Analyzing geographic data

In [None]:
housing = strat_train_set.copy()

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

### Analysing housing prices

In [None]:
# Marker size scales with population; marker colour follows median_house_value.
housing.plot.scatter(
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    sharex=False,
)
plt.legend()

### Looking for correlations

In [None]:

# Leave the text column out before computing correlations.
# errors="ignore" keeps this cell re-runnable: `housing` is rebound to its own
# result, so on a second run the column is already gone.
housing = housing.drop(columns="ocean_proximity", errors="ignore")
# Preview a few rows instead of dumping the whole frame.
housing.head()


In [None]:
# Pairwise correlation between the remaining columns of `housing`.
corr_matrix = housing.corr()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for the target and three candidate predictors.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing.loc[:, attributes], figsize=(12, 8))

In [None]:
housing.plot.scatter(
    x="median_income",
    y="median_house_value",
    alpha=0.1,
)
# Fix the visible window: x from 0 to 16, y from 0 to 550000.
plt.axis([0, 16, 0, 550000])

## Preparing the data

In [None]:
# Features (our X): the training set without the target column.
housing = strat_train_set.drop(columns="median_house_value")
# Target (our y): stored as an independent copy.
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows

In [None]:
housing.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

In [None]:
housing_num = housing.drop('ocean_proximity', axis=1)

In [None]:
imputer.fit(housing_num) # computes the median of each attribute and stores the result in the statistics_ attribute

In [None]:
# The values learned by fit().
imputer.statistics_

In [None]:
X = imputer.transform(housing_num) # the result is an array.

In [None]:
X

In [None]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing.index)

In [None]:
housing_tr

In [None]:
# verificando os resultados
housing_tr.loc[sample_incomplete_rows.index.values]

In [None]:
imputer.strategy

In [None]:
# housing_tr was already built from X a few cells above with the same index and
# columns; rebuilding it here was a copy-paste duplicate, so only preview it.
housing_tr.head()

## Pre-processing categories

In [None]:
# Selecting with a list keeps the result a one-column DataFrame.
housing_cat = housing.loc[:, ['ocean_proximity']]
housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
cat_encoder.categories_

In [None]:
# The feature columns that the numeric pipeline below is fitted on.
housing_num

## Creating the preprocessing pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")), # to fill in missing values
    ('std_scaler', StandardScaler()), # to normalize the data
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
housing_num_tr

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column groups handled by each branch of the transformer.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric columns go through num_pipeline; the text column is one-hot encoded.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# Output of the full preprocessing step.
housing_prepared

In [None]:
# Check the container type returned by fit_transform.
type(housing_prepared)

In [None]:
# Build the column labels from the fitted transformer instead of hard-coding them:
# the numeric block comes first (num_attribs), followed by one column per category
# in the order the one-hot encoder actually learned.
cat_categories = full_pipeline.named_transformers_["cat"].categories_[0].tolist()
column_names = num_attribs + cat_categories

# Turn the array into a DataFrame, keeping the original row labels so it stays
# aligned with `housing` and `housing_labels`.
housing_df = pd.DataFrame(data=housing_prepared, columns=column_names, index=housing.index)

# Show the shape of the resulting DataFrame
print(housing_df.shape)

In [None]:
# Preview of the prepared features with their column labels.
housing_df.head()