# DATASET: California housing dataset
Dataset from: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html

In this repo, this dataset is used to train regression models.

NOTE: each feature is independent, but some of the analyses generated to evaluate the model assume that the features are correlated in time.

In [None]:
import os

# Move the working directory to the parent of the current one (treated as the
# project root) so that outputs are saved relative to it.
# os.path.dirname is used instead of splitting on '\\' so this works with both
# Windows and POSIX path separators.
actual_path = os.path.abspath(os.getcwd())
root_path = os.path.dirname(actual_path)
os.chdir(root_path)
print('root path: ', root_path)

## Generate the data: basic code to clean the data and split it into train/test, producing data ready for modeling

In [None]:
from sklearn.datasets import fetch_california_housing
import json
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

### 1. load data

In [None]:
# fetch the features as a DataFrame and the target as a Series
data_X, data_y = fetch_california_housing(as_frame=True, return_X_y=True)

# join features and target into a single new frame; the target column is named 'Price'
data = data_X.assign(Price=data_y)
data.head()

### 2. Drop duplicates

In [None]:
# (rows, columns) before dropping duplicates
data.shape

In [None]:
# Drop duplicated rows - some kinds of plotly plots return an error with duplicated values
data = data.drop_duplicates()
# (rows, columns) after dropping duplicates
data.shape

### 3. Histograms

In [None]:
# one histogram per column of `data`, each drawn in its own figure
for column_name in data.columns:
    plt.figure()
    plt.hist(data[column_name])
    plt.title(f'{column_name}')

### 4. drop outliers

In [None]:
#!pip install ydata_profiling
from ydata_profiling import ProfileReport

In [None]:
# summary statistics with extra percentiles ranging from 1% to 99%
percentile_levels = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99]
data.describe(percentiles=percentile_levels)

In [None]:
# Build a ydata-profiling report of `data` in minimal mode and display it
# NOTE(review): the title ends in ".html", which looks like a file name - confirm it is intended
profile = ProfileReport(data, title="Profiling Report.html", minimal = True)
profile

In [None]:
### DELETE DATA BELOW PERCENTILE 1 AND ABOVE PERCENTILE 99 - ALL FEATURES

# Per-column 1st and 99th percentiles (one threshold per column)
percentil_lower = data.quantile(0.01)
percentil_upper = data.quantile(0.99)

# Keep only the rows in which EVERY column lies inside its own [p1, p99] range.
# Selecting rows with a boolean mask (instead of masking cells to NaN and then
# calling dropna in place) keeps the original dtypes and never mutates a
# derived frame; .copy() makes data_filtered independent of `data`.
rows_in_range = ((data >= percentil_lower) & (data <= percentil_upper)).all(axis=1)
data_filtered = data[rows_in_range].copy()

In [None]:
# display the filtered frame
data_filtered

In [None]:
# summary statistics of the filtered frame
data_filtered.describe()

In [None]:
# (rows, columns) remaining after the percentile filter
data_filtered.shape

In [None]:
#### Review profiling - GOAL: DELETE EXTREME VALUES IN SKEWED FEATURES

In [None]:
# one histogram per column of `data_filtered`, each drawn in its own figure
for column_name in data_filtered.columns:
    plt.figure()
    plt.hist(data_filtered[column_name])
    plt.title(f'{column_name}')

In [None]:
# profile_filtered = ProfileReport(data, title="Profiling Report.html")
# profile_filtered

### 5. Save data filtered as data_raw

In [None]:
# Persist an independent copy of the filtered dataset as `data_raw`.
# The output folder is created first because to_pickle does not create
# missing directories and would raise if 'artifacts/data' is absent.
os.makedirs('artifacts/data', exist_ok=True)
data_raw = data_filtered.copy()
data_raw.to_pickle('artifacts/data/data_raw.pkl')

### 6. Split into train-test

In [None]:
# features - target
target = 'Price'
# Every column except the target, kept in the DataFrame's own column order.
# A set difference would return the names in an order that can change between
# interpreter sessions (string hashing is randomized), which would make the
# column order of the saved train/test frames non-reproducible.
list_features = [column for column in data_raw.columns if column != target]
list_features

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# The target is selected with a list ([[target]]) so y_train / y_test are
# single-column DataFrames rather than Series.
X_train, X_test, y_train, y_test = train_test_split(
    data_raw[list_features],
    data_raw[[target]],
    random_state=42,
    test_size=0.2,
)

In [None]:
# first 3 rows of the training features
X_train.head(3)

In [None]:
# first 3 rows of the training target
y_train.head(3)

### 7. Save train test

In [None]:
# Persist the four train/test frames as pickles.
# The output folder is created first because to_pickle does not create
# missing directories and would raise if 'artifacts/data' is absent.
os.makedirs('artifacts/data', exist_ok=True)
X_train.to_pickle('artifacts/data/X_train.pkl')
X_test.to_pickle('artifacts/data/X_test.pkl')
y_train.to_pickle('artifacts/data/y_train.pkl')
y_test.to_pickle('artifacts/data/y_test.pkl')