In [160]:
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

### -1. Root repo

In [2]:
import os
# get actual path
actual_path = os.path.abspath(os.getcwd())

# divide each folder in the path in a list
# delete the last element to get the root path of the local folder
list_root_path = actual_path.split('\\')[:-1]

# join the list in a string with the root
root_path = '\\'.join(list_root_path)

# pararme en el root obtenido
os.chdir(root_path)

print('before: ', actual_path)
print('after: ', root_path)

before:  D:\Github-mi-repo\Discovery-ensembles-stackings-methods\discovery
after:  D:\Github-mi-repo\Discovery-ensembles-stackings-methods


# 1. Data Example 1

#### PARAMETERS

In [3]:
# parameters my example
N_SAMPLES = 1000
N_FEATURES = 20
N_INFORMATIVE = 15
N_REDUNDANT = 5 # for classification
NOISE = 0.1 # for regression
RANDOM_STATE = 1

In [4]:
# parameters save file
path_data_folder = 'data'
path_data_example = 'example1'

#### GENERATE DATA X, y

In [5]:
def get_dataset_regression(n_samples, n_features, n_informative, noise, random_state):
    
    # generate data
    X, y = make_regression(n_samples=n_samples, 
                           n_features=n_features, 
                           n_informative=n_informative, 
                           noise=noise, 
                           random_state=random_state)
    
    # save in a dataframe X
    columns_name = ['feature_' + str(x + 1) for x in range(n_features)]
    X = pd.DataFrame(X, columns = columns_name)
    
    # save in a dataframe y
    target_name = ['target']
    y = pd.DataFrame(y, columns = target_name)
    
    return X, y

In [6]:
data_X, data_y = get_dataset_regression(n_samples = N_SAMPLES, 
                                        n_features = N_FEATURES, 
                                        n_informative = N_INFORMATIVE, 
                                        noise = NOISE, 
                                        random_state = RANDOM_STATE)

#### SPLIT TRAIN AND TEST

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    data_X, data_y, test_size = 0.2, random_state = 42, shuffle = True
)

#### SAVE DATA

In [40]:
# Path folder where to save
path_save = path_data_folder + '/' + path_data_example

# save 
X_train.to_csv(path_save + '/' + 'X_train.csv')
y_train.to_csv(path_save + '/' + 'y_train.csv')
X_test.to_csv(path_save + '/' + 'X_test.csv')
y_test.to_csv(path_save + '/' + 'y_test.csv')

# 2. Data Example 2

In [41]:
### the same of data 1 but has different parameters

# 3. Data Example 3
- Dataset diamonds (tensorflow)

- Its the same of previous data examples, there is a function data generate the data X and data y (this step is the only difference), then divide in train/test and save it

In [42]:
import tensorflow_datasets as tfds

In [72]:
def get_dataset_diamonds():
    
    # load data
    dataset_diamonds_tf = tfds.load('diamonds', split = 'train')
    dataset_diamonds_tf = tfds.as_dataframe(dataset_diamonds_tf)
    
    # define list features and target
    target = 'price'
    features = list(set(list(dataset_diamonds_tf.columns)) - set([target]))
    
    # get data X and y
    X = dataset_diamonds_tf[features]
    y = dataset_diamonds_tf[[target]]

    # print shape
    print('Shape X: ', X.shape)
    print('Shape y: ', y.shape)
    
    return X, y

### 4. Data Example 4
- This dataset is different because is downloaded from Kaggle, so in the folder of example 4 are the raw data downloaded direcly from kaggle. 
- Source: https://www.kaggle.com/datasets/dansbecker/melbourne-housing-snapshot

In [232]:
# PATH parameters save file
path_data_folder = 'data'
path_data_example = 'example4'

In [233]:
# read data
path = path_data_folder + '/' + path_data_example
path_raw_data = path + '/melb_data.csv'

data = pd.read_csv(path_raw_data)

In [235]:
# there 2 features (priori very important features) that have a lot of nulls, simple solution, I delete this columns
columns_to_delete = ['BuildingArea', 'YearBuilt']
data = data[list(set(list(data.columns)) - set(columns_to_delete))]

In [236]:
# there are some null values, so I delete the rows that have nulls
data.dropna(inplace = True)
data.reset_index(inplace = True)

In [237]:
# delete features that I consider NO important: 
# - date is no important, maybe the older sold is cheaper but it need more feature engineer, so, I delete that
# - method
# - Suburb: there are a lot, it need to be grouped. Need more feature engineer
# - Address: already exist longitud and latitud
# - SellerG: a lot of people. Need more feature engineer
# - CouncilArea: a lot. Need more feature engineer
features_no_important = ['Date', 'Method', 'Suburb', 'Address', 'SellerG', 'CouncilArea']
data = data[list(set(list(data.columns)) - set(features_no_important))]

In [238]:
# there 2 feature that are categorical -> one hot label encoded
list_categorical_features = ['Regionname', 'Type']

# separated data numeric - categorical
data_numeric = data[list(set(list(data.columns)) - set(list_categorical_features))]
data_categorical = data[list_categorical_features]

# one hot encoder categorical
enc = OneHotEncoder()
data_categorical_enc = pd.DataFrame(enc.fit_transform(data_categorical).toarray())

# merge dataframe numeric - categorical enc in unique data
data = pd.merge(data_numeric, data_categorical_enc, left_index=True, right_index=True)

In [265]:
# separate in X and y
target = 'Price'
X = data[list(set(list(data.columns)) - set([target]))]
y = data[[target]]

# print shape
print('Shape X: ', X.shape)
print('Shape y: ', y.shape)

Shape X:  (12211, 22)
Shape y:  (12211, 1)
