In [19]:
# Defines
#=================================================#
# When True, the target column is moved to the end of the frame before saving.
IS_TRAIN_NOT_TEST = True

# Identifiers of the environments the notebook can be run in.
ENV_LOCAL_MACHINE   = 1
ENV_GOOGLE_COLLABS  = 2
ENV_KAGGLE          = 3
ENVIRONMENT = ENV_LOCAL_MACHINE

#=================================================#
CSV_DATASET_INPUT   = "dataset_train.csv"
CSV_DATASET_OUTPUT  = "dataset_transformed.csv"

# Input/output directories depend on the environment. Environments without
# configured paths fail here with a clear message instead of raising a
# NameError later, when the paths are first used.
if ENVIRONMENT == ENV_LOCAL_MACHINE:
    PATH_DATASET_INPUT  = "./dataset_raw/"
    PATH_DATASET_OUTPUT = "./outputs/"
elif ENVIRONMENT == ENV_GOOGLE_COLLABS:
    PATH_DATASET_INPUT  = "drive/MyDrive/UTN_Finales/[F] Aprendizaje Automatico/Repositorio/dataset_raw/"
    PATH_DATASET_OUTPUT = "drive/MyDrive/UTN_Finales/[F] Aprendizaje Automatico/Repositorio/outputs/"
else:
    raise ValueError(
        f"No dataset paths are configured for ENVIRONMENT={ENVIRONMENT!r}"
    )

# Name of the label column.
FEATURE_TARGET = "is_click"

In [20]:
if ENVIRONMENT != ENV_LOCAL_MACHINE:
    !pip install category_encoders

In [21]:
# Import libraries
import pandas as pd
import numpy as np

import category_encoders as ce
from sklearn.impute import SimpleImputer

In [22]:
# Load the raw dataset (mounting Google Drive first when running on Colab)
if ENVIRONMENT == ENV_GOOGLE_COLLABS:
    from google.colab import drive
    drive.mount('/content/drive')

dataset = pd.read_csv(PATH_DATASET_INPUT + CSV_DATASET_INPUT)
dataset_size = dataset.size

# Cast every object-typed column to the pandas "string" dtype in one pass
object_columns = dataset.select_dtypes(include=['object']).columns
dataset = dataset.astype({col: "string" for col in object_columns})

# Working frame that the following cells transform step by step
dataset_t = dataset

In [23]:
# Functions
def remove_feature(ds, feature):
    """Return a new frame equal to ``ds`` without the column ``feature``.

    ``ds`` itself is left untouched. A missing column raises ``KeyError``.
    """
    return ds.drop(columns=[feature])

def fill_na(ds, feature, filler):
    """Return column ``feature`` of ``ds`` with missing values replaced by ``filler``.

    Only the filled column is returned; ``ds`` is not modified.
    """
    column = ds[feature]
    return column.fillna(filler)

def encode_frequencyEncoding(ds, feature):
    """Frequency-encode column ``feature`` of ``ds``.

    Each value is replaced by the fraction of rows of ``ds`` that carry it.
    The denominator is the row count of the frame passed in (not of a global
    frame), so the function works on any DataFrame.

    Missing values are not counted by ``value_counts`` and therefore stay
    missing in the result, while their rows still count in the denominator.

    Returns a Series aligned with ``ds``; ``ds`` is not modified.
    """
    relative_frequency = ds[feature].value_counts() / len(ds)
    return ds[feature].map(relative_frequency)


In [24]:
# Columns dropped up front:
#   product_category_2  -> too many missing values
#   session_id, user_id -> treated as random parameters
for feature in ("product_category_2", "session_id", "user_id"):
    dataset_t = remove_feature(dataset_t, feature)

In [25]:
# Minutes elapsed since midnight, derived from the DateTime column
timestamps = pd.to_datetime(dataset_t["DateTime"])
minutes_of_day = timestamps.dt.hour * 60 + timestamps.dt.minute

# Map the minute of the day onto a circle with a 24-hour period
period = 60*24
angle = 2 * np.pi * minutes_of_day / period

# Keep only the sine/cosine pair; the raw DateTime column is dropped
dataset_t = dataset_t.assign(
    minutes_sin=np.sin(angle),
    minutes_cos=np.cos(angle),
).drop(columns=["DateTime"])

In [26]:
# Missing genders are labelled "Not Specified" before building the flags
gender = fill_na(dataset_t, "gender", "Not Specified")
dataset_t["gender"] = gender

# Two indicator columns; any other value (e.g. "Not Specified") is 0 in both
dataset_t["is_male"]    = gender.eq('Male').astype("int64")
dataset_t["is_female"]  = gender.eq('Female').astype("int64")

# The original gender column is no longer needed
dataset_t = remove_feature(dataset_t, "gender")

In [27]:
# Replace webpage_id by the relative frequency of each id
webpage_id_frequency = encode_frequencyEncoding(dataset_t, "webpage_id")
dataset_t['webpage_id_encoded'] = webpage_id_frequency

# Drop the raw webpage_id column
dataset_t = remove_feature(dataset_t, "webpage_id")

In [28]:
# Replace campaign_id by the relative frequency of each id
campaign_id_frequency = encode_frequencyEncoding(dataset_t, "campaign_id")
dataset_t['campaign_id_encoded'] = campaign_id_frequency

# Drop the raw campaign_id column
dataset_t = remove_feature(dataset_t, "campaign_id")

In [29]:
# Missing user_group_id values become 0.0 before encoding
user_group_filled = fill_na(dataset_t, "user_group_id", 0.0)
dataset_t["user_group_id"] = user_group_filled

# Binary-encode user_group_id; the encoded columns are placed in front of
# the existing ones and the raw column is dropped afterwards
encoder = ce.BinaryEncoder(cols=['user_group_id'])
df_encoded = encoder.fit_transform(dataset_t['user_group_id'])
dataset_t = remove_feature(
    pd.concat([df_encoded, dataset_t], axis=1),
    "user_group_id",
)

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [30]:
# Binary-encode product; the encoded columns are placed in front of the
# existing ones and the raw column is dropped afterwards
encoder = ce.BinaryEncoder(cols=['product'])
df_encoded = encoder.fit_transform(dataset_t['product'])
dataset_t = remove_feature(
    pd.concat([df_encoded, dataset_t], axis=1),
    "product",
)

In [31]:
# Replace missing age_level entries with the column's most frequent value
mode_imputer = SimpleImputer(strategy='most_frequent')
age_level_imputed = mode_imputer.fit_transform(dataset_t[["age_level"]])
dataset_t['age_level'] = age_level_imputed

In [32]:
# Missing user_depth values are replaced with 0.0
user_depth_filled = fill_na(dataset_t, "user_depth", 0.0)
dataset_t["user_depth"] = user_depth_filled

In [33]:
# Fill with most frequent value
mode_imputer = SimpleImputer(strategy='most_frequent')
dataset_t['city_development_index'] = mode_imputer.fit_transform(dataset_t[["city_development_index"]])

In [34]:
# Check missing values: per column, the percentage of rows that are missing
# (column 0) and the absolute number of missing entries (column 1).
# The percentage is taken over the number of rows of the transformed frame;
# DataFrame.size would be rows * columns and understate every percentage.
missing_count = dataset_t.isna().sum()
missing_percent = missing_count / len(dataset_t) * 100
print(pd.concat([missing_percent, missing_count], axis=1))

                          0  1
product_0               0.0  0
product_1               0.0  0
product_2               0.0  0
product_3               0.0  0
user_group_id_0         0.0  0
user_group_id_1         0.0  0
user_group_id_2         0.0  0
user_group_id_3         0.0  0
product_category_1      0.0  0
age_level               0.0  0
user_depth              0.0  0
city_development_index  0.0  0
var_1                   0.0  0
is_click                0.0  0
minutes_sin             0.0  0
minutes_cos             0.0  0
is_male                 0.0  0
is_female               0.0  0
webpage_id_encoded      0.0  0
campaign_id_encoded     0.0  0


In [35]:
# When IS_TRAIN_NOT_TEST is set, reorder the columns so the target comes last.
if IS_TRAIN_NOT_TEST:
    # Take the target out of its current position and append it at the end
    cols = list(dataset_t.columns)
    cols.remove(FEATURE_TARGET)
    cols.append(FEATURE_TARGET)

    dataset_t = dataset_t[cols]

In [36]:
# Write the transformed dataset to the configured output CSV (no index column)
output_csv_path = PATH_DATASET_OUTPUT + CSV_DATASET_OUTPUT
dataset_t.to_csv(output_csv_path, index=False)
