# 0.0 Imports

In [49]:
import pandas as pd
import numpy as np
from typing import Tuple
from functools import reduce 
from sklearn import preprocessing as pp
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import silhouette_score
from google.cloud import bigquery

## 0.1 Helper Functions

In [3]:
def coluna_para_int(dataframe: pd.DataFrame, nome_coluna: str) -> bool:
    """
    Convert a dataframe column to integer dtype, in place.

    Args:
        dataframe (pd.DataFrame): Dataframe holding the column; it is modified in place.
        nome_coluna (str): Name of the column to convert.

    Returns:
        bool: True when the conversion succeeded.

    Raises:
        ValueError: If the column does not exist or its values cannot be cast to int.
    """
    if nome_coluna not in dataframe.columns:
        raise ValueError(f'Coluna {nome_coluna} nao foi encontrada no dataframe de entrada')

    try:
        dataframe[nome_coluna] = dataframe[nome_coluna].astype(int)
    except (ValueError, TypeError) as err:
        # Chain the underlying error so the offending value stays visible in the traceback.
        raise ValueError(f'Nao foi possivel converter a coluna {nome_coluna} para inteiro') from err

    return True
    
def coluna_para_date(dataframe: pd.DataFrame, nome_coluna: str, formato_data: str = None) -> bool:
    """
    Convert a dataframe column to datetime, in place.

    Args:
        dataframe (pd.DataFrame): Dataframe holding the column; it is modified in place.
        nome_coluna (str): Name of the column to convert.
        formato_data (str, optional): Format string forwarded to ``pd.to_datetime``.
            When falsy, ``pd.to_datetime`` is called without a format.

    Returns:
        bool: True when the conversion succeeded.

    Raises:
        ValueError: If the column does not exist or the values cannot be parsed.
    """
    if nome_coluna not in dataframe.columns:
        raise ValueError(f'Coluna {nome_coluna} nao foi encontrada no dataframe de entrada')

    # Only forward ``format`` when the caller actually supplied one.
    parse_options = {'format': formato_data} if formato_data else {}

    try:
        dataframe[nome_coluna] = pd.to_datetime(dataframe[nome_coluna], **parse_options)
    except (ValueError, TypeError):
        raise ValueError(f"Could not convert column '{nome_coluna}'")

    return True

def keep_features(dataframe,keep_columns):
    """
    Select a subset of columns from a dataframe.

    Args:
        dataframe: Dataframe to select from.
        keep_columns: Column selector passed to ``dataframe[...]``
            (the callers in this file pass a list of column names).

    Returns:
        The result of ``dataframe[keep_columns]``.
    """
    selected = dataframe[keep_columns]
    return selected

def filtering_features(dataframe_raw: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the raw sales data into the three frames used by the pipeline.

    Args:
        dataframe_raw: Raw sales data. Must contain 'Quantity' and the columns
            listed in ``main_columns`` below.

    Returns:
        A tuple ``(df_filtered, df_purchases, df_returns)``:
            * df_filtered: all rows, restricted to the main columns.
            * df_purchases: rows with Quantity >= 0, all columns.
            * df_returns: rows with Quantity < 0, only 'CustomerID' and 'Quantity'.
    """
    main_columns = ['InvoiceNo', 'StockCode', 'Quantity',
                    'InvoiceDate', 'UnitPrice',
                    'CustomerID', 'Country']

    # Two explicit masks: a negative quantity marks a return, anything else a purchase.
    is_return = dataframe_raw['Quantity'] < 0
    is_purchase = dataframe_raw['Quantity'] >= 0

    df_returns = dataframe_raw.loc[is_return, ['CustomerID', 'Quantity']]
    df_purchases = dataframe_raw.loc[is_purchase, :]
    df_filtered = keep_features(dataframe_raw, main_columns)

    return df_filtered, df_purchases, df_returns

def calculate_gross_revenue(dataframe_purchases: pd.DataFrame) -> pd.DataFrame:
    """
    Compute each customer's gross revenue as the sum of Quantity * UnitPrice.

    Args:
        dataframe_purchases (pd.DataFrame): Purchases with the columns
            'CustomerID', 'Quantity' and 'UnitPrice'. The input is not modified.

    Returns:
        pd.DataFrame: One row per customer with the columns 'CustomerID' and 'gross_revenue'.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    required_columns = {'CustomerID', 'Quantity', 'UnitPrice'}
    missing_columns = required_columns - set(dataframe_purchases.columns)
    if missing_columns:
        raise ValueError(f"O DataFrame de entrada está faltando as seguintes colunas: {missing_columns}")

    # Work on a copy so the caller's dataframe does not gain the helper column.
    line_items = dataframe_purchases.copy()
    line_items['gross_revenue'] = line_items['Quantity'] * line_items['UnitPrice']

    revenue_by_customer = line_items.groupby('CustomerID').agg({'gross_revenue': 'sum'})
    return revenue_by_customer.reset_index()

def create_recency(dataframe_purchases: pd.DataFrame, dataframe_filtered: pd.DataFrame) -> pd.DataFrame:
    """
    Compute, per customer, the number of days since their last purchase.

    Args:
        dataframe_purchases (pd.DataFrame): Purchases with the columns 'CustomerID' and 'InvoiceDate'.
        dataframe_filtered (pd.DataFrame): Frame whose latest 'InvoiceDate' is used as the reference date.

    Returns:
        pd.DataFrame: One row per customer with the columns 'CustomerID' and 'recency_days'.

    Raises:
        ValueError: If ``dataframe_purchases`` lacks a required column.
    """
    required_columns = {'CustomerID', 'InvoiceDate'}
    missing_columns = required_columns - set(dataframe_purchases.columns)
    if missing_columns:
        raise ValueError(f"O DataFrame de entrada está faltando as seguintes colunas: {missing_columns}")

    # Latest purchase date per customer.
    last_purchase = dataframe_purchases.groupby('CustomerID')['InvoiceDate'].max().reset_index()

    # Recency is measured against the most recent date in the filtered data.
    reference_date = dataframe_filtered['InvoiceDate'].max()
    elapsed = reference_date - last_purchase['InvoiceDate']
    last_purchase['recency_days'] = elapsed.dt.days

    return last_purchase[['CustomerID', 'recency_days']]

def create_quantity_purchased(dataframe_purchases: pd.DataFrame) -> pd.DataFrame:
    """
    Count the purchased line items (non-null StockCode entries) per customer.

    Args:
        dataframe_purchases (pd.DataFrame): Purchases with the columns 'CustomerID' and 'StockCode'.

    Returns:
        pd.DataFrame: One row per customer with the columns 'CustomerID' and 'qty_products'.

    Raises:
        ValueError: If a required column is missing.
    """
    required_columns = {'CustomerID', 'StockCode'}
    missing_columns = required_columns - set(dataframe_purchases.columns)
    if missing_columns:
        raise ValueError(f"O DataFrame de entrada está faltando as seguintes colunas: {missing_columns}")

    # count() skips null StockCode values; duplicates are counted once per row.
    items_per_customer = dataframe_purchases.groupby('CustomerID')['StockCode'].count()

    # Name the counts and turn the CustomerID index back into a column.
    return items_per_customer.rename('qty_products').reset_index()

def create_freq_purchases(dataframe_purchases: pd.DataFrame) -> pd.DataFrame:
    """
    Compute each customer's purchase frequency (invoices per day of activity).

    Parameters
    ----------
    dataframe_purchases : pd.DataFrame
        Purchase history containing the columns CustomerID, InvoiceNo and InvoiceDate.

    Returns
    -------
    pd.DataFrame
        One row per customer with the columns CustomerID, max_ (last invoice date),
        min_ (first invoice date), days_ (days between first and last invoice, plus one),
        buy_ (number of distinct CustomerID/InvoiceNo/InvoiceDate rows) and
        frequency (buy_ / days_).

    Raises
    ------
    ValueError
        If a required column is missing.
    """
    required_columns = {'CustomerID', 'InvoiceNo', 'InvoiceDate'}
    missing_columns = required_columns - set(dataframe_purchases.columns)
    if missing_columns:
        raise ValueError(f"O DataFrame de entrada está faltando as seguintes colunas: {missing_columns}")

    # One row per (customer, invoice, date): line items of the same invoice collapse.
    invoices = dataframe_purchases[['CustomerID', 'InvoiceNo', 'InvoiceDate']].drop_duplicates()

    per_customer = invoices.groupby('CustomerID').agg(
        max_=('InvoiceDate', 'max'),
        min_=('InvoiceDate', 'min'),
        # +1 so a customer who bought on a single day has a 1-day span.
        days_=('InvoiceDate', lambda dates: ((dates.max() - dates.min()).days) + 1),
        buy_=('InvoiceNo', 'count'),
    )
    df_aux = per_customer.reset_index()

    # Invoices per active day; a zero-day span maps to 0 instead of a division result.
    invoice_count = df_aux['buy_']
    active_days = df_aux['days_']
    df_aux['frequency'] = (invoice_count / active_days).where(active_days != 0, 0)

    return df_aux

def create_qty_returns(dataframe_returns: pd.DataFrame) -> pd.DataFrame:
    """
    Sum the returned quantity per customer and flip its sign.

    Args:
        dataframe_returns: Returns data with the columns 'CustomerID' and 'Quantity'.
            Within this file the quantities passed in are negative.

    Returns:
        A pandas DataFrame with the columns 'CustomerID' and 'qty_returns', where
        'qty_returns' is the per-customer Quantity sum multiplied by -1.

    Raises:
        ValueError: If 'CustomerID' or 'Quantity' is missing.
    """
    needed = ('CustomerID', 'Quantity')
    if any(col not in dataframe_returns.columns for col in needed):
        raise ValueError("Input DataFrame must contain 'CustomerID' and 'Quantity' columns")

    returned_total = dataframe_returns.groupby('CustomerID')['Quantity'].sum()

    # Multiply by -1 so negative return quantities are reported as positive counts.
    df_returns = (returned_total * -1).rename('qty_returns').reset_index()

    return df_returns

def run_feature_engineering(dataframe_filtered: pd.DataFrame, dataframe_purchases: pd.DataFrame, dataframe_returns: pd.DataFrame) -> pd.DataFrame:
    """
    Build the per-customer feature table from the filtered, purchases and returns frames.

    Args:
        dataframe_filtered: Column-filtered order data; supplies the customer list and
            the reference date for recency.
        dataframe_purchases: Purchase rows used for revenue, recency, quantity and frequency.
        dataframe_returns: Return rows with 'CustomerID' and 'Quantity'.

    Returns:
        A pandas DataFrame with the columns 'CustomerID', 'gross_revenue', 'recency_days',
        'qty_products', 'frequency' and 'qty_returns'. Customers without returns get
        qty_returns = 0; rows with any other missing feature are dropped.

    Raises:
        ValueError: If ``dataframe_filtered`` or ``dataframe_purchases`` is empty, or if a
            required column is missing from any input.
    """
    checked_frames = (
        ('dataframe_filtered', dataframe_filtered),
        ('dataframe_purchases', dataframe_purchases),
    )

    # Emptiness is checked for both frames before any column check runs.
    for name, frame in checked_frames:
        if frame.empty:
            raise ValueError(f"Input DataFrame '{name}' is empty")

    required_columns = ['CustomerID', 'InvoiceDate', 'StockCode', 'Quantity', 'UnitPrice']
    for name, frame in checked_frames:
        missing_columns = set(required_columns) - set(frame.columns)
        if missing_columns:
            raise ValueError(f"Missing columns {missing_columns} in input DataFrame '{name}'")

    for column in ('CustomerID', 'Quantity'):
        if column not in dataframe_returns.columns:
            raise ValueError(f"Column '{column}' not found in input DataFrame 'dataframe_returns'")

    # Base table: one row per distinct customer seen in the filtered data.
    customers = keep_features(dataframe_filtered, ['CustomerID']).drop_duplicates(ignore_index=True)

    feature_tables = (
        calculate_gross_revenue(dataframe_purchases),
        create_recency(dataframe_purchases, dataframe_filtered),
        create_quantity_purchased(dataframe_purchases),
        create_freq_purchases(dataframe_purchases),
        create_qty_returns(dataframe_returns),
    )

    # Left-join every feature table onto the customer list, in order.
    df_fengi = customers
    for table in feature_tables:
        df_fengi = pd.merge(df_fengi, table, on='CustomerID', how='left')

    # A customer absent from the returns table simply returned nothing.
    df_fengi['qty_returns'] = df_fengi['qty_returns'].fillna(0)

    features = ['CustomerID', 'gross_revenue', 'recency_days', 'qty_products', 'frequency', 'qty_returns']
    return keep_features(df_fengi, features).dropna()

# 1.0 Loading Data

In [4]:
%%bigquery df_raw
select * from `comunidade-ds-420801.tabela_teste.ecommerce_criado`
where InvoiceDate <= '2023-05-15'

Query is running:   0%|          |

Downloading:   0%|          |

In [5]:
df_raw.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,C539576,85123A,WHITE HANGING HEART T-LIGHT HOLDER,-24,2016-12-18,2.95,14911.0,EIRE
1,C539576,84032B,CHARLIE + LOLA RED HOT WATER BOTTLE,-24,2016-12-18,2.95,14911.0,EIRE
2,C539576,84032A,CHARLIE+LOLA PINK HOT WATER BOTTLE,-24,2016-12-18,2.95,14911.0,EIRE
3,C539576,72818,CHRISTMAS DECOUPAGE CANDLE,-36,2016-12-18,0.72,14911.0,EIRE
4,C539576,72817,SET OF 2 CHRISTMAS DECOUPAGE CANDLE,-24,2016-12-18,1.25,14911.0,EIRE


In [6]:
df_raw.shape

(404714, 8)

# 2.0 Data Preparation

In [7]:
df_raw.dtypes

InvoiceNo       object
StockCode       object
Description     object
Quantity         Int64
InvoiceDate     dbdate
UnitPrice      float64
CustomerID     float64
Country         object
dtype: object

In [8]:
df_raw.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [9]:
# Type conversions (both helpers overwrite the column on df_raw in place)
# InvoiceDate -> datetime
# NOTE(review): the format '%d-%b-%y' does not look like the ISO-style dates shown in the
# df_raw preview (e.g. 2016-12-18) — confirm the format argument is still needed/correct.
coluna_para_date(df_raw, 'InvoiceDate', '%d-%b-%y')
# CustomerID -> int
coluna_para_int(df_raw,'CustomerID')

True

In [10]:
# Build the column-filtered frame plus the purchases (Quantity >= 0) and returns (Quantity < 0) frames
df_filtered, df_purchases, df_returns = filtering_features(df_raw)

In [11]:
df_filtered.head(2)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,C539576,85123A,-24,2016-12-18,2.95,14911,EIRE
1,C539576,84032B,-24,2016-12-18,2.95,14911,EIRE


In [12]:
df_purchases.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
21,539551,85135C,RED DRAGONFLY HELICOPTER,2,2016-12-18,7.95,12721,France
22,539551,15056N,EDWARDIAN PARASOL NATURAL,3,2016-12-18,5.95,12721,France


In [13]:
df_returns.head(2)

Unnamed: 0,CustomerID,Quantity
0,14911,-24
1,14911,-24


# 3.0 Feature Engineering

In [14]:
# Build the per-customer feature table (revenue, recency, quantity, frequency, returns)
df_fengi = run_feature_engineering(df_filtered,df_purchases,df_returns)
df_fengi.head()

Unnamed: 0,CustomerID,gross_revenue,recency_days,qty_products,frequency,qty_returns
0,14911,140450.72,1.0,5673.0,0.533512,3332
1,12721,2386.13,31.0,155.0,0.018519,5
2,12681,12365.54,14.0,616.0,0.062147,15
3,12481,5020.32,22.0,209.0,0.02907,14
4,12471,17424.05,2.0,439.0,0.07438,246


In [15]:
df_fengi.columns

Index(['CustomerID', 'gross_revenue', 'recency_days', 'qty_products',
       'frequency', 'qty_returns'],
      dtype='object')

# 4.0 EDA

In [16]:
# Keep only the engineered feature columns; the CustomerID identifier is left out
df_eda = keep_features(df_fengi,['gross_revenue', 'recency_days', 'qty_products',
       'frequency', 'qty_returns'])
df_eda.head()

Unnamed: 0,gross_revenue,recency_days,qty_products,frequency,qty_returns
0,140450.72,1.0,5673.0,0.533512,3332
1,2386.13,31.0,155.0,0.018519,5
2,12365.54,14.0,616.0,0.062147,15
3,5020.32,22.0,209.0,0.02907,14
4,17424.05,2.0,439.0,0.07438,246


# 5.0 Experimentação

## 5.1 K Clusters

In [22]:
# Candidate cluster counts to evaluate: 2, 3 and 4 (the stop value 5 is exclusive)
k_clusters = np.arange(2,5,1)
k_clusters

array([2, 3, 4])

## 5.2 Scaler

In [21]:
# Single scaler instance shared by every reducer call below (each call re-fits it)
scaler = pp.MinMaxScaler()

## 5.3 Algoritmos de redução de dimensionalidade

In [26]:
def train_pca(
    scaler, dataframe: pd.DataFrame, to_scaling: bool = True, **kwargs
) -> pd.DataFrame:
    """
    Fit a PCA model on the input dataframe and return the projected data.

    Args:
        scaler: Object exposing ``fit_transform``; only used when ``to_scaling`` is True.
        dataframe (pd.DataFrame): Input data.
        to_scaling (bool): Whether to scale the data with ``scaler`` before PCA (default True).
        **kwargs: Extra keyword arguments forwarded to the ``PCA`` constructor.

    Returns:
        pd.DataFrame: The principal components, one column per component.

    Raises:
        ValueError: If the input dataframe is empty.
    """
    if dataframe.empty:
        raise ValueError("Input dataframe is empty.")

    # Either scale the data or work on an untouched copy of it.
    features = scaler.fit_transform(dataframe) if to_scaling else dataframe.copy()

    components = PCA(**kwargs).fit_transform(features)
    return pd.DataFrame(components)


def train_umap(
    scaler, dataframe: pd.DataFrame, to_scaling: bool = True, **kwargs
) -> pd.DataFrame:
    """
    Fit a UMAP reducer on the input dataframe and return the first two embedding axes.

    Args:
        scaler: Object exposing ``fit_transform``; only used when ``to_scaling`` is True.
        dataframe (pd.DataFrame): Input data.
        to_scaling (bool): Whether to scale the data with ``scaler`` before UMAP (default True).
        **kwargs: Extra keyword arguments forwarded to ``umap.UMAP``
            (``random_state`` is already fixed to 42 and must not be passed again).

    Returns:
        pd.DataFrame: Columns 'embedding_x' and 'embedding_y' taken from the first two
        columns of the embedding.

    Raises:
        ValueError: If the input dataframe is empty.
    """
    if dataframe.empty:
        raise ValueError("Input dataframe is empty.")

    # The file's import cell never brings `umap` into scope, so import it here;
    # the module is provided by the umap-learn package.
    import umap

    if to_scaling:
        X = scaler.fit_transform(dataframe)
    else:
        X = dataframe.copy()

    reducer = umap.UMAP(random_state=42, **kwargs)
    embedding = reducer.fit_transform(X)

    df_umap = pd.DataFrame()
    df_umap["embedding_x"] = embedding[:, 0]
    df_umap["embedding_y"] = embedding[:, 1]
    return df_umap


def train_tsne(
    scaler, dataframe: pd.DataFrame, to_scaling: bool = True, **kwargs
) -> pd.DataFrame:
    """
    Fit a t-SNE reducer on the input dataframe and return the first two embedding axes.

    Args:
        scaler: Object exposing ``fit_transform``; only used when ``to_scaling`` is True.
        dataframe (pd.DataFrame): Input data.
        to_scaling (bool): Whether to scale the data with ``scaler`` before t-SNE (default True).
        **kwargs: Extra keyword arguments forwarded to ``TSNE``
            (``random_state`` is already fixed to 42 and must not be passed again).

    Returns:
        pd.DataFrame: Columns 'embedding_x' and 'embedding_y' taken from the first two
        columns of the embedding.

    Raises:
        ValueError: If the input dataframe is empty.
    """
    if dataframe.empty:
        raise ValueError("Input dataframe is empty.")

    # Either scale the data or work on an untouched copy of it.
    features = scaler.fit_transform(dataframe) if to_scaling else dataframe.copy()

    projected = TSNE(random_state=42, **kwargs).fit_transform(features)

    return pd.DataFrame(
        {
            "embedding_x": projected[:, 0],
            "embedding_y": projected[:, 1],
        }
    )

def train_tree_embedding(scaler, dataframe: pd.DataFrame,
    target: str = "gross_revenue", **kwargs
) -> pd.DataFrame:
    """
    Build a tree-based embedding from the leaves of a random forest regressor.

    The forest is trained to predict ``target`` from the remaining (scaled) columns;
    each sample is then described by the leaf index it reaches in every tree.

    Args:
        scaler: Object exposing ``fit_transform``; applied to the feature columns.
        dataframe (pd.DataFrame): Input data, including the ``target`` column.
        target (str): Name of the column used as regression target (default 'gross_revenue').
        **kwargs: Extra keyword arguments forwarded to ``RandomForestRegressor``
            (``random_state`` is already fixed to 42 and must not be passed again).

    Returns:
        pd.DataFrame: One row per sample and one column per tree, holding leaf indices.

    Raises:
        ValueError: If the input dataframe is empty.
        KeyError: If ``target`` is not a column of the dataframe.
    """
    # Same guard as the other reducers in this file.
    if dataframe.empty:
        raise ValueError("Input dataframe is empty.")
    if target not in dataframe.columns:
        raise KeyError(f"Target column '{target}' not found in input dataframe")

    # Separate features and target variable
    X = dataframe.drop(columns=[target])
    X = pd.DataFrame(scaler.fit_transform(X.values),
                     columns=X.columns,
                     index=X.index)
    y = dataframe[target]

    # Define and train the model
    rf_model = RandomForestRegressor(random_state=42, **kwargs)
    rf_model.fit(X, y)

    # Compute leaf indices for each sample
    leaf_indices = rf_model.apply(X)

    # Create DataFrame with leaf indices
    df_leaf = pd.DataFrame(leaf_indices)

    return df_leaf

In [27]:
# 2-component PCA projection of the scaled features
df_pca = train_pca(scaler, df_eda, n_components = 2)

In [28]:
# UMAP embedding of the scaled features (reducer defaults, seed fixed inside train_umap)
df_umap = train_umap(scaler, df_eda)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [31]:
# t-SNE embedding of the scaled features (reducer defaults, seed fixed inside train_tsne)
df_tsne = train_tsne(scaler, df_eda)

In [32]:
# Random-forest leaf embedding, using the default target column 'gross_revenue'
df_tree = train_tree_embedding(scaler, df_eda)

In [34]:
df_tree.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,5390,5436,5370,5480,5380,5518,5376,5376,5428,5474,...,5479,5419,5429,5382,5353,5380,5401,5416,5446,5464
1,4500,4724,4360,4414,4658,4595,4203,4732,4424,4320,...,4572,4774,4696,4275,4353,4775,4935,4328,4628,4457
2,5144,5365,5239,5323,5229,5397,5213,5258,5301,5200,...,5280,5314,5340,5174,5216,5218,5244,5292,5299,5260
3,4928,4942,4965,5134,4904,4895,5034,4795,5051,5091,...,5144,4837,4847,5011,5102,5084,4963,5029,5141,5066
4,5286,5394,5281,5409,5302,5436,5304,5323,5353,5347,...,5425,5356,5374,5307,5292,5326,5331,5366,5382,5403


## 5.4 Algoritmos de clusterização

In [35]:
def train_kmeans(dataframe: pd.DataFrame, k_clusters: list, **kwargs) -> list:
    """
    Fit KMeans for each candidate number of clusters and score it with the silhouette.

    Args:
        dataframe: Training features.
        k_clusters: Iterable of integers, the cluster counts to evaluate.
        **kwargs: Extra keyword arguments forwarded to the ``KMeans`` constructor
            (``random_state`` is already fixed to 42 and must not be passed again).

    Returns:
        A list of silhouette scores (euclidean metric), one per value in ``k_clusters``
        and in the same order.
    """

    def _silhouette_for(n_clusters):
        # Train, label the training data, then score those labels.
        model = KMeans(n_clusters=n_clusters, random_state=42, **kwargs)
        model.fit(dataframe)
        assigned = model.predict(dataframe)
        return silhouette_score(dataframe, assigned, metric="euclidean")

    return [_silhouette_for(k) for k in k_clusters]

def train_gmm(dataframe: pd.DataFrame, k_clusters: list, **kwargs) -> list:
    """
    Fit a Gaussian mixture for each candidate number of components and score it with the silhouette.

    Args:
        dataframe: Training features.
        k_clusters: Iterable of integers, the component counts to evaluate.
        **kwargs: Extra keyword arguments forwarded to the ``GaussianMixture`` constructor
            (``random_state`` is already fixed to 42 and must not be passed again).

    Returns:
        A list of silhouette scores (euclidean metric), one per value in ``k_clusters``
        and in the same order.
    """

    def _silhouette_for(n_components):
        # Train, label the training data, then score those labels.
        mixture = GaussianMixture(n_components=n_components, random_state=42, **kwargs)
        mixture.fit(dataframe)
        assigned = mixture.predict(dataframe)
        return silhouette_score(dataframe, assigned, metric="euclidean")

    return [_silhouette_for(k) for k in k_clusters]

In [42]:
# Silhouette scores for KMeans and GMM on the PCA projection, one row per k in k_clusters.
# n_init = 100 is forwarded to both model constructors.
kmeans_list = train_kmeans(df_pca, k_clusters, n_init = 100)
gmm_list = train_gmm(df_pca, k_clusters, n_init = 100)
df_result_pca = pd.DataFrame({
'scaler': str(scaler),
'reducer': 'PCA',
'cluster': k_clusters,
'KMeans': kmeans_list,
'GMM': gmm_list
})
df_result_pca

Unnamed: 0,scaler,reducer,cluster,KMeans,GMM
0,MinMaxScaler(),PCA,2,0.729026,-0.116664
1,MinMaxScaler(),PCA,3,0.655202,0.486397
2,MinMaxScaler(),PCA,4,0.579898,0.352297


In [43]:
# Silhouette scores for KMeans and GMM on the UMAP embedding, one row per k in k_clusters.
# kmeans_list / gmm_list are reused (overwritten) by each experiment cell.
kmeans_list = train_kmeans(df_umap, k_clusters, n_init = 100)
gmm_list = train_gmm(df_umap, k_clusters, n_init = 100)
df_result_umap = pd.DataFrame({
'scaler': str(scaler),
'reducer': 'UMAP',
'cluster': k_clusters,
'KMeans': kmeans_list,
'GMM': gmm_list
})
df_result_umap

Unnamed: 0,scaler,reducer,cluster,KMeans,GMM
0,MinMaxScaler(),UMAP,2,0.442173,0.443959
1,MinMaxScaler(),UMAP,3,0.50893,0.507416
2,MinMaxScaler(),UMAP,4,0.49072,0.468462


In [44]:
# Silhouette scores for KMeans and GMM on the t-SNE embedding, one row per k in k_clusters.
# kmeans_list / gmm_list are reused (overwritten) by each experiment cell.
kmeans_list = train_kmeans(df_tsne, k_clusters, n_init = 100)
gmm_list = train_gmm(df_tsne, k_clusters, n_init = 100)
df_result_tsne = pd.DataFrame({
'scaler': str(scaler),
'reducer': 'TSNE',
'cluster': k_clusters,
'KMeans': kmeans_list,
'GMM': gmm_list
})
df_result_tsne

Unnamed: 0,scaler,reducer,cluster,KMeans,GMM
0,MinMaxScaler(),TSNE,2,0.366944,0.357016
1,MinMaxScaler(),TSNE,3,0.408095,0.399221
2,MinMaxScaler(),TSNE,4,0.417818,0.386115


In [46]:
# Two-stage reduction: random-forest leaf embedding (100 trees), then t-SNE on the leaf
# indices. to_scaling=False, so the scaler passed to train_tsne is not used there.
df_leaf = train_tree_embedding(scaler, df_eda,n_jobs=-1,n_estimators=100)
df_leaf_tsne = train_tsne(scaler, df_leaf,to_scaling=False)

# Silhouette scores for KMeans and GMM on that embedding, one row per k in k_clusters.
kmeans_list = train_kmeans(df_leaf_tsne, k_clusters, n_init = 100)
gmm_list = train_gmm(df_leaf_tsne, k_clusters, n_init = 100)
df_result_tree_tsne = pd.DataFrame({
'scaler': str(scaler),
'reducer': 'Tree Embeding + TSNE',
'cluster': k_clusters,
'KMeans': kmeans_list,
'GMM': gmm_list
})
df_result_tree_tsne

Unnamed: 0,scaler,reducer,cluster,KMeans,GMM
0,MinMaxScaler(),Tree Embeding + TSNE,2,0.363529,0.363367
1,MinMaxScaler(),Tree Embeding + TSNE,3,0.398919,0.371062
2,MinMaxScaler(),Tree Embeding + TSNE,4,0.424415,0.387454


In [47]:
# PCA on the leaf embedding (df_leaf comes from the tree-embedding cell above).
# No n_components is passed here, so PCA runs with its constructor defaults.
df_leaf_pca = train_pca(scaler, df_leaf,to_scaling=False)

# Silhouette scores for KMeans and GMM on that projection, one row per k in k_clusters.
kmeans_list = train_kmeans(df_leaf_pca, k_clusters, n_init = 100)
gmm_list = train_gmm(df_leaf_pca, k_clusters, n_init = 100)
df_result_tree_pca = pd.DataFrame({
'scaler': str(scaler),
'reducer': 'Tree Embeding + PCA',
'cluster': k_clusters,
'KMeans': kmeans_list,
'GMM': gmm_list
})
df_result_tree_pca

Unnamed: 0,scaler,reducer,cluster,KMeans,GMM
0,MinMaxScaler(),Tree Embeding + PCA,2,0.491556,0.450718
1,MinMaxScaler(),Tree Embeding + PCA,3,0.466663,0.447375
2,MinMaxScaler(),Tree Embeding + PCA,4,0.434922,0.424237


In [52]:
# Stack the per-reducer result tables into one frame with a fresh 0..n-1 index
df_final_results = pd.concat([df_result_tree_pca,df_result_tree_tsne,df_result_tsne,df_result_umap,df_result_pca],axis = 0).reset_index(drop=True)
df_final_results

Unnamed: 0,scaler,reducer,cluster,KMeans,GMM
0,MinMaxScaler(),Tree Embeding + PCA,2,0.491556,0.450718
1,MinMaxScaler(),Tree Embeding + PCA,3,0.466663,0.447375
2,MinMaxScaler(),Tree Embeding + PCA,4,0.434922,0.424237
3,MinMaxScaler(),Tree Embeding + TSNE,2,0.363529,0.363367
4,MinMaxScaler(),Tree Embeding + TSNE,3,0.398919,0.371062
5,MinMaxScaler(),Tree Embeding + TSNE,4,0.424415,0.387454
6,MinMaxScaler(),TSNE,2,0.366944,0.357016
7,MinMaxScaler(),TSNE,3,0.408095,0.399221
8,MinMaxScaler(),TSNE,4,0.417818,0.386115
9,MinMaxScaler(),UMAP,2,0.442173,0.443959


## 5.6 Salvando os resultados

In [57]:
def salvar_bigquery(dataframe:pd.DataFrame,
                   project_name:str,
                   dataset_table_name:str):
    """
    Load a dataframe into a BigQuery table and block until the load job finishes.

    Args:
        dataframe (pd.DataFrame): Data to upload.
        project_name (str): Project used to create the BigQuery client.
        dataset_table_name (str): Destination passed to ``load_table_from_dataframe``
            (the caller in this file uses the 'dataset.table' form).

    Returns:
        None.
    """
    bq_client = bigquery.Client(project=project_name)

    # result() waits for the job; presumably it raises if the load failed — confirm in the client docs.
    load_job = bq_client.load_table_from_dataframe(dataframe, dataset_table_name)
    load_job.result()

In [58]:
# Source table, for reference: `comunidade-ds-420801.tabela_teste.ecommerce_criado`
# Results are written to tabela_teste.resultado_clusterizacao in the same project.
salvar_bigquery(df_final_results,
                   'comunidade-ds-420801',
                   'tabela_teste.resultado_clusterizacao')