In [85]:
import os
import sys

# Obtener la ruta absoluta del directorio actual
current_dir = os.getcwd()

# Obtener la ruta absoluta del directorio padre
parent_dir = os.path.dirname(current_dir)

# Agregar el directorio padre a sys.path
sys.path.append(parent_dir)

In [86]:
from preprocess.preprocess_data import DataPreprocessor
data_transformer = DataPreprocessor()



In [66]:
import argparse
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler # Data preprocessing


class DataPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
  
    def transform(self, X):
        df_transformed = self.feature_generation(X)
        #df_transformed = self.scaling_func(df_transformed)
        return df_transformed

    def preprocess_input(data):
        if isinstance(data, pd.DataFrame):
            return data
        elif isinstance(data, dict):
            df = pd.DataFrame([data])
            return df
        else:
            raise ValueError("Los datos proporcionados no son compatibles.")
    
    def feature_generation(self,data: pd.DataFrame) -> pd.DataFrame:        

        # Calculate client age
        data['Age'] = 2023 - data['Year_Birth']
        
        # Calculate number of years since customer registration
        registration_year = pd.to_datetime(data['Dt_Customer'], format='%d-%m-%Y').apply(lambda x: x.year)
        current_year = datetime.now().year
        data['Years_Since_Registration'] = current_year - registration_year 

        # Encode Education
        data["Education"] = data["Education"].replace({"Basic": 0, "Graduation": 1, "2n Cycle": 2, "Master": 2, "PhD": 3})
        
        # Calculate total amount spent on products
        mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
        data['Sum_Mnt'] = data[mnt_cols].sum(axis=1)
        
        # Calculate number of companies in which the client accepted the offer
        accepted_cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
        data['Num_Accepted_Cmp'] = data[accepted_cmp_cols].sum(axis=1)

        # Calculate total number of purchases
        total_purchases = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
        data['Num_Total_Purchases'] = data[total_purchases].sum(axis=1)

        # Drop missing values and unnecessary columns
        data.dropna(inplace=True)
        data.drop(['Year_Birth', 'Z_CostContact', 'Z_Revenue', 'Dt_Customer', 'Marital_Status'], axis=1, inplace=True)
    
        # Apply one-hot encoding for remaining categorical features
        data = pd.get_dummies(data)

        return data

    def scaling_func(self, data: pd.DataFrame) -> pd.DataFrame:
        mms = MinMaxScaler()
        return pd.DataFrame(data=mms.fit_transform(data), columns=data.columns)

    models_dir = '../models/'

    # def load_kmeans_model(self,n_clusters):
    #     return joblib.load(model_filename)

def predict_clusters(data,n_clusters):
    kmeans_model = joblib.load(f"../models/kmeans_{n_clusters}_clusters_model.pkl")
    return kmeans_model.predict(data)

In [87]:
json_data = {
    "ID": 1,
    "Education": "Graduation",
    "Year_Birth": 1957,
    "Marital_Status": "Single",
    "Income": 2103,
    "Kidhome": 0,
    "Teenhome": 0,
    "Dt_Customer": "04-09-2012",
    "Recency": 50,
    "MntWines": 22,
    "MntFruits": 88,
    "MntMeatProducts": 546,
    "MntFishProducts": 172,
    "MntSweetProducts": 88,
    "MntGoldProds": 88,
    "NumDealsPurchases": 3,
    "NumWebPurchases": 8,
    "NumCatalogPurchases": 10,
    "NumStorePurchases": 4,
    "NumWebVisitsMonth": 7,
    "AcceptedCmp3": 0,
    "AcceptedCmp4": 0,
    "AcceptedCmp5": 0,    
    "AcceptedCmp1": 0,
    "AcceptedCmp2": 0,
    "Complain": 0,
    "Z_CostContact": 3,
    "Z_Revenue": 11,
    "Response": 1
}

df = pd.DataFrame([json_data])

In [88]:
df_transform = data_transformer.feature_generation('/',df)
predict_clusters(data=df_transform,n_clusters=4)

TypeError: DataPreprocessor.feature_generation() takes 2 positional arguments but 3 were given

In [10]:
# Usage
if __name__ == "__main|a__":
    LoadAndPredict = LoadAndPredict()

      # Load new data for prediction
    df = pd.read_csv("../data/retrieved_data.csv")  # Replace with your new data file

    # Prepare data
    df = DataPreprocessor.feature_generation('/',df)
    df_transform = DataPreprocessor.scaling_func('/',df)
 
    # Predict clusters for the new data
    df['Predicted_Cluster'] = LoadAndPredict.predict_clusters(df_transform)
     
    # Save preprocessed data with predictions to a new CSV
    df.to_csv("data_with_predictions.csv", index=False)