In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy import stats
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Raw training table as delivered.
df = pd.read_csv(filepath_or_buffer=r"../data/Train.csv")

# Feature matrix read from its own CSV file.
features = pd.read_csv(filepath_or_buffer=r"../data/features.csv")

# Target file; squeeze() collapses single-length axes of the loaded frame.
target = pd.read_csv(filepath_or_buffer=r"../data/target.csv").squeeze()

In [3]:
# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=42,
    test_size=0.2,
)

# Standardize the features: the scaler statistics are learned on the training
# rows only and then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Function to remove outliers
def remove_outliers(df, columns, z_threshold=3):
    """Return the rows of ``df`` that contain no outlier in ``columns``.

    A value is an outlier when the absolute z-score of its column (computed
    over the non-missing values of that column) is greater than or equal to
    ``z_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to filter. It is not modified.
    columns : iterable of str
        Columns to test. Names that are not present in ``df`` are ignored.
    z_threshold : float, default 3
        A row is kept only if every tested value has ``|z| < z_threshold``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of ``df`` with their original index labels.

    Notes
    -----
    A z-score is undefined (NaN) for missing values and for constant columns
    (zero standard deviation). Such values are treated as "not an outlier";
    otherwise a single NaN or a constant column would drop every row.
    """
    columns = [col for col in columns if col in df.columns]
    if not columns or df.empty:
        # Nothing to test against, so no row can be flagged.
        return df.copy()

    values = df[columns].to_numpy(dtype=float, na_value=np.nan)

    # 0/0 for constant columns and comparisons against NaN are expected here.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))
        # NaN >= threshold evaluates to False, i.e. "not an outlier".
        is_outlier = (z_scores >= z_threshold).any(axis=1)

    return df[~is_outlier]

# Define the different feature sets
def _present_in_features(candidates):
    """Keep, in order, only the candidate names that are columns of ``features``."""
    return [name for name in candidates if name in features.columns]


all_features = list(features.columns)

numeric_features = _present_in_features(['Age', 'Work_Experience', 'Family_Size'])

categorical_features = _present_in_features([
    'Gender', 'Ever_Married', 'Graduated', 'Spending_Score',
    'Profession_Artist', 'Profession_Doctor', 'Profession_Engineer',
    'Profession_Entertainment', 'Profession_Executive', 'Profession_Healthcare',
    'Profession_Homemaker', 'Profession_Lawyer', 'Profession_Marketing',
])

# Every column except the family size.
without_family_size = [name for name in all_features if name != 'Family_Size']

feature_sets = {
    'All Features': all_features,
    'Numeric Only': numeric_features,
    'Categorical Only': categorical_features,
    'No Family Size': without_family_size,
}

# Define the models; a fixed seed is passed wherever the estimator accepts one
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('KNN', KNeighborsClassifier()),
])

# Collect one record per (model, outlier handling, scaling, feature set) run
results = []

# Scaler class for each scaling scenario; None means the features are used as-is
scaler_classes = {
    'No Scaling': None,
    'Standardization': StandardScaler,
    'Normalization': MinMaxScaler,
}

# Identify the outlier-free rows ONCE, on the full feature matrix, so that every
# feature set in the 'Without Outliers' scenario is evaluated on the same rows.
# Filtering each subset separately left 'Categorical Only' unfiltered (it holds
# none of the numeric columns) and filtered the other subsets on different
# column sets, so the scenarios were not comparable.
# NOTE: the filter still runs before the train/test split, so test rows are
# filtered as well.
features_without_outliers = remove_outliers(features, numeric_features)

datasets = {
    'With Outliers': (features, target),
    'Without Outliers': (
        features_without_outliers,
        # .loc makes the label-based alignment with the surviving rows explicit
        target.loc[features_without_outliers.index],
    ),
}

# Iterate over the different scenarios
for outliers, (X_all, y_subset) in datasets.items():
    for scaling, scaler_class in scaler_classes.items():
        for feature_set_name, features_list in feature_sets.items():
            if not features_list:
                # None of the expected columns exist; there is nothing to fit on.
                continue

            # Prepare the data
            X_subset = X_all[features_list]

            # Split the data into train and test
            X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset, test_size=0.2, random_state=42)

            # Apply scaling if required (fit on the training rows only)
            if scaler_class is None:
                X_train_scaled = X_train
                X_test_scaled = X_test
            else:
                scaler = scaler_class()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)

            # Train and evaluate the models (fit() re-trains each estimator from scratch)
            for model_name, model in models.items():
                model.fit(X_train_scaled, y_train)
                y_pred = model.predict(X_test_scaled)
                accuracy = accuracy_score(y_test, y_pred)

                results.append({
                    'Model': model_name,
                    'Outliers': outliers,
                    'Scaling': scaling,
                    'Feature Set': feature_set_name,
                    'Accuracy': accuracy
                })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Print a summary
print(results_df)

# Find the best model (first row with the highest accuracy)
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
print("\nBest performing model:")
print(best_model)

# Optional: save the results to a CSV
results_df.to_csv('../data/model_comparison_results.csv', index=False)

                   Model          Outliers        Scaling     Feature Set  \
0    Logistic Regression     With Outliers     No Scaling    All Features   
1          Decision Tree     With Outliers     No Scaling    All Features   
2          Random Forest     With Outliers     No Scaling    All Features   
3                    SVM     With Outliers     No Scaling    All Features   
4                    KNN     With Outliers     No Scaling    All Features   
..                   ...               ...            ...             ...   
115  Logistic Regression  Without Outliers  Normalization  No Family Size   
116        Decision Tree  Without Outliers  Normalization  No Family Size   
117        Random Forest  Without Outliers  Normalization  No Family Size   
118                  SVM  Without Outliers  Normalization  No Family Size   
119                  KNN  Without Outliers  Normalization  No Family Size   

     Accuracy  
0    0.797619  
1    0.753720  
2    0.774554  
3    0.6808

In [5]:
# Lift the row-display limit so the complete ranking is rendered
pd.options.display.max_rows = None

# Best-scoring configurations first
results_df.sort_values("Accuracy", ascending=False)

Unnamed: 0,Model,Outliers,Scaling,Feature Set,Accuracy
83,SVM,Without Outliers,Standardization,All Features,0.815408
104,KNN,Without Outliers,Normalization,All Features,0.81312
84,KNN,Without Outliers,Standardization,All Features,0.807018
23,SVM,With Outliers,Standardization,All Features,0.806548
103,SVM,Without Outliers,Normalization,All Features,0.806255
100,Logistic Regression,Without Outliers,Normalization,All Features,0.806255
80,Logistic Regression,Without Outliers,Standardization,All Features,0.805492
60,Logistic Regression,Without Outliers,No Scaling,All Features,0.803966
40,Logistic Regression,With Outliers,Normalization,All Features,0.799851
43,SVM,With Outliers,Normalization,All Features,0.799107
