<a href="https://colab.research.google.com/github/jahnavimidde/VsemML/blob/main/MLlab3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Santander Customer Satisfaction Dataset

In [3]:
#Data Collection
import pandas as pd
df = pd.read_csv("/content/Santander Customer Satisfaction_train (1).csv")

In [4]:
y = df["TARGET"]
X = df.drop("TARGET", axis=1)

In [5]:
num_fea = X.select_dtypes(include=["number"]).columns.tolist()
cat_fea = X.select_dtypes(include=["object", "category"]).columns.tolist()

In [6]:
X[num_fea] = X[num_fea].fillna(X[num_fea].median())
X[cat_fea] = X[cat_fea].fillna("Unknown")

In [7]:
X = pd.get_dummies(X, columns=cat_fea, drop_first=True)
if "ID" in X.columns:
    X.drop(columns=["ID"], inplace=True)

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [9]:
#Basic filter methods
#1.Removing Constant features
const = []
for features in X_train:
  if(X_train[features].std()==0):
    const.append(features)
print("Number of constant features:",len(const))
X_train.drop(labels=const,axis=1,inplace=True)
X_test.drop(labels=const,axis=1,inplace=True)

#2.Removing quasi constant features
quasi_constant = []
for feature in X_train.columns:
  predominant = (X_train[feature].value_counts()/float(len(X_train))).sort_values(ascending=False).values[0]
  if(predominant>0.999):
    quasi_constant.append(feature)
print("Number of quasi constant features:",len(quasi_constant))
X_train.drop(labels=quasi_constant,axis=1,inplace=True)
X_test.drop(labels=quasi_constant,axis=1,inplace=True)#Apply same removal to X_test

#3.Duplicated features
duplicates = []
for i in range(len(X_train.columns)):
  col1 = X_train.columns[i]
  for col2 in X_train.columns[i+1:]:
    if(X_train[col1].equals(X_train[col2])): #Not ==, as it won't return a single True of False
      duplicates.append(col2)
print("Number of duplicate features:",len(duplicates))
X_train.drop(labels=duplicates,axis=1,inplace=True)
X_test.drop(labels=duplicates,axis=1,inplace=True)#Apply same removal to X_test


Number of constant features: 57
Number of quasi constant features: 89
Number of duplicate features: 10


In [10]:
#Statistical Filter Methods
num_features=[col for col in X_train.columns if not set(X_train[col].unique()).issubset({0,1})]
cat_features = [col for col in X_train.columns if set(X_train[col].unique()).issubset({0,1})]
from sklearn.feature_selection import f_classif,SelectKBest
#1. Anova
f_values,p_values = f_classif(X_train[num_features],y_train)
anova_df=pd.DataFrame({
    "Numerical Features":num_features,
    "F_values":f_values,
    "P_values":p_values
    })
anova_df.sort_values(by="P_values",inplace=True)
significant_numeric_features = anova_df[anova_df["P_values"]<0.05]["Numerical Features"].tolist()
print("Selected Numerical Features:", significant_numeric_features)
#2.chi2
from sklearn.feature_selection import chi2
chi2_values,p_values=chi2(X_train[cat_features],y_train)
chi2_df=pd.DataFrame({
    "Categorical Features":cat_features,
    "Chi2_values":chi2_values,
    "p_values":p_values
     })
chi2_df.sort_values(by="p_values",inplace=True)
significant_chi2_features=chi2_df[chi2_df["p_values"]<0.05]["Categorical Features"].tolist()
print("Selected Features from Chi2 test:", significant_chi2_features)
#3. Mutual info
from sklearn.feature_selection import mutual_info_classif,SelectKBest
selector = SelectKBest(score_func=mutual_info_classif,k=10)
selector.fit(X_train[cat_features],y_train)
significant_mi_features=X_train[cat_features].columns[selector.get_support()].tolist()
print("Selected Features from MI test:",significant_mi_features)
final_selected_features=list(set(significant_numeric_features+significant_chi2_features+significant_mi_features))
#Filter
X_train=X_train[final_selected_features]
for col in X_train.columns:
    if col not in X_test.columns:
        X_test[col] = 0
X_test = X_test[X_train.columns]

Selected Numerical Features: ['num_meses_var5_ult3', 'num_var30', 'num_var5', 'num_var42', 'var15', 'var36', 'num_var4', 'num_var35', 'num_var8_0', 'num_var13', 'num_var13_0', 'saldo_var30', 'num_meses_var13_corto_ult3', 'num_var13_corto', 'num_var13_corto_0', 'num_var12', 'imp_op_var39_efect_ult1', 'num_var5_0', 'imp_op_var41_efect_ult1', 'num_var8', 'num_var24', 'num_var30_0', 'num_meses_var12_ult3', 'num_var22_ult1', 'saldo_var13', 'num_var24_0', 'num_meses_var8_ult3', 'imp_op_var39_efect_ult3', 'num_var41_0', 'imp_op_var41_ult1', 'imp_op_var39_ult1', 'imp_op_var41_efect_ult3', 'num_aport_var13_hace3', 'num_var39_0', 'saldo_var13_corto', 'saldo_medio_var13_corto_ult1', 'num_op_var39_efect_ult1', 'saldo_medio_var13_corto_ult3', 'num_op_var41_efect_ult1', 'saldo_var42', 'num_var26_0', 'num_var25_0', 'saldo_medio_var13_corto_hace2', 'imp_aport_var13_hace3', 'num_op_var39_efect_ult3', 'saldo_var12', 'saldo_var24', 'num_var12_0', 'num_op_var41_efect_ult3', 'saldo_medio_var12_ult3', 'sald

In [11]:
#Dataset after data preprocessing and feature selection
print(X_train.columns)

Index(['num_meses_var13_corto_ult3', 'num_var1', 'ind_var8',
       'num_ent_var16_ult1', 'imp_op_var41_ult1', 'imp_op_var41_efect_ult1',
       'num_var24', 'saldo_medio_var13_corto_hace3', 'ind_var24_0', 'var15',
       ...
       'num_meses_var13_largo_ult3', 'num_var22_hace2', 'saldo_var13_largo',
       'ind_var31_0', 'imp_op_var41_efect_ult3',
       'saldo_medio_var13_corto_ult1', 'num_var30', 'saldo_medio_var5_ult1',
       'saldo_medio_var5_hace2', 'num_var12'],
      dtype='object', length=120)
