In [12]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.impute import SimpleImputer


In [13]:
df=pd.read_csv('cleaned_data.csv')

In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,0,1957,Graduation,Single,58138.0,0,0,58,635,88,...,7,0,0,0,0,0,0,3,11,1
1,1,1954,Graduation,Single,46344.0,1,1,38,11,1,...,5,0,0,0,0,0,0,3,11,0
2,2,1965,Graduation,Together,71613.0,0,0,26,426,49,...,4,0,0,0,0,0,0,3,11,0
3,3,1984,Graduation,Together,26646.0,1,0,26,11,4,...,6,0,0,0,0,0,0,3,11,0
4,4,1981,PhD,Married,58293.0,1,0,94,173,43,...,5,0,0,0,0,0,0,3,11,0


In [15]:
df.drop(columns=['Unnamed: 0'],inplace=True,axis=1)

In [16]:
df

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,1957,Graduation,Single,58138.0,0,0,58,635,88,546,...,7,0,0,0,0,0,0,3,11,1
1,1954,Graduation,Single,46344.0,1,1,38,11,1,6,...,5,0,0,0,0,0,0,3,11,0
2,1965,Graduation,Together,71613.0,0,0,26,426,49,127,...,4,0,0,0,0,0,0,3,11,0
3,1984,Graduation,Together,26646.0,1,0,26,11,4,20,...,6,0,0,0,0,0,0,3,11,0
4,1981,PhD,Married,58293.0,1,0,94,173,43,118,...,5,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2053,1977,Graduation,Together,666666.0,1,0,23,9,14,18,...,6,0,0,0,0,0,0,3,11,0
2054,1967,Graduation,Married,61223.0,0,1,46,709,43,182,...,5,0,0,0,0,0,0,3,11,0
2055,1981,Graduation,Divorced,56981.0,0,0,91,908,48,217,...,6,0,1,0,0,0,0,3,11,0
2056,1956,Master,Together,69245.0,0,1,8,428,30,214,...,3,0,0,0,0,0,0,3,11,0


In [17]:
#deviding the data into dependent and independent variable
X=df.drop(labels=['Response'],axis=1)
y=df[['Response']]

In [18]:
Education_categories=['Basic','2n Cycle','Graduation','Master','PhD']
Marital_Status_categories=['Single','Together','Married','Divorced','Widow','Alone','Absurd','YOLO']

In [19]:
numerical_cols=X.select_dtypes(exclude='object').columns
categorical_cols=X.select_dtypes(include='object').columns

In [20]:
# Define pipelines for numerical and categorical data
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('ordinalencoder',OrdinalEncoder(categories=[Education_categories,Marital_Status_categories]))
            ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols)
    ]
)

In [21]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30)

In [22]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [23]:
X_train.head()

Unnamed: 0,num_pipeline__Year_Birth,num_pipeline__Income,num_pipeline__Kidhome,num_pipeline__Teenhome,num_pipeline__Recency,num_pipeline__MntWines,num_pipeline__MntFruits,num_pipeline__MntMeatProducts,num_pipeline__MntFishProducts,num_pipeline__MntSweetProducts,...,num_pipeline__AcceptedCmp3,num_pipeline__AcceptedCmp4,num_pipeline__AcceptedCmp5,num_pipeline__AcceptedCmp1,num_pipeline__AcceptedCmp2,num_pipeline__Complain,num_pipeline__Z_CostContact,num_pipeline__Z_Revenue,cat_pipeline__Education,cat_pipeline__Marital_Status
0,0.535295,1.832351,-0.823785,-0.92862,-0.840682,1.484394,3.924279,2.720925,1.256185,3.267951,...,-0.274644,-0.293211,3.547521,3.660601,-0.095446,-0.102598,0.0,0.0,3.0,2.0
1,-1.473805,-1.375255,1.074216,0.893196,-0.631889,-0.857139,-0.658232,-0.728364,-0.690633,-0.654372,...,-0.274644,-0.293211,-0.281887,-0.273179,-0.095446,9.746794,0.0,0.0,2.0,3.0
2,0.200445,0.645675,-0.823785,0.893196,0.238082,2.283442,-0.357739,-0.320524,-0.396774,-0.365612,...,-0.274644,3.41051,-0.281887,-0.273179,-0.095446,-0.102598,0.0,0.0,2.0,2.0
3,-1.222667,0.234055,-0.823785,0.893196,0.203283,1.847332,-0.307657,0.985434,0.705199,0.741301,...,-0.274644,-0.293211,-0.281887,-0.273179,-0.095446,-0.102598,0.0,0.0,4.0,3.0
4,0.200445,0.280723,-0.823785,0.893196,-1.536659,0.536073,-0.007165,0.156737,1.439847,1.270695,...,-0.274644,-0.293211,-0.281887,-0.273179,-0.095446,-0.102598,0.0,0.0,2.0,0.0


In [24]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [25]:
def evaluate_models(true,predicted):
    acc=accuracy_score(true,predicted)
    precision=precision_score(true,predicted)
    f1=f1_score(true,predicted)
    recall=recall_score(true,predicted)
    roc_auc=roc_auc_score(true,predicted)
    return acc,precision,f1,recall,roc_auc

In [26]:
classification_models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
     "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(), 
     "CatBoosting Classifier": CatBoostClassifier(verbose=False),
     "Support Vector Classifier": SVC(),
    "AdaBoost Classifier": AdaBoostClassifier()


}

model_list = []
accuracy_list = []

for model_name, model in classification_models.items():
    model.fit(X_train,y_train)
    train_accuracy=model.score(X_train,y_train)
    test_accuracy=model.score(X_test,y_test)
    print(f"model:{model_name}")
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Testing Accuracy: {test_accuracy}")
    accuracy_list.append(test_accuracy)
    model_list.append(model_name)
    print('=='*50)


  return fit_method(estimator, *args, **kwargs)


model:Random Forest
Training Accuracy: 0.9958333333333333
Testing Accuracy: 0.8867313915857605
model:Decision Tree
Training Accuracy: 0.9958333333333333
Testing Accuracy: 0.8074433656957929


  y = column_or_1d(y, warn=True)


model:Gradient Boosting
Training Accuracy: 0.9520833333333333
Testing Accuracy: 0.889967637540453


  y = column_or_1d(y, warn=True)


model:Logistic Regression
Training Accuracy: 0.875
Testing Accuracy: 0.8964401294498382


  return self._fit(X, y)


model:K-Neighbors Classifier
Training Accuracy: 0.8833333333333333
Testing Accuracy: 0.8818770226537217
model:XGBClassifier
Training Accuracy: 0.9958333333333333
Testing Accuracy: 0.883495145631068
model:CatBoosting Classifier
Training Accuracy: 0.9826388888888888
Testing Accuracy: 0.8915857605177994
model:Support Vector Classifier
Training Accuracy: 0.9013888888888889
Testing Accuracy: 0.8883495145631068


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


model:AdaBoost Classifier
Training Accuracy: 0.9
Testing Accuracy: 0.8786407766990292


In [27]:
# Results
pd.DataFrame(list(zip(model_list, accuracy_list)),columns=['Model Name','Accuracy Score']).sort_values(
    by=["Accuracy Score"],ascending=False)

Unnamed: 0,Model Name,Accuracy Score
3,Logistic Regression,0.89644
6,CatBoosting Classifier,0.891586
2,Gradient Boosting,0.889968
7,Support Vector Classifier,0.88835
0,Random Forest,0.886731
5,XGBClassifier,0.883495
4,K-Neighbors Classifier,0.881877
8,AdaBoost Classifier,0.878641
1,Decision Tree,0.807443


In [29]:
Logistic_Regression=LogisticRegression()
Logistic_Regression.fit(X_train,y_train)
y_pred=Logistic_Regression.predict(X_test)
accuracy= accuracy_score(y_test,y_pred)

print("Accuracy of the Logistic_Regression: {:.2f}%".format(accuracy * 100))


Accuracy of the Logistic_Regression: 89.64%


  y = column_or_1d(y, warn=True)
