<a href="https://colab.research.google.com/github/iamajeethazad10/Customer_Personality_Analysis/blob/main/customer_personality_analysis_till_model_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Basic Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go


#Preprocessing
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the raw campaign workbook into `data` and display it.
# NOTE(review): the path is an absolute Colab location (including the " (1)"
# upload suffix) and the recorded run of this cell ended in FileNotFoundError,
# so the file has to be uploaded under exactly this name before running.
data = pd.read_excel("/content/marketing_campaign1 (1).xlsx")
data

FileNotFoundError: ignored

In [None]:
#Missing Values
data.isnull().any()

In [None]:
data.isnull().sum()

In [None]:
data = data.dropna()

In [None]:
data.isnull().sum()

In [None]:
# Call .any() to get one True/False answer: is any row an exact duplicate?
# Without the parentheses the cell only displayed the bound method object.
data.duplicated().any()

In [None]:
data.duplicated().sum()

# **Feature Engineering**

In [None]:
data.info()

In [None]:
data['Dt_Customer'] = pd.to_datetime(data['Dt_Customer'])

In [None]:
data.info()

In [None]:
# Report the most recent and the earliest enrolment dates in the records.
enrolment_dates = data['Dt_Customer']
print("The newest customer enrolled date in the records:", enrolment_dates.max())
print("The oldest customer enrolled date in the records:", enrolment_dates.min())

In [None]:
data['Age'] = 2015-data['Year_Birth']
data['Age'].head()

In [None]:
#Total Spent of Money on Products
# Add up every product-category amount exactly once. The previous expression
# counted MntFishProducts twice and never included MntSweetProducts.
spend_columns = ['MntWines', 'MntFruits', 'MntMeatProducts',
                 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
data['Total_Spent'] = data[spend_columns].sum(axis=1)
data['Total_Spent'].head()

In [None]:
#Living Status
# Collapse the raw marital statuses into two groups: 'Partner' and 'Alone'.
partner_statuses = ['Married', 'Together']
alone_statuses = ['Absurd', 'Widow', 'YOLO', 'Divorced', 'Single']
living_map = {**dict.fromkeys(partner_statuses, 'Partner'),
              **dict.fromkeys(alone_statuses, 'Alone')}
data['Living'] = data['Marital_Status'].replace(living_map)
data['Living'].head()

In [None]:
#Number of Kids In home
data['kids'] = data['Kidhome'] + data['Teenhome']
data['kids'].head()

In [None]:
#Education Class
data['Education'] = data['Education'].replace({'Basic':'Undergraduate', '2n Cycle':'Undergraduate', 'Graduation':'Graduate', 'Master':'Postgraduate', 'PhD':'Postgraduate'})
data['Education'].head()

In [None]:
#Dropping Irrelevant Columns
# Columns that were replaced by engineered features above or are not used further.
data_drop = [
    'Marital_Status',
    'Dt_Customer',
    'Z_CostContact',
    'Z_Revenue',
    'Year_Birth',
    'ID',
]
data = data.drop(columns=data_drop)

In [None]:
data

In [None]:
data.shape

In [None]:
data.describe()

In [None]:
#Spending Based on Age
plt.figure(figsize=(15,8))
sns.scatterplot(x=data['Age'],y=data['Total_Spent']);

In [None]:
#Spending based on Education Class
# Histogram of Total_Spent with one colour per Education value.
plt.figure(figsize=(13,8))
sns.histplot(x=data['Total_Spent'], hue=data['Education']);

In [None]:
data['Education'].value_counts().plot.pie(explode=[0.1,0,0], autopct='%1.1f%%', shadow=True, figsize=(8,8), colors=sns.color_palette('bright'));

# **Outliers Detection**

In [None]:
# Distribution of Age as a density histogram with a KDE curve.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# supported replacement and matches the histplot usage elsewhere in this notebook.
plt.figure(figsize=(15,8))
sns.histplot(data['Age'], kde=True, stat='density', color='red');

In [None]:
# Distribution of Income as a density histogram with a KDE curve.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# supported replacement and matches the histplot usage elsewhere in this notebook.
plt.figure(figsize=(15,8))
sns.histplot(data['Income'], kde=True, stat='density', color='blue');

In [None]:
# Distribution of Total_Spent as a density histogram with a KDE curve.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# supported replacement and matches the histplot usage elsewhere in this notebook.
plt.figure(figsize=(15,8))
sns.histplot(data['Total_Spent'], kde=True, stat='density', color='#ff9966');

In [None]:
# One notched box plot per numeric variable, side by side in a single row.
fig = make_subplots(rows=1, cols=3)

# (column in `data`, trace name, marker colour, subplot column), in the order the traces are added
box_specs = [
    ('Age', 'Age', '#6699ff', 2),
    ('Income', 'Income', '#ff0066', 1),
    ('Total_Spent', 'Spent', 'lightseagreen', 3),
]
for column, label, colour, position in box_specs:
    trace = go.Box(y=data[column], notched=True, name=label, marker_color=colour,
                   boxmean=True, boxpoints='suspectedoutliers')
    fig.add_trace(trace, row=1, col=position)

fig.update_layout(title_text='Box Plots for Numerical Variables')

fig.show()

In [None]:
data.head()

In [None]:
Numerical = ['Income', 'Recency', 'Age', 'Total_Spent']

In [None]:
def detect_outliers(d, frame=None):
  """Print the IQR-rule outliers of each listed column.

  A value is reported when it lies above Q3 + 1.5*IQR or below Q1 - 1.5*IQR,
  where Q1/Q3 are the 25th/75th percentiles of that column.

  Parameters
  ----------
  d : iterable of str
      Column names to inspect.
  frame : pandas.DataFrame, optional
      Table to read the columns from. Defaults to the notebook-level `data`
      frame, so existing calls `detect_outliers(cols)` behave as before.

  Returns
  -------
  None
      The outlying values are printed, one block per column.
  """
  if frame is None:
    frame = data  # fall back to the notebook's global DataFrame

  for i in d:
    column = frame[i]
    Q1, Q3 = np.percentile(column, [25 ,75])
    IQR = Q3 - Q1

    # Tukey fences: anything outside [ll, ul] counts as an outlier
    ul = Q3+1.5*IQR
    ll = Q1-1.5*IQR

    outliers = column[(column > ul) | (column < ll)]
    print(f'*** {i} outlier points***', '\n', outliers, '\n')

In [None]:
detect_outliers(Numerical)

In [None]:
#removing Some Outliers
# Keep only rows with Age below 100 and Income below 600000.
plausible_age = data['Age'] < 100
plausible_income = data['Income'] < 600000
data = data[plausible_age & plausible_income]

In [None]:
data.shape

# **Rare Categories**

In [None]:
Categorical = [var for var in data.columns if data[var].dtype=='O']

In [None]:
# check the number of different labels
# Print the share of rows in each category of every categorical column.
# Plain true division replaces np.float, an alias removed in NumPy 1.24
# (using it raises AttributeError on current NumPy).
for var in Categorical:
    print(data[var].value_counts() / len(data))
    print()
    print()


## **Categorical Variables Encoding**

In [None]:
Categorical

In [None]:
data['Living'].unique()

In [None]:
data['Education'] = data['Education'].map({'Undergraduate':0,'Graduate':1, 'Postgraduate':2})

In [None]:
data['Living'] = data['Living'].map({'Alone':0,'Partner':1})

In [None]:
data.dtypes

In [None]:
df=data.copy()

In [None]:
corrmap = data.corr()

plt.figure(figsize=(25,20))
sns.heatmap(corrmap, annot = True, cmap = 'mako', center = 0)

In [None]:
# creating a subset of dataframe by dropping the features on deals accepted and promotions
# (drop returns a new frame, so the frame that `data` referred to before is left untouched)
cols_del = [f'AcceptedCmp{i}' for i in range(1, 6)] + ['Complain', 'Response']
data = data.copy().drop(columns=cols_del)

In [None]:
stdscaler = StandardScaler()
data = pd.DataFrame(stdscaler.fit_transform(data), columns = data.columns)

In [None]:
data.head()

## **Dimensionality Reduction**

In [None]:
from sklearn.decomposition import PCA

In [None]:
p = PCA(n_components=3)
p.fit(data)

In [None]:
W = p.components_.T
W

In [None]:
pd.DataFrame(W, index=data.columns, columns=['W1','W2','W3'])

In [None]:
p.explained_variance_

In [None]:
p.explained_variance_ratio_

In [None]:
pd.DataFrame(p.explained_variance_ratio_, index=range(1,4), columns=['Explained Variability'])

In [None]:
sns.barplot(x = list(range(1,4)), y = p.explained_variance_, palette = 'GnBu_r')
plt.xlabel('i')
plt.ylabel('Lambda i');

In [None]:
data_PCA = pd.DataFrame(p.transform(data), columns=(['col1', 'col2', 'col3']))

In [None]:
data_PCA.describe().T


In [None]:
x = data_PCA['col1']
y = data_PCA['col2']
z = data_PCA['col3']

fig = plt.figure(figsize=(13,8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x,y,z, c='darkred', marker='o')
ax.set_title('A 3D Projection of Data In the Reduced Dimension')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
scalar=StandardScaler()
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans,AgglomerativeClustering,DBSCAN,SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans,AgglomerativeClustering,DBSCAN
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import silhouette_score,calinski_harabasz_score
from sklearn import tree
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

In [None]:
#Data Preprocessing
#Standardization
standard_scaler = StandardScaler()
std = standard_scaler.fit_transform(df)
std.shape

In [None]:
#Normalization
minmax = MinMaxScaler()
norm = minmax.fit_transform(df)
norm.shape

# K-Means Clustering

In [None]:
#Elbow method & Scaler Method on Data
# Fit K-Means for k = 1..14 on the standardized data and record each inertia.
cluster_range = range(1,15)
cluster_errors = []
for num_clusters in cluster_range:
    clusters = KMeans(num_clusters,n_init=10).fit(std)
    cluster_errors.append(clusters.inertia_)

# labels / centroids of the last model fitted in the loop
labels = clusters.labels_
centroids = clusters.cluster_centers_

clusters_df = pd.DataFrame({"Num_Clusters":cluster_range,"Cluster_Errors":cluster_errors})
clusters_df

In [None]:
wcss=[]
for i in range(1,9):
    kmeans = KMeans(n_clusters=i,random_state=2)
    kmeans.fit(std)
    wcss.append(kmeans.inertia_)

# Plot K values range vs WCSS to get Elbow graph for choosing K (no. of clusters)
plt.plot(range(1,9),wcss,color = 'black')
plt.scatter(range(1,9),wcss,color='red')
plt.title('Elbow Graph for Standard Scaler')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
#Silhouette Score
n_clusters=[2,3,4,5,6,7,8,9,10]
clusters_inertia=[]
s_scores=[]

for n in n_clusters:
  KM_est=KMeans(n_clusters=n,init='k-means++').fit(std)
  clusters_inertia.append(KM_est.inertia_)  # data for the elbow method
  silhouette_avg = silhouette_score(std, KM_est.labels_)
  s_scores.append(silhouette_avg)

In [None]:
# Silhouette score for each candidate number of clusters (standardized data).
fig, ax = plt.subplots(figsize=(12,5))
# Plot every score against its cluster count. Passing s_scores on its own put
# the list index (0, 1, 2, ...) on the x-axis, so neither the axis label nor
# the dashed marker at 2 referred to an actual number of clusters.
ax = sns.lineplot(x=n_clusters, y=s_scores, marker='o', ax=ax)
ax.set_title("Silhouette score method")
ax.set_xlabel("number of clusters")
ax.set_ylabel("Silhouette score")
ax.axvline(2,ls="--", c="red")  # dashed marker at k = 2
plt.grid()
plt.show()

In [None]:
import scipy.cluster.hierarchy as sch
from yellowbrick.cluster import KElbowVisualizer

In [None]:
# Instantiate a scikit-learn K-Means model. we will check for two diff hyperparameters value effect.
model = KMeans(random_state=10, max_iter=500, init='k-means++')

# Instantiate the KElbowVisualizer with the number of clusters and the metric
visualizer = KElbowVisualizer(model, k=(2,20), metric='silhouette', timings=False)
fig, ax = plt.subplots(figsize=(12,5))
# Fit the data and visualize
print('Elbow Plot for Standard Scaler data')
visualizer.fit(std)
# show() replaces the deprecated poof() alias and matches the
# SilhouetteVisualizer.show() calls used elsewhere in this notebook.
visualizer.show()
plt.show()

In [None]:
clust_list = [2,3,4,5,6,7,8,9]

#  Silhouette score for stadardScaler applied on data.
for n_clusters in clust_list:
    clusterer1 = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels1 = clusterer1.fit_predict(std)
    sil_score1= silhouette_score(std, cluster_labels1)
    print("For n_clusters =", n_clusters,"The average silhouette_score is :", sil_score1)


Conclusion: According to the silhouette score on the standardized data, the ideal number of clusters is 5, which has a higher score (0.179) than the other options.

In [None]:
# Final K-Means fit on the standardized data; the last line shows the shape of the label array.
# NOTE(review): the markdown conclusion just above names 5 clusters as ideal,
# but this cell fits 6 - confirm which value is intended.
model_kmeans = KMeans(n_clusters=6, random_state=0, init='k-means++')
y_predict_kmeans = model_kmeans.fit_predict(std)
y_predict_kmeans.shape

In [None]:
y_predict_kmeans

In [None]:
model_kmeans.labels_

In [None]:
model_kmeans.cluster_centers_

In [None]:
model_kmeans.inertia_

In [None]:
df['Kmeans_Label'] = model_kmeans.labels_

In [None]:
df.groupby('Kmeans_Label').agg(['mean'])

In [None]:
# Two-panel figure for the K-Means model fitted on `std`:
# left = yellowbrick silhouette plot, right = scatter of two columns of `std` coloured by cluster.
from yellowbrick.cluster import SilhouetteVisualizer

fig,(ax1,ax2) = plt.subplots(1,2,sharey=False)
fig.set_size_inches(12,6)

# one colour per cluster (six colours are listed)
sil_visualizer1 = SilhouetteVisualizer(model_kmeans,ax= ax1, colors=['#922B21','#5B2C6F','#1B4F72','#32a84a','#a83232','#323aa8'])
sil_visualizer1.fit(std)

# 2nd Plot showing the actual clusters formed

import matplotlib.cm as cm
colors1 = cm.nipy_spectral(model_kmeans.labels_.astype(float) / 6) # 6 is number of clusters
# NOTE(review): the scatter uses the columns at positions 6 and 9 of `std`, while the axis
# labels below say "1st" / "2nd" feature - confirm which features are meant.
ax2.scatter(std[:, 6], std[:, 9], marker='.', s=30, lw=0, alpha=0.7, c=colors1, edgecolor='k')

# Labeling the clusters
centers1 = model_kmeans.cluster_centers_
# Draw white circles at cluster centers
ax2.scatter(centers1[:, 6], centers1[:, 9], marker='o',c="white", alpha=1, s=200, edgecolor='k')

# write each cluster's index on top of its centre
for i, c in enumerate(centers1):
    ax2.scatter(c[6], c[9], marker='$%d$' % i, alpha=1,s=50, edgecolor='k')

ax2.set_title(label ="The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")

# the cluster count in the title is hard-coded and has to be kept in step with the model
plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
              "with n_clusters = %d" % 6),fontsize=14, fontweight='bold')

sil_visualizer1.show()
plt.show()

In [None]:
#Elbow Method and Silhouette Score on MinMaxScaler Applied Data
cluster_range = range(1,15)
cluster_errors = []
for num_clusters in cluster_range:
    clusters = KMeans(num_clusters,n_init=10)
    clusters.fit(norm)
    labels = clusters.labels_
    centroids = clusters.cluster_centers_
    cluster_errors.append(clusters.inertia_)
clusters_df = pd.DataFrame({"Num_Clusters":cluster_range,"Cluster_Errors":cluster_errors})
clusters_df

In [None]:
wcss=[]
for i in range (1,9):
    kmeans=KMeans(n_clusters=i,random_state=2)
    kmeans.fit(norm)
    wcss.append(kmeans.inertia_)

# Plot K values range vs WCSS to get Elbow graph for choosing K (no. of clusters)
fig, ax = plt.subplots(figsize=(12,5))
plt.plot(range(1,9), wcss,color = 'black')
plt.scatter(range(1,9), wcss,color='red')
plt.title('Elbow Graph for MinMaxScaler')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Silhouette analysis on the MinMax-scaled data: fit K-Means for each candidate
# cluster count, collect inertia and silhouette score, then plot the scores.
n_clusters = [2,3,4,5,6,7,8,9,10] # number of clusters
clusters_inertia = [] # inertia of clusters
s_scores = [] # silhouette scores

for n in n_clusters:
    KM_est = KMeans(n_clusters=n, init='k-means++').fit(norm)
    clusters_inertia.append(KM_est.inertia_)    # data for the elbow method
    silhouette_avg = silhouette_score(norm, KM_est.labels_)
    s_scores.append(silhouette_avg) # data for the silhouette score method

fig, ax = plt.subplots(figsize=(12,5))
# Plot every score against its cluster count. Passing s_scores on its own put
# the list index (0, 1, 2, ...) on the x-axis, so neither the axis label nor
# the dashed marker at 2 referred to an actual number of clusters.
ax = sns.lineplot(x=n_clusters, y=s_scores, marker='o', ax=ax)
ax.set_title("Silhouette score method")
ax.set_xlabel("number of clusters")
ax.set_ylabel("Silhouette score")
ax.axvline(2, ls="--", c="red")  # dashed marker at k = 2
plt.grid()
plt.show()

In [None]:
# Silhouette-based elbow search for k in 2..19 on the MinMax-scaled data.
model = KMeans(random_state=10, max_iter=500, init='k-means++')
plt.subplots(figsize=(12,5))
# Instantiate the KElbowVisualizer with the number of clusters and the metric
visualizer = KElbowVisualizer(model, k=(2,20), metric='silhouette', timings=False)
# Fit the data and visualize
print('Elbow Plot for Normalization data')
visualizer.fit(norm)
# show() replaces the deprecated poof() alias and matches the
# SilhouetteVisualizer.show() calls used elsewhere in this notebook.
visualizer.show()
plt.show()

In [None]:
clust_list = [2,3,4,5,6,7,8,9]

# Silhouette score for MinMaxScaler Applied on data .
for n_clusters in clust_list:
    clusterer1 = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels1 = clusterer1.fit_predict(norm)
    sil_score1= silhouette_score(norm, cluster_labels1)
    print("For n_clusters =", n_clusters,"The average silhouette_score is :", sil_score1)

In [None]:
model_kmeans = KMeans(n_clusters=2, random_state=0, init='k-means++')
y_predict_kmeans = model_kmeans.fit_predict(norm)
y_predict_kmeans.shape

In [None]:
# these are nothing but cluster labels...
y_predict_kmeans

In [None]:
model_kmeans.labels_

In [None]:
# cluster centres associated with each lables
model_kmeans.cluster_centers_

In [None]:
model_kmeans.inertia_

In [None]:
# Two-panel figure for the K-Means model fitted on `norm`:
# left = yellowbrick silhouette plot, right = scatter of two columns of `norm` coloured by cluster.
from yellowbrick.cluster import SilhouetteVisualizer

fig,(ax1,ax2) = plt.subplots(1,2,sharey=False)
fig.set_size_inches(12,5)
sil_visualizer1 = SilhouetteVisualizer(model_kmeans,ax= ax1, colors=['#922B21','#5B2C6F','#1B4F72','#32a84a'])
sil_visualizer1.fit(norm)

# 2nd Plot showing the actual clusters formed

import matplotlib.cm as cm
# Read the cluster count from the fitted model so the colour scaling and the
# title cannot drift apart (the title used to claim 4 clusters for a 2-cluster model).
k = model_kmeans.n_clusters
colors1 = cm.nipy_spectral(model_kmeans.labels_.astype(float) / k)
# NOTE(review): the scatter uses the columns at positions 6 and 9 of `norm`, while the axis
# labels below say "1st" / "2nd" feature - confirm which features are meant.
ax2.scatter(norm[:, 6], norm[:, 9], marker='.', s=30, lw=0, alpha=0.7, c=colors1, edgecolor='k')

# Labeling the clusters
centers1 = model_kmeans.cluster_centers_
# Draw white circles at cluster centers
ax2.scatter(centers1[:, 6], centers1[:, 9], marker='o',c="white", alpha=1, s=200, edgecolor='k')

# write each cluster's index on top of its centre
for i, c in enumerate(centers1):
    ax2.scatter(c[6], c[9], marker='$%d$' % i, alpha=1,s=50, edgecolor='k')

ax2.set_title(label ="The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")

plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
              "with n_clusters = %d" % k),fontsize=14, fontweight='bold')

sil_visualizer1.show()
plt.show()

In [None]:
# Per-cluster column means.
# NOTE(review): 'Kmeans_Label' was written to df from the earlier 6-cluster model fitted on `std`;
# the 2-cluster model fitted on `norm` above never updates it, so this table repeats the earlier summary.
df.groupby('Kmeans_Label').agg(['mean'])

# Hierarchical Clustering Algorithm

In [None]:
# Draw one dendrogram of the MinMax-scaled data per linkage method.
# The title font settings are built once, under a name that does not shadow the
# built-in dict() (the loop used to rebind `dict` on every iteration).
title_font = {'fontsize':16,'fontweight' :14, 'color' : 'blue'}
for methods in ['single','complete','average','weighted','centroid','median','ward']:
    plt.figure(figsize =(14,6))
    plt.title('Visualising the data, Method- {}'.format(methods),fontdict = title_font)
    Dendrogram1 = sch.dendrogram(sch.linkage(norm, method = methods,optimal_ordering=False))

In [None]:
# Silhouette score of agglomerative clustering on the MinMax-scaled data for
# every combination of cluster count and linkage.
n_clusters = [2,3,4,5,6,7,8]  # always start number from 2.

# Use a separate loop variable: `for n_clusters in n_clusters` overwrote the
# list with its last element, leaving `n_clusters` as an int after the cell ran.
for k in n_clusters:
    for linkages in ["ward", "complete", "average", "single"]:
        hie_cluster1 = AgglomerativeClustering(n_clusters=k,linkage=linkages) # bydefault it takes linkage 'ward'
        hie_labels1 = hie_cluster1.fit_predict(norm)
        silhouette_score1 = silhouette_score(norm, hie_labels1)
        print("For n_clusters =", k,"The average silhouette_score with linkage-",linkages, ':',silhouette_score1)
    print()

Conclusion: The maximum silhouette score is obtained with 2 clusters, for both the StandardScaler and the MinMaxScaler transformations.

Hierarchical clustering creates a tree of clusters by iteratively grouping or separating data points. There are two types of hierarchical clustering: agglomerative clustering and divisive clustering. We now apply the agglomerative technique, which is a bottom-up approach: each data point is treated as a separate cluster at first, and the most similar clusters are then iteratively combined.

In [None]:
agg_clustering = AgglomerativeClustering(n_clusters=2, linkage='ward')
y_pred_hie = agg_clustering.fit_predict(norm)
print(y_pred_hie.shape)
y_pred_hie

In [None]:
agg_clustering.n_clusters_

In [None]:
(silhouette_score(norm, agg_clustering.labels_)*100).round(3)

In [None]:
df['Hierarchical_Labels'] = agg_clustering.labels_

In [None]:
df.groupby('Hierarchical_Labels').agg(['mean'])

Model Building

In [None]:
df

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df.head()

In [None]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df

In [None]:
# Features = the first 27 columns of df; target = the single column at position 27.
# NOTE(review): presumably position 27 is 'Kmeans_Label' (it is added to df before
# 'Hierarchical_Labels') - confirm against df.columns, because positional slicing
# silently shifts if a column is added or dropped upstream.
X = df.iloc[:,0:27]
Y = df.iloc[:,27]

In [None]:
x_train, x_test, y_train, y_test = train_test_split (X,Y,test_size = 0.2, random_state = 42)

In [None]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

#Model Building

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report as report
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

Gaussian Naive Bayes

In [None]:
#instantiating the object
model_GNB = GaussianNB()

#fit the model
model_GNB.fit(x_train, y_train)

In [None]:
#prediction
y_pred_GNB = model_GNB.predict(x_test)

In [None]:
#Accuracy
# accuracy_score takes the true labels first and the predicted labels second
model_GNB_score = accuracy_score(y_test, y_pred_GNB)
gnb_message = 'GaussianNB Model accuracy score: {0:0.4f}'.format(model_GNB_score)
print(gnb_message)

Bernoulli Naive Bayes

In [None]:
#Bernoulli Naive Bayes
# Build and fit the classifier in one step (fit returns the estimator itself)
model_BNB = BernoulliNB().fit(x_train, y_train)

# Predict the held-out rows
y_pred_BNB = model_BNB.predict(x_test)

#Accuracy
# accuracy_score takes the true labels first and the predicted labels second
model_BNB_score = accuracy_score(y_test, y_pred_BNB)
bnb_message = 'BernoulliNB Model accuracy score: {0:0.4f}'.format(model_BNB_score)
print(bnb_message)

Decision Tree

In [None]:
from sklearn.tree import  DecisionTreeClassifier
from sklearn import tree

In [None]:
# Decision Tree Classifier with criteria as entropy

model_DT = DecisionTreeClassifier(criterion = 'entropy',max_depth=4)
model_DT.fit(x_train,y_train)

#Predicting on test data
pred = model_DT.predict(x_test)
pred

In [None]:
pd.crosstab(y_test,pred)

In [None]:
# Accuracy
np.mean(pred==y_test)

In [None]:
Model_DT_score = accuracy_score(y_test, pred)
Model_DT_score
print(" Decision Tree Classifier with Entropy: ")
print(report(y_test, pred))

In [None]:
tree.plot_tree(model_DT)

In [None]:
# Decision Tree Classifier with criteria as Gini

# criterion='gini' so this model really differs from the entropy tree
# (this cell previously passed 'entropy' again).
model_DT2 = DecisionTreeClassifier(criterion = 'gini',max_depth=4)
model_DT2.fit(x_train,y_train)

#Predicting on test data
# Predict with model_DT2; using model_DT here only reproduced the entropy tree's predictions.
pred2 = model_DT2.predict(x_test)
pred2

In [None]:
pd.crosstab(y_test,pred2)

In [None]:
# Accuracy
np.mean(pred2==y_test)

In [None]:
# Score the Gini tree on its own predictions (pred2); the score was previously
# computed from `pred`, the entropy tree's predictions.
Model_DT2_Score = accuracy_score(y_test, pred2)
print("Decision Tree Classifier with GINI")
print(report(y_test,pred2))

In [None]:
tree.plot_tree(model_DT2)

KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model_KNN=KNeighborsClassifier(n_neighbors=10)
model_KNN.fit(x_train,y_train)
pred_knn = model_KNN.predict(x_test)

pd.crosstab(y_test,pred_knn)
Model_KNN_score= accuracy_score(y_test,pred_knn)

print(report(y_test,pred_knn))

In [None]:
Model_KNN_score

Xg Boost

In [None]:
from xgboost import XGBClassifier

In [None]:
model_xgboost = XGBClassifier(n_estimators = 100, max_depth =3)
model_xgboost.fit(x_train,y_train)

In [None]:
y_pred_xgboost = model_xgboost.predict(x_test)

In [None]:
model_xgboost_score= accuracy_score(y_test, y_pred_xgboost)
model_xgboost_score

Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model_RFC = RandomForestClassifier(n_estimators = 100, max_features=3)
model_RFC.fit(x_train,y_train)
rfc_pred = model_RFC.predict(x_test);

In [None]:
model_RFC_score = accuracy_score(y_test, rfc_pred)
model_RFC_score

In [None]:
print("Random Forest :")
print(report(y_test,rfc_pred))

ADA Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
model_adaboost = AdaBoostClassifier(n_estimators=100, random_state=25)
model_adaboost.fit(x_train, y_train)
adaboost_pred = model_adaboost.predict(x_test)

In [None]:
model_adaboost_score = accuracy_score(y_test,adaboost_pred)
model_adaboost_score

In [None]:
print(report(y_test,adaboost_pred))

Stacking (implemented below as a voting ensemble with `VotingClassifier`)

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# `svm` and `model_LR` are used in the ensemble (and `svm` again in the k-fold
# cell) but no earlier cell creates them, so this cell raised NameError.
# NOTE(review): default SVC and a LogisticRegression with a raised iteration
# cap are assumed here - replace with the intended hyper-parameters if they differ.
svm = SVC()
model_LR = LogisticRegression(max_iter=1000)

# (name, estimator) pairs handed to VotingClassifier in the next cell
estimators = [
    ('DTC', model_DT),
    ('knn', model_KNN),
    ('svm', svm),
    ('RFC', model_RFC),
    ('LR', model_LR),
]

In [None]:
#Creating ensemble model

stacking = VotingClassifier(estimators)
stacking.fit(x_train,y_train)

stack_pred = stacking.predict(x_test)
stack_score = accuracy_score(y_test,stack_pred)
stack_score

In [None]:
print(report(y_test,stack_pred))

Model Evaluation

In [None]:
# Hold-out accuracies typed in by hand, in the same order as the model names
# used as the index in the next cell.
# NOTE(review): presumably copied from an earlier run - these literals do not
# follow re-runs of the models, and no Random Forest entry is listed.
data_score={'accuracy_score' : [0.8330,0.8555,0.9029,0.7584,0.9706,0.9051,0.8487]}

In [None]:
# Accuracy table, one row per model.
# NOTE(review): this rebinds `df`, which until now held the customer feature table with the
# cluster labels; earlier cells that read `df` will not work if re-run after this one.
df = pd.DataFrame(data_score,index = ["model_GNB","model_BNB","model_DT","model_KNN","model_xgboost","model_adaboost","model_ensemble"] )

In [None]:
df.sort_values(by=['accuracy_score'])

In [None]:
# Using K-fold validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of every classifier on the full X / Y; the mean of
# the five fold scores is printed for each model.
# NOTE(review): `svm` is not created in any cell shown above - confirm where it
# comes from. The Random Forest model is not cross-validated here.
kfold_GNB = cross_val_score(model_GNB,X,Y, cv=5)
print("K-fold validation score of Gaussian Naive bayes is",kfold_GNB.mean())

kfold_BNB = cross_val_score(model_BNB,X,Y, cv=5)
print("K-fold validation score of Bernoulli Naive bayes is",kfold_BNB.mean())

kfold_KNN = cross_val_score(model_KNN,X,Y, cv=5)
print("K-fold validation score of KNN is",kfold_KNN.mean())

kfold_svm = cross_val_score(svm,X,Y, cv=5)
print("K-fold validation score of SVM is",kfold_svm.mean())

kfold_DT = cross_val_score(model_DT,X,Y, cv=5)
print("K-fold validation score of Decision tree with criteria as entropy is",kfold_DT.mean())

kfold_xgboost = cross_val_score(model_xgboost,X,Y, cv=5)
print("K-fold validation score of xgboost is",kfold_xgboost.mean())

kfold_adaboost = cross_val_score(model_adaboost,X,Y, cv=5)
print("K-fold validation score of Adaboost is",kfold_adaboost.mean())

kfold_stack = cross_val_score(stacking,X,Y, cv=5)
print("K-fold validation score of Stacking is",kfold_stack.mean())

In [None]:
# K-fold scores typed in by hand, in the same order as the model names used as
# the index in the next cell.
# NOTE(review): presumably copied from an earlier run of the cross-validation
# cell - these literals do not follow re-runs.
kfold_score = {"Kfold_score" :[0.8349,0.8250,0.7463,0.6772,0.8811,0.9647,0.7925,0.8395]}

In [None]:
df_score = pd.DataFrame(kfold_score,index = ["model_GNB","model_BNB","model_DT","model_KNN","model_svm","model_xgboost","model_adaboost","model_stacking"])

In [None]:
df_score.sort_values(by='Kfold_score')