# Use-case: A shopping mall owner has provided the dataset. Your job is to create a model that can predict whether a customer is a good customer or a bad customer based on the customer's age and the customer's estimated salary (Customer Segmentation)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the ads dataset into a DataFrame; the relative path is resolved
# against the kernel's current working directory.
datasetCsvPath = 'Social_Network_Ads.csv'
socialNetworkAdsDataset = pd.read_csv(datasetCsvPath)

In [3]:
# Print a structural summary of the frame: index range, column names,
# non-null counts per column, dtypes and memory usage.
socialNetworkAdsDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [4]:
# Class-balance check: count the rows for each value of the Purchased column.
# Whether the dataset is BALANCED or UNBALANCED decides which metric is used
# to judge the quality of the model.

socialNetworkAdsDataset['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [None]:
# Binary Classification
# Unbalanced Dataset --- F1_Score

In [5]:
# Rules for Classification as per Sklearn
# 1. Data must be complete
# 2. Data must be strictly numeric
# 3. Features must be in the form of 2d np array
# 4. Label must be in the form of 1d np array

In [6]:
# Separate data as features and label.
# Columns are selected by name instead of by position so the selection does
# not silently pick the wrong columns if the CSV column order ever changes.

# features: 2d array with one row per customer and one column per feature
features = socialNetworkAdsDataset[['Age', 'EstimatedSalary']].to_numpy()
# label: 1d array holding the Purchased value of each customer
label = socialNetworkAdsDataset['Purchased'].to_numpy()

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Minimum score a candidate split has to reach on the test portion.
CL = 0.7

# Try random_state values 1..300. For each one: make an 80/20 split, fit a
# fresh LogisticRegression and report the seed when the test accuracy is both
# above the train accuracy and at least CL.
#
# NOTE(review): the features are fed to the model unscaled — consider
# standardising them and confirm whether that changes which seeds qualify.
# NOTE(review): seeds are picked by looking at the test score, so the test
# portion is no longer an unbiased estimate — confirm this is intended.
for rs in range(1, 301):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=rs
    )

    # fit() returns the estimator itself, so it can be chained on construction
    model = LogisticRegression().fit(X_train, y_train)

    # score() on a classifier yields accuracy
    trainScore = model.score(X_train, y_train)
    testScore = model.score(X_test, y_test)

    beatsTrain = testScore > trainScore
    reachesCL = testScore >= CL
    if beatsTrain and reachesCL:
        print(f"TestScore {testScore} TrainScore {trainScore} Rs {rs}")

TestScore 0.7375 TrainScore 0.61875 Rs 4
TestScore 0.7125 TrainScore 0.625 Rs 13
TestScore 0.7 TrainScore 0.628125 Rs 17
TestScore 0.7 TrainScore 0.628125 Rs 21
TestScore 0.75 TrainScore 0.615625 Rs 26
TestScore 0.7 TrainScore 0.628125 Rs 28
TestScore 0.7 TrainScore 0.628125 Rs 39
TestScore 0.7 TrainScore 0.628125 Rs 40
TestScore 0.725 TrainScore 0.621875 Rs 46
TestScore 0.7 TrainScore 0.634375 Rs 55
TestScore 0.7 TrainScore 0.628125 Rs 60
TestScore 0.7 TrainScore 0.628125 Rs 70
TestScore 0.875 TrainScore 0.8375 Rs 82
TestScore 0.7 TrainScore 0.628125 Rs 83
TestScore 0.7 TrainScore 0.628125 Rs 93
TestScore 0.7375 TrainScore 0.61875 Rs 94
TestScore 0.7 TrainScore 0.628125 Rs 99
TestScore 0.725 TrainScore 0.621875 Rs 103
TestScore 0.75 TrainScore 0.615625 Rs 114
TestScore 0.725 TrainScore 0.621875 Rs 125
TestScore 0.7 TrainScore 0.628125 Rs 139
TestScore 0.7125 TrainScore 0.63125 Rs 141
TestScore 0.725 TrainScore 0.621875 Rs 142
TestScore 0.7125 TrainScore 0.625 Rs 150
TestScore 0.8875 T

In [10]:
# Create train test Split
# NOTE(review): random_state=158 is presumably one of the seeds reported by
# the search loop — confirm against that loop's printed output.
X_train,X_test,y_train,y_test=train_test_split(features,
                                                 label,
                                                 test_size=0.2,
                                                 random_state=158)
# Apply Algo
model = LogisticRegression()
model.fit(X_train,y_train)

# Evaluate
# The dataset is unbalanced, so F1 is the metric that gets compared with CL.
# Accuracy is computed here only for the generalization check
# (testScore > trainScore and testScore >= CL).

# model.score() in any classification algo of sklearn returns accuracy

trainScore_Accuracy = model.score(X_train,y_train)
testScore_Accuracy = model.score(X_test,y_test)

# Surface the scores and the outcome of the generalization check; without
# this the values are computed but never shown for the chosen seed.
print("TestScore {} TrainScore {}".format(testScore_Accuracy, trainScore_Accuracy))
print("Generalized {}".format(testScore_Accuracy > trainScore_Accuracy
                              and testScore_Accuracy >= CL))

In [11]:
# Classification metrics (precision / recall / F1 per class) computed over
# the ENTIRE dataset, i.e. train and test rows together.
# NOTE(review): the report includes rows the model was fitted on — confirm
# that scoring on the full dataset is the intended acceptance check.

from sklearn.metrics import classification_report

entireDatasetPredictions = model.predict(features)
print(classification_report(label, entireDatasetPredictions))

              precision    recall  f1-score   support

           0       0.84      0.93      0.89       257
           1       0.85      0.69      0.76       143

    accuracy                           0.84       400
   macro avg       0.85      0.81      0.82       400
weighted avg       0.85      0.84      0.84       400



In [12]:
# Evaluating a classification model ---- Guideline 
#
# 1. Check for Generalization (Using Accuracy irrespective of any type of dataset)
#        testScore > trainScore and testScore >= CL
# 2. Check with entire dataset the following metric
#      if balanced:
#          accuracy(entireDataset) >= CL
#      else:
#          f1Score(entireDataset) >= CL # in case of stat approach
#          prPair(entireDataset) >= CL # in case of domain based approach


# F1Score avg >= CL ----- Approve the model, else reject the model
# 0.82 > CL --- Approve the model; 0.82 is the average F1 of the two classes, (0.89 + 0.76) / 2

In [None]:
# Deploy code
# pkl model