## Linear Discriminant Analysis (LDA) - classification problem (social network ad dataset)

In [29]:
# read the dataset 
import pandas as pd
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [30]:
# convert the categorical data into numerical data 
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset.head()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,1
1,15810944,35,20000,0,1
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,0,1


In [31]:
dataset.drop(['User ID'], axis=1)

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1
...,...,...,...,...
395,46,41000,1,0
396,51,23000,1,1
397,50,20000,1,0
398,36,33000,0,1


In [32]:
dataset.columns

Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [33]:
# split input and output 
independent = dataset[[ 'Age', 'EstimatedSalary','Gender_Male' ]]
dependent = dataset[[ 'Purchased']]

In [34]:
dependent.head()

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0


In [35]:
independent.head()

Unnamed: 0,Age,EstimatedSalary,Gender_Male
0,19,19000,1
1,35,20000,1
2,26,43000,0
3,27,57000,0
4,19,76000,1


In [36]:
# split train & test 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test = train_test_split(independent, dependent, test_size=0.20, random_state=0)

In [37]:
x_train.head()

Unnamed: 0,Age,EstimatedSalary,Gender_Male
336,58,144000,1
64,59,83000,0
55,24,55000,0
106,26,35000,0
300,58,38000,0


In [38]:
y_train.head()

Unnamed: 0,Purchased
336,1
64,0
55,0
106,0
300,1


In [39]:
# Standardization 
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

print("x_train: ",x_train, "\n\n x_test: ",x_test)

x_train:  [[ 1.92295008e+00  2.14601566e+00  1.02532046e+00]
 [ 2.02016082e+00  3.78719297e-01 -9.75304830e-01]
 [-1.38221530e+00 -4.32498705e-01 -9.75304830e-01]
 [-1.18779381e+00 -1.01194013e+00 -9.75304830e-01]
 [ 1.92295008e+00 -9.25023920e-01 -9.75304830e-01]
 [ 3.67578135e-01  2.91803083e-01 -9.75304830e-01]
 [ 1.73156642e-01  1.46942725e-01 -9.75304830e-01]
 [ 2.02016082e+00  1.74040666e+00  1.02532046e+00]
 [ 7.56421121e-01 -8.38107706e-01 -9.75304830e-01]
 [ 2.70367388e-01 -2.87638347e-01 -9.75304830e-01]
 [ 3.67578135e-01 -1.71750061e-01  1.02532046e+00]
 [-1.18475597e-01  2.20395980e+00 -9.75304830e-01]
 [-1.47942605e+00 -6.35303205e-01 -9.75304830e-01]
 [-1.28500455e+00 -1.06988428e+00  1.02532046e+00]
 [-1.38221530e+00  4.07691369e-01  1.02532046e+00]
 [-1.09058306e+00  7.55356227e-01 -9.75304830e-01]
 [-1.47942605e+00 -2.00722133e-01  1.02532046e+00]
 [ 9.50842613e-01 -1.06988428e+00  1.02532046e+00]
 [ 9.50842613e-01  5.81523798e-01  1.02532046e+00]
 [ 3.67578135e-01  9.

In [40]:
# model creation 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
classifier = LinearDiscriminantAnalysis(solver='svd') # default parameters
classifier.fit(x_train,y_train)


  y = column_or_1d(y, warn=True)


In [41]:
classifier.coef_

array([[ 1.95595647,  0.98374632, -0.01070972]])

In [42]:
classifier.intercept_

array([-0.8784755])

In [43]:
classifier.classes_

array([0, 1])

In [44]:
# model prediction
y_pred = classifier.predict(x_test)
y_pred[1:10] #1st 10 rows

array([0, 0, 0, 0, 0, 0, 1, 0, 1])

In [45]:
# evaluating the model 
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test,y_pred)
matrix

array([[56,  2],
       [ 5, 17]])

In [46]:
from sklearn.metrics import classification_report
report = classification_report(y_test,y_pred)
print(report)

# accuracy =  0.91   
# overall - good but recall for class 1 is 0.77 (make decision according to the use case)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94        58
           1       0.89      0.77      0.83        22

    accuracy                           0.91        80
   macro avg       0.91      0.87      0.89        80
weighted avg       0.91      0.91      0.91        80



## check overt fit under fit  

In [47]:

train_score = classifier.score(x_train,y_train) 
test_score = classifier.score(x_test,y_test) 
print(
    "train_score: ",train_score ,"\n",
    "test_score: ",test_score ,"\n",
    "train and test difference: ", abs(train_score - test_score),"\n"
)
#  train and test difference:  0.08750000000000002   -- good fit 

train_score:  0.825 
 test_score:  0.9125 
 train and test difference:  0.08750000000000002 



In [50]:
# save the model 
import pickle
pickle.dump( classifier, open('LinearDiscriminantAnalysis_classifier.sav','wb'))