In [1]:
# Social Network Advertisements Logistic Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Logistic regression gives you a discrete (categorical) outcome.

Linear regression gives you a continuous outcome.

### Advantages of Logistic Regression

- independent variables don't have to be normally distributed
- can handle non-linear effects
- efficient
- highly interpretable
- Can be used as a baseline against more complex algorithms

### Disadvantages of Logistic Regression

- can't solve non-linear problems on its own, because its decision boundary is linear (a random forest can solve these easily)
- must identify the important independent variables
- known to overfit

In [2]:
# Importing the dataset (the CSV path is relative to the notebook's working directory)
dataset = pd.read_csv('Social_Network_Ads.csv')

In [5]:
# Preview the first rows of the raw data
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [15]:
# See what the data looks like: for every column, print its name
# followed by the first five of its sorted distinct values.
columns = dataset.columns
for col in columns:
    sorted_unique = np.unique(dataset[col])
    print('{}   {}'.format(col, sorted_unique[:5]))

User ID   [15566689 15569641 15570769 15570932 15571059]
Gender   ['Female' 'Male']
Age   [18 19 20 21 22]
EstimatedSalary   [15000 16000 17000 18000 19000]
Purchased   [0 1]


In [21]:
# Show every row that is an exact copy of an earlier row
# (an empty frame means there are no duplicated rows).
dataset.loc[dataset.duplicated()]

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased


In [12]:
# Check for nulls: info() lists the non-null count and dtype of every column
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.7+ KB


In [34]:
# Convert the Gender column to integer labels.
# Gender has only two unique values, so label encoding them to 0/1 is enough.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(dataset['Gender'])

# Key for reading the encoded values back: original category -> integer code
gender_key = {category: code for code, category in enumerate(le.classes_)}

# NOTE: this overwrites Gender in place. Re-running the cell on the already
# encoded column rebuilds the key from the codes instead of the original labels.
dataset['Gender'] = le.transform(dataset['Gender'])

# A bare expression is only displayed when it is the last line of the cell,
# so the key is shown here rather than in the middle of the cell.
gender_key

In [35]:
# Spot-check the encoded Gender column on a small sample
# instead of rendering the whole frame.
dataset.head(10)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0
5,15728773,1,27,58000,0
6,15598044,0,27,84000,0
7,15694829,0,32,150000,1
8,15600575,1,25,33000,0
9,15727311,0,35,65000,0


In [37]:
# Build the feature matrix X and the target vector y.
# Columns are selected by name so the choice does not depend on column order;
# User ID is left out because it is only an identifier.
X = dataset[['Gender', 'Age', 'EstimatedSalary']].values
y = dataset['Purchased'].values

In [38]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [40]:
# StandardScaler standardizes each INDEPENDENT feature to a mean of 0 and a std of 1,
# so that features with large values don't overwhelm the small ones.
# The scaler is fitted on the training set only and then reused for the test set.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [42]:
# Train a logistic regression classifier on the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier  # last expression, so the fitted estimator is displayed



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
# Predicting the Test set results: one predicted class label per row of X_test
y_pred = classifier.predict(X_test)

In [44]:
# Confusion matrix: rows are the actual class, columns are the predicted class
# [[TN (actual False, pred False), FP (actual False, pred True)],
#  [FN (actual True,  pred False), TP (actual True,  pred True)]]
from sklearn.metrics import confusion_matrix  # functions are lowercase, classes are uppercase
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[56  2]
 [ 4 18]]


In [46]:
# Classification Report
# Precision: ratio tp / (tp + fp). Ability of the classifier not to label as positive a sample that is negative.
# Recall: ratio tp / (tp + fn). Ability of the classifier to find all the positive samples.
# F1-score: harmonic mean of precision and recall. Closer to 1 is better.

from sklearn.metrics import classification_report
cr = classification_report(y_test,y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.97      0.95        58
           1       0.90      0.82      0.86        22

    accuracy                           0.93        80
   macro avg       0.92      0.89      0.90        80
weighted avg       0.92      0.93      0.92        80

