# Logistic Regression VS Decision Tree
Abstract: Predict whether income exceeds $50K/yr based on census data. Also known as "Adult" dataset.

Aim: Compare performance of Logistic regression and decision tree model on target dataset.

Dataset: Census Income Data Set - link: https://archive.ics.uci.edu/ml/datasets/Census+Income

Conducted by Liu Qihan

# Now Lets Start!✌

In [123]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

# Import dataset

In [124]:
df = pd.read_csv('https://raw.githubusercontent.com/liuqihan338811/Decision-Tree/main/adult_data.csv')

# Exploratory Data Analysis

In [125]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [126]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,32561.0,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0
fnlwgt,32561.0,189778.366512,105549.977697,12285.0,117827.0,178356.0,237051.0,1484705.0
education_num,32561.0,10.080679,2.57272,1.0,9.0,10.0,12.0,16.0
capital_gain,32561.0,1077.648844,7385.292085,0.0,0.0,0.0,0.0,99999.0
capital_loss,32561.0,87.30383,402.960219,0.0,0.0,0.0,0.0,4356.0
hours_per_week,32561.0,40.437456,12.347429,1.0,40.0,40.0,45.0,99.0


In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  Income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


# Checking the presence of Missing Values

In [128]:
#the head() shows that the dataset contains values coded as ?. So,encode ? as NaN values
df[df == '?'] = np.nan

#Display missing value
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
Income            0
dtype: int64

# Split dataset into independent (X) and dependent (Y) variable

In [129]:
X = df.drop(['Income'], axis=1)

y = df['Income']

# Split data into training and testing dataset

In [130]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Feature engineering

In [131]:
from sklearn import preprocessing

categorical = ['workclass','education', 'marital_status', 'occupation', 'relationship','race', 'sex','native_country']
for feature in categorical:
        le = preprocessing.LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        X_test[feature] = le.transform(X_test[feature])

In [132]:
#Using StandardScalar to normalise the dataset

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)

X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)

In [133]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,0.470399,0.091645,-0.174981,0.179166,-0.414963,-0.409687,-0.842289,-0.898445,0.396164,0.701715,0.826332,-0.216969,1.589744,0.290103
1,0.617346,2.157698,-0.763517,0.179166,-0.414963,-1.738219,-1.315337,1.591127,0.396164,-1.425079,0.098582,-0.216969,-0.200583,0.290103
2,0.69082,0.091645,-0.017034,1.216323,-0.025832,-0.409687,1.286425,-0.898445,0.396164,0.701715,-0.145156,-0.216969,0.775959,0.290103
3,-0.705178,0.091645,-0.415016,0.179166,-0.414963,0.918845,-0.842289,-0.276052,0.396164,0.701715,-0.145156,-0.216969,-0.037826,-3.019607
4,-1.146019,0.091645,0.130127,-1.37657,-2.36062,0.918845,-0.842289,-0.276052,0.396164,0.701715,-0.145156,-0.216969,-1.258503,-3.2742


# Applying the Logistic Regression algorithm

In [134]:
logreg = LogisticRegression()
start_time = time.time()
logreg.fit(X_train, y_train)
end_time = time.time()

Y_pred = logreg.predict(X_test)

acc_lr = accuracy_score(y_test, Y_pred)
precision_lr = precision_score(y_test, Y_pred, pos_label='>50K')
recall_lr = recall_score(y_test, Y_pred, pos_label='>50K')
elapsed_time_lr = end_time - start_time

# Applying the Decision Tree algorithm

In [135]:
decision_tree = DecisionTreeClassifier()
start_time = time.time()
decision_tree.fit(X_train, y_train)
end_time = time.time()

Y_pred = decision_tree.predict(X_test)

acc_dt = accuracy_score(y_test, Y_pred)
precision_dt = precision_score(y_test, Y_pred, pos_label='>50K')
recall_dt = recall_score(y_test, Y_pred, pos_label='>50K')
elapsed_time_dt = end_time - start_time

# Performance Comparasion Matrix

In [136]:
#Plotting the accuracy of the used algorithms to find the best fit

results = pd.DataFrame({
    'Model': ['Logistic Regression','Decision Tree'],
    'Accuracy Score': [ acc_lr,acc_dt],
    'Precision Score': [ precision_lr,precision_dt],
    'Recall Score': [ recall_lr,recall_dt],
    'Elapsed_time': [ elapsed_time_lr,elapsed_time_dt]})
result_df = results.sort_values(by='Accuracy Score', ascending=False)
result_df = result_df.set_index('Accuracy Score')
result_df.head(2)

Unnamed: 0_level_0,Model,Precision Score,Recall Score,Elapsed_time
Accuracy Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.824342,Logistic Regression,0.70974,0.462743,0.081879
0.810318,Decision Tree,0.608621,0.603726,0.201998


# Test if the performance difference (Practice)

In [139]:
from scipy.stats import ttest_rel

# Perform a two-tailed paired t-test on the accuracy scores of the two models
t_stat, p_value = ttest_rel([acc_dt], [acc_lr])

# Print the results
if p_value < 0.05:
    print("The performance difference between the two models is statistically significant.")
else:
    print("The performance difference between the two models is not statistically significant.")

The performance difference between the two models is not statistically significant.


  t_stat, p_value = ttest_rel([acc_dt], [acc_lr])
