In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/customer-behaviour/Customer_Behaviour.csv


In [2]:
# import libraries

import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

In [3]:
data = pd.read_csv('../input/customer-behaviour/Customer_Behaviour.csv')

In [4]:
data

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


# Preprocessing

In [5]:
def preprocessing(df, engineering_feature=False):
    
    df = df.copy()
    
    #Drop userID column
    df = df.drop('User ID', axis=1)
    
    #binary encoding
    df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1})
    
    #Feature engineering
    if engineering_feature==True:
        income_threshold = df['EstimatedSalary'].quantile(0.95)
        df['High_income'] = df['EstimatedSalary'].apply(lambda x: 1 if x >= income_threshold else 0)
        
        old_age_threshold = df['Age'].quantile(0.75)
        df['Old Age'] = df['Age'].apply(lambda x: 1 if x >= old_age_threshold else 0)
        young_age_threshold = df['Age'].quantile(0.25)
        df['Young Age'] = df['Age'].apply(lambda x: 1 if x >= young_age_threshold else 0)
    
    
    #split df into x and y
    y = df['Purchased']
    X = df.drop('Purchased', axis=1)
    
    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    #scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

# Training without feature engineering

In [6]:
X_train, X_test, y_train, y_test = preprocessing(data, engineering_feature=False)

In [7]:
X_train

Unnamed: 0,Gender,Age,EstimatedSalary
39,-0.992882,-0.979100,-1.110590
167,-0.992882,-0.203575,0.056697
383,1.007169,1.153594,-1.198137
221,1.007169,-0.203575,0.640340
351,1.007169,-0.009694,0.173426
...,...,...,...
255,-0.992882,1.444415,0.611158
72,-0.992882,-1.657685,-1.344048
396,1.007169,1.347475,-1.344048
235,1.007169,0.862772,0.290154


In [8]:
model = LogisticRegression()
model.fit(X_train, y_train)

acc = model.score(X_test, y_test)

print(f'Test accuracy: {acc * 100}')


Test accuracy: 80.83333333333333


# Training with Feature engineering

In [9]:
X_train, X_test, y_train, y_test = preprocessing(data, engineering_feature=True)
X_train

Unnamed: 0,Gender,Age,EstimatedSalary,High_income,Old Age,Young Age
39,-0.992882,-0.979100,-1.110590,-0.246183,-0.555348,-1.653280
167,-0.992882,-0.203575,0.056697,-0.246183,-0.555348,0.604858
383,1.007169,1.153594,-1.198137,-0.246183,1.800673,0.604858
221,1.007169,-0.203575,0.640340,-0.246183,-0.555348,0.604858
351,1.007169,-0.009694,0.173426,-0.246183,-0.555348,0.604858
...,...,...,...,...,...,...
255,-0.992882,1.444415,0.611158,-0.246183,1.800673,0.604858
72,-0.992882,-1.657685,-1.344048,-0.246183,-0.555348,-1.653280
396,1.007169,1.347475,-1.344048,-0.246183,1.800673,0.604858
235,1.007169,0.862772,0.290154,-0.246183,1.800673,0.604858


In [10]:
model = LogisticRegression()
model.fit(X_train, y_train)

acc = model.score(X_test, y_test)

print(f'Test accuracy: {acc * 100}')

Test accuracy: 85.0
