# ISYS2407 Information Systems Solutions & Design

# Feature Engineering

#### Student name: Thao Vy LE

#### Student number: S3970577

## 1 Import Libraries

In [18]:
# Library for pickling
import joblib

# Also need pandas and numpy for some tasks
import pandas as pd
import numpy as np

# Library for splitting the data into train and test sets
from sklearn.model_selection import train_test_split 

# Libraries to select k best features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

## 2 Load the cleaned data

In [19]:
# Load the pickled file 'cleaned_personal_loan.pkl'
# (relative path: resolved against the notebook's current working directory)
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load pickle files that come from a trusted source
personal_loan_df = joblib.load('cleaned_personal_loan.pkl')  
# Check: preview the first rows to confirm the data loaded as expected
personal_loan_df.head()

Unnamed: 0,customer_id,age,yrs_experience,family_size,education_level,income,mortgage_amt,credit_card_acct,credit_card_spend,share_trading_acct,fixed_deposit_acct,online_acct,personal_loan
0,3815,40.0,9.0,3.0,1,60.0,0.0,1,0.0,0,0,1,0
1,5737,65.0,27.0,4.0,2,36.0,90.0,0,0.0,0,0,1,0
2,4209,61.0,3.0,2.0,0,145.0,0.0,0,0.0,0,1,1,1
3,4734,29.0,14.0,1.0,0,52.0,0.0,0,0.0,1,0,1,0
4,2551,43.0,23.0,4.0,1,33.0,0.0,0,0.0,0,0,1,0


## 3 Splitting data for testing and training

In [27]:
# Predictor columns: every column of personal_loan_df except the target
# ('personal_loan') and 'customer_id'.
# A smaller subset of previously identified "best" features can be swapped in
# here -- it is worth experimenting with both the full set and the subset.
feature_cols = [
    'age', 'yrs_experience', 'family_size', 'education_level',
    'income', 'mortgage_amt',
    'credit_card_acct', 'credit_card_spend',
    'share_trading_acct', 'fixed_deposit_acct', 'online_acct',
]

# Feature matrix (upper-case X: many columns) and
# target vector (lower-case y: a single column)
X = personal_loan_df[feature_cols]
y = personal_loan_df['personal_loan']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed integer random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [28]:
# Check the split training/testing sets: each line prints the shape of the
# feature matrix followed by the shape of the matching label vector
# (the row counts within a line should agree)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4800, 11) (4800,)
(1200, 11) (1200,)


In [29]:
# Confirm the size of the full data set (rows, columns) and list its column
# names, one result per output line
print(personal_loan_df.shape, personal_loan_df.columns, sep='\n')

(6000, 13)
Index(['customer_id', 'age', 'yrs_experience', 'family_size',
       'education_level', 'income', 'mortgage_amt', 'credit_card_acct',
       'credit_card_spend', 'share_trading_acct', 'fixed_deposit_acct',
       'online_acct', 'personal_loan'],
      dtype='object')


## 4 Experiment with feature selection

In [30]:
# This is a trial and error process
# There are 11 features in this data set
# First let's try k=10, i.e. keep the 10 features with the highest chi-squared
# scores against the target
# Keep the fitted selector (rather than chaining fit_transform on a throwaway
# object) so it can report which columns it kept: the transformed result is a
# bare NumPy array with no column names
selector = SelectKBest(chi2, k=10)
X_selected = selector.fit_transform(X_train, y_train)

# Print the type and shape of the selected features
print(type(X_selected))
print(X_selected.shape)

# Print the names of the selected features
# get_support() returns a boolean mask aligned with the columns of X_train
print(list(X_train.columns[selector.get_support()]))

# Print the first 7 rows of the selected features
np.set_printoptions(suppress=True) # Suppress scientific notation when printing
print(X_selected[:7])

# Print the first 7 rows of all the features in the dataset
X_train.head(7)

<class 'numpy.ndarray'>
(4800, 10)
[[ 63.  11.   1.   0.  19.   0.   0.   0.   0.   0.]
 [ 35.  39.   4.   0. 144.   0.   0.   0.   0.   1.]
 [ 34.  38.   2.   2.  55.   0.   1.   0.   0.   1.]
 [ 62.  12.   1.   0.  94.   0.   0.   0.   0.   1.]
 [ 57.  32.   4.   1.  70.   0.   0.   0.   0.   1.]
 [ 27.  31.   1.   0.  85.  95.   0.   0.   0.   0.]
 [ 44.   3.   4.   2.  88. 237.   0.   0.   1.   0.]]


Unnamed: 0,age,yrs_experience,family_size,education_level,income,mortgage_amt,credit_card_acct,credit_card_spend,share_trading_acct,fixed_deposit_acct,online_acct
2973,63.0,11.0,1.0,0,19.0,0.0,0,0.0,0,0,0
2774,35.0,39.0,4.0,0,144.0,0.0,0,0.0,0,0,1
3281,34.0,38.0,2.0,2,55.0,0.0,1,0.0,0,0,1
2512,62.0,12.0,1.0,0,94.0,0.0,0,0.0,0,0,1
457,57.0,32.0,4.0,1,70.0,0.0,0,0.0,0,0,1
5505,27.0,31.0,1.0,0,85.0,95.0,0,0.0,0,0,0
2002,44.0,3.0,4.0,2,88.0,237.0,0,0.0,0,1,0


In [40]:
# Try k=9, i.e. keep the 9 features with the highest chi-squared scores
# against the target
# Keep the fitted selector (rather than chaining fit_transform on a throwaway
# object) so it can report which columns it kept: the transformed result is a
# bare NumPy array with no column names
selector = SelectKBest(chi2, k=9)
X_selected = selector.fit_transform(X_train, y_train)

# Print the type and shape of the selected features
print(type(X_selected))
print(X_selected.shape)

# Print the names of the selected features
# get_support() returns a boolean mask aligned with the columns of X_train
print(list(X_train.columns[selector.get_support()]))

# Print the first 7 rows of the selected features
np.set_printoptions(suppress=True) # Suppress scientific notation when printing
print(X_selected[:7])

# Print the first 7 rows of all the features in the dataset
X_train.head(7)

<class 'numpy.ndarray'>
(4800, 9)
[[ 63.  11.   1.   0.  19.   0.   0.   0.   0.]
 [ 35.  39.   4.   0. 144.   0.   0.   0.   0.]
 [ 34.  38.   2.   2.  55.   0.   1.   0.   0.]
 [ 62.  12.   1.   0.  94.   0.   0.   0.   0.]
 [ 57.  32.   4.   1.  70.   0.   0.   0.   0.]
 [ 27.  31.   1.   0.  85.  95.   0.   0.   0.]
 [ 44.   3.   4.   2.  88. 237.   0.   0.   1.]]


Unnamed: 0,age,yrs_experience,family_size,education_level,income,mortgage_amt,credit_card_acct,credit_card_spend,share_trading_acct,fixed_deposit_acct,online_acct
2973,63.0,11.0,1.0,0,19.0,0.0,0,0.0,0,0,0
2774,35.0,39.0,4.0,0,144.0,0.0,0,0.0,0,0,1
3281,34.0,38.0,2.0,2,55.0,0.0,1,0.0,0,0,1
2512,62.0,12.0,1.0,0,94.0,0.0,0,0.0,0,0,1
457,57.0,32.0,4.0,1,70.0,0.0,0,0.0,0,0,1
5505,27.0,31.0,1.0,0,85.0,95.0,0,0.0,0,0,0
2002,44.0,3.0,4.0,2,88.0,237.0,0,0.0,0,1,0


In [38]:
# Try k=8, i.e. keep the 8 features with the highest chi-squared scores
# against the target
# Keep the fitted selector (rather than chaining fit_transform on a throwaway
# object) so it can report which columns it kept: the transformed result is a
# bare NumPy array with no column names
selector = SelectKBest(chi2, k=8)
X_selected = selector.fit_transform(X_train, y_train)

# Print the type and shape of the selected features
print(type(X_selected))
print(X_selected.shape)

# Print the names of the selected features
# get_support() returns a boolean mask aligned with the columns of X_train
print(list(X_train.columns[selector.get_support()]))

# Print the first 9 rows of the selected features
np.set_printoptions(suppress=True) # Suppress scientific notation when printing
print(X_selected[:9])

# Print the first 9 rows of all the features in the dataset
X_train.head(9)

<class 'numpy.ndarray'>
(4800, 8)
[[ 63.  11.   1.   0.  19.   0.   0.   0.]
 [ 35.  39.   4.   0. 144.   0.   0.   0.]
 [ 34.  38.   2.   2.  55.   0.   0.   0.]
 [ 62.  12.   1.   0.  94.   0.   0.   0.]
 [ 57.  32.   4.   1.  70.   0.   0.   0.]
 [ 27.  31.   1.   0.  85.  95.   0.   0.]
 [ 44.   3.   4.   2.  88. 237.   0.   1.]
 [ 26.   7.   4.   2.  14.  98.   0.   0.]
 [ 34.   7.   1.   0. 195.   0.   0.   0.]]


Unnamed: 0,age,yrs_experience,family_size,education_level,income,mortgage_amt,credit_card_acct,credit_card_spend,share_trading_acct,fixed_deposit_acct,online_acct
2973,63.0,11.0,1.0,0,19.0,0.0,0,0.0,0,0,0
2774,35.0,39.0,4.0,0,144.0,0.0,0,0.0,0,0,1
3281,34.0,38.0,2.0,2,55.0,0.0,1,0.0,0,0,1
2512,62.0,12.0,1.0,0,94.0,0.0,0,0.0,0,0,1
457,57.0,32.0,4.0,1,70.0,0.0,0,0.0,0,0,1
5505,27.0,31.0,1.0,0,85.0,95.0,0,0.0,0,0,0
2002,44.0,3.0,4.0,2,88.0,237.0,0,0.0,0,1,0
3306,26.0,7.0,4.0,2,14.0,98.0,1,0.0,0,0,1
3685,34.0,7.0,1.0,0,195.0,0.0,1,0.0,0,0,1
