In [130]:
import pandas as pd
import numpy as np

In [131]:
# specify the location to the data file
data_file = "German credit data.txt"
# define column names
var_names = [
    "a_status",            #1  Status of existing checking account
    "duration",            #2  Duration in month
    "c_history",           #3  Credit history
    "purpose",             #4  Purpose
    "c_amount",            #5  Credit amount
    "savings",             #6  Savings account/bonds
    "employment",          #7  Present employment since
    "inst_rate",           #8  Installment rate in percentage of disposable income
    "p_status",            #9  Personal status and sex
    "o_debtors",           #10 Other debtors / guarantors
    "residence",           #11 Present residence since
    "property",            #12 Property
    "age",                 #13 Age in years
    "o_inst_plans",        #14 Other isntallment plans
    "housing",             #15 Housing
    "e_credit",            #16 Number of existing credits at this bank
    "job",                 #17 Job
    "ppl_liable",          #18 Number of people being liable to provide maintenance for
    "telephone",           #19 Telephone
    "foreign",             #20 Foreign worker
    "class",               #21 Classification
]

# read from text file
df_data = pd.read_csv(data_file, sep=" ", names=var_names)

# examine data types
df_data.dtypes

a_status        object
duration         int64
c_history       object
purpose         object
c_amount         int64
savings         object
employment      object
inst_rate        int64
p_status        object
o_debtors       object
residence        int64
property        object
age              int64
o_inst_plans    object
housing         object
e_credit         int64
job             object
ppl_liable       int64
telephone       object
foreign         object
class            int64
dtype: object

In [132]:
# examine the top 10 rows of the data frame
df_data.head(10)

Unnamed: 0,a_status,duration,c_history,purpose,c_amount,savings,employment,inst_rate,p_status,o_debtors,...,property,age,o_inst_plans,housing,e_credit,job,ppl_liable,telephone,foreign,class
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
5,A14,36,A32,A46,9055,A65,A73,2,A93,A101,...,A124,35,A143,A153,1,A172,2,A192,A201,1
6,A14,24,A32,A42,2835,A63,A75,3,A93,A101,...,A122,53,A143,A152,1,A173,1,A191,A201,1
7,A12,36,A32,A41,6948,A61,A73,2,A93,A101,...,A123,35,A143,A151,1,A174,1,A192,A201,1
8,A14,12,A32,A43,3059,A64,A74,2,A91,A101,...,A121,61,A143,A152,1,A172,1,A191,A201,1
9,A12,30,A34,A40,5234,A61,A71,4,A94,A101,...,A123,28,A143,A152,2,A174,1,A191,A201,2


In [133]:
# encode string values to integers
from sklearn.preprocessing import LabelEncoder

# maintain a dict for string to integer mappings for each column
label_dict = dict()
for var_name in var_names:
    # only map string values
    if df_data[var_name].dtype == np.object:
        le = LabelEncoder()
        # map the string values
        df_data[var_name] = le.fit_transform(df_data[var_name])
        # store the mapping in the dict
        label_dict[var_name] = dict()
        for cls, label in zip(le.classes_, le.transform(le.classes_)):
            label_dict[var_name][label] = cls
label_dict

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df_data[var_name].dtype == np.object:


{'a_status': {0: 'A11', 1: 'A12', 2: 'A13', 3: 'A14'},
 'c_history': {0: 'A30', 1: 'A31', 2: 'A32', 3: 'A33', 4: 'A34'},
 'purpose': {0: 'A40',
  1: 'A41',
  2: 'A410',
  3: 'A42',
  4: 'A43',
  5: 'A44',
  6: 'A45',
  7: 'A46',
  8: 'A48',
  9: 'A49'},
 'savings': {0: 'A61', 1: 'A62', 2: 'A63', 3: 'A64', 4: 'A65'},
 'employment': {0: 'A71', 1: 'A72', 2: 'A73', 3: 'A74', 4: 'A75'},
 'p_status': {0: 'A91', 1: 'A92', 2: 'A93', 3: 'A94'},
 'o_debtors': {0: 'A101', 1: 'A102', 2: 'A103'},
 'property': {0: 'A121', 1: 'A122', 2: 'A123', 3: 'A124'},
 'o_inst_plans': {0: 'A141', 1: 'A142', 2: 'A143'},
 'housing': {0: 'A151', 1: 'A152', 2: 'A153'},
 'job': {0: 'A171', 1: 'A172', 2: 'A173', 3: 'A174'},
 'telephone': {0: 'A191', 1: 'A192'},
 'foreign': {0: 'A201', 1: 'A202'}}

In [134]:
# examine data types again
df_data.dtypes

a_status        int32
duration        int64
c_history       int32
purpose         int32
c_amount        int64
savings         int32
employment      int32
inst_rate       int64
p_status        int32
o_debtors       int32
residence       int64
property        int32
age             int64
o_inst_plans    int32
housing         int32
e_credit        int64
job             int32
ppl_liable      int64
telephone       int32
foreign         int32
class           int64
dtype: object

In [135]:
df_data.head(10)

Unnamed: 0,a_status,duration,c_history,purpose,c_amount,savings,employment,inst_rate,p_status,o_debtors,...,property,age,o_inst_plans,housing,e_credit,job,ppl_liable,telephone,foreign,class
0,0,6,4,4,1169,4,4,4,2,0,...,0,67,2,1,2,2,1,1,0,1
1,1,48,2,4,5951,0,2,2,1,0,...,0,22,2,1,1,2,1,0,0,2
2,3,12,4,7,2096,0,3,2,2,0,...,0,49,2,1,1,1,2,0,0,1
3,0,42,2,3,7882,0,3,2,2,2,...,1,45,2,2,1,2,2,0,0,1
4,0,24,3,0,4870,0,2,3,2,0,...,3,53,2,2,2,2,2,0,0,2
5,3,36,2,7,9055,4,2,2,2,0,...,3,35,2,2,1,1,2,1,0,1
6,3,24,2,3,2835,2,4,3,2,0,...,1,53,2,1,1,2,1,0,0,1
7,1,36,2,1,6948,0,2,2,2,0,...,2,35,2,0,1,3,1,1,0,1
8,3,12,2,4,3059,3,3,2,0,0,...,0,61,2,1,1,1,1,0,0,1
9,1,30,4,0,5234,0,0,4,3,0,...,2,28,2,1,2,3,1,0,0,2


In [136]:
#Partition the data set for cross validation

from sklearn.model_selection  import train_test_split

attributes = [col for col in var_names if col != "class"]

# partition: train/test = 70/30
train_x, test_x, train_y, test_y = train_test_split(df_data[attributes], df_data["class"], test_size=0.3, random_state=123)

# convert numpy arrays to data frames
df_train_x = pd.DataFrame(train_x, columns=attributes)
df_test_x = pd.DataFrame(test_x, columns=attributes)
df_train_y = pd.DataFrame(train_y, columns=["class"])
df_test_y = pd.DataFrame(test_y, columns=["class"])

print ("shapes")
print (df_train_x.shape)
print (df_test_x.shape)
print (df_train_y.shape)
print (df_test_y.shape)

print 
attributes
print ("class counts")
print (df_data["class"].value_counts())
print (df_train_y["class"].value_counts())
print (df_test_y["class"].value_counts())

shapes
(700, 20)
(300, 20)
(700, 1)
(300, 1)
class counts
1    700
2    300
Name: class, dtype: int64
1    500
2    200
Name: class, dtype: int64
1    200
2    100
Name: class, dtype: int64


# Feature selection must be performed using only training data!! ONLY!!

In [137]:
#Feature selection must be performed using only training data
# compute the f value and p value of the chi-squared test between each attribute and the class
from sklearn.feature_extraction.text import *
from sklearn.feature_selection import *

f_val, p_val = chi2(df_train_x, df_train_y["class"]) 

# print the Chi-squared valus and p values
df_scores = pd.DataFrame(zip(attributes, f_val, p_val), columns=["feature", "chi2", "p"])
df_scores["chi2"] = df_scores["chi2"].round(2)
df_scores["p"] = df_scores["p"].round(3)

# use features with p < 0.05
sel_ohe_cols = df_scores[df_scores["p"]<0.05]["feature"].values
print ("\nSelected features: %d" % len(sel_ohe_cols))
print (sel_ohe_cols)




Selected features: 8
['a_status' 'duration' 'c_history' 'c_amount' 'savings' 'employment'
 'property' 'age']


In [138]:
df_scores.sort_values("chi2", ascending = False)

Unnamed: 0,feature,chi2,p
4,c_amount,28703.88,0.0
1,duration,230.26,0.0
0,a_status,68.76,0.0
5,savings,31.07,0.0
12,age,20.28,0.0
2,c_history,13.76,0.0
11,property,5.81,0.016
6,employment,4.11,0.043
19,foreign,3.7,0.055
8,p_status,2.05,0.152


In [139]:
#Creating a model with feature selection
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from sklearn.svm import LinearSVC

clf = LinearSVC()
clf = clf.fit(df_train_x[sel_ohe_cols], train_y)
#df_importance = pd.DataFrame(zip(df_train_x.columns, clf.coef_[0]), columns=["feature", "weight"])
#df_importance.sort_values("weight", ascending=False)
pred_y = clf.predict(df_test_x[sel_ohe_cols])
print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))

f1:0.8
accuracy:0.6666666666666666
precision:1.0
recall:0.6666666666666666




In [140]:
#Without feature selection
from sklearn import tree
clf = tree.DecisionTreeClassifier()
# train model
clf = clf.fit(train_x, train_y)
# make prediction
pred_y = clf.predict(test_x)
# evaluate the prediction results

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))

f1:0.7543424317617866
accuracy:0.67
precision:0.76
recall:0.7487684729064039
