In [20]:
import os

import numpy as np
import pandas as pd

## Categorical variables with the best chi2 scores 

In [21]:
from sklearn.feature_selection import chi2, SelectKBest

In [22]:
# Load the raw credit data. Every column except the last is used as a
# feature; the 'class' column is the target for the chi2 scoring below.
data = pd.read_csv('credit.csv')
# Take an explicit copy of the feature slice: `x` is integer-coded in place
# further down, and a copy guarantees that neither touches `data` nor
# triggers pandas' chained-assignment warning.
x = data.iloc[:, :-1].copy()
y = data['class']

# Names of the string-valued (categorical) feature columns to score.
string = [
    'credit_history', 'purpose', 'savings_status',
    'employment', 'personal_status', 'other_parties', 'property_magnitude',
    'other_payment_plans', 'housing', 'job', 'own_telephone',
    'foreign_worker',
]

In [23]:
# Categorize all the string features for chi2
def myohc(df, var):
    """Integer-code the categorical column ``var`` of ``df`` in place.

    Despite the name, this is label (ordinal) encoding rather than one-hot
    encoding: each distinct value is replaced by the order in which it first
    appears in the column (0 for the first value seen, 1 for the next new
    value, and so on).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The column is overwritten; nothing is returned.
    var : str
        Name of the column to encode.
    """
    # Series.unique() preserves first-appearance order, so enumerate() gives
    # the same codes as indexing into the uniques array position by position.
    thismap = {value: code for code, value in enumerate(df[var].unique())}

    # Map explicitly instead of calling DataFrame.replace(): replace() only
    # produced an integer column through pandas' automatic downcasting of
    # object results, which is deprecated.
    df[var] = df[var].map(thismap)
    
# Integer-code each categorical column of `x` in turn, then preview the result.
for column_name in string:
    myohc(x, column_name)

x.loc[:, string].head()

Unnamed: 0,credit_history,purpose,savings_status,employment,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,foreign_worker
0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,1,0,0,0,0,0,1,0
2,0,1,1,2,0,0,0,0,0,1,1,0
3,1,2,1,2,0,1,1,0,1,0,1,0
4,2,3,1,1,0,0,2,0,1,0,1,0


In [24]:
# Rank the integer-coded categorical columns against the target with chi2
# and keep the nine best-scoring ones.
# NOTE(review): chi2 treats the codes as non-negative magnitudes, so the
# scores presumably depend on which arbitrary integer each category was
# given -- consider scoring one-hot columns instead; confirm before relying
# on this ranking.
selector = SelectKBest(chi2, k=9)
selector.fit(x[string], y)
which = selector.get_support()
feat = np.array(string)[which]
print('these are the top 9 string features by chi2:')
print(feat)

these are the top 9 string features by chi2:
['credit_history' 'purpose' 'employment' 'personal_status' 'other_parties'
 'property_magnitude' 'other_payment_plans' 'housing' 'foreign_worker']


Three features (checking_status, employment, property_magnitude) are being converted into numerical features. own_telephone will be dropped because it did not appear in the top 9 features.

The categorical features which I HAVE to preprocess and use in my model are:
* credit history (5 values)
* purpose (10 values)
* other payment plans (3 values)
* housing (3 values)
* foreign worker (2 values)
* personal_status (4 values)

The features with 2 or 3 values can be one-hot encoded, but credit history and purpose have too many unique values to be one-hot encoded. They would cause the curse of dimensionality. I am barely comfortable one-hot encoding just these 3 features because they amount to 2+3+3=8 features.

To avoid the curse of dimensionality, I can encode the larger features with:

* Frequency of value

* Mean of target variable

* Sum of target variable

In [25]:
# Show how often each category occurs in the three higher-cardinality features.
for column_name in ('credit_history', 'personal_status', 'purpose'):
    counts = data[column_name].value_counts()
    print(column_name)
    print(counts, '\n')

credit_history
'existing paid'                     530
'critical/other existing credit'    293
'delayed previously'                 88
'all paid'                           49
'no credits/all paid'                40
Name: credit_history, dtype: int64 

personal_status
'male single'           548
'female div/dep/mar'    310
'male mar/wid'           92
'male div/sep'           50
Name: personal_status, dtype: int64 

purpose
buy_radio_tv               280
buy_new_car                234
buy_furniture_equipment    181
buy_used_car               103
business                    97
education                   50
repairs                     22
buy_domestic_appliance      12
other                       12
retraining                   9
Name: purpose, dtype: int64 



## Dropping redundant variables

In [26]:
# Categorical features carried forward into the encoded dataset.
feat_keep = [
    'credit_history',
    'purpose',
    'personal_status',
    'other_payment_plans',
    'housing',
    'foreign_worker',
]

In [27]:
# Re-read the raw file and keep only the retained features plus the target.
features_keep = [*feat_keep, 'class']
credit = pd.read_csv('credit.csv').loc[:, features_keep]

In [28]:
# Encode the target as integers: 'good' -> 0, 'bad' -> 1.
# A per-value lookup is used instead of DataFrame.replace(), whose automatic
# object-to-integer downcasting is deprecated in recent pandas. Values other
# than 'good'/'bad' (for example 0/1 when this cell is re-run) pass through
# unchanged, just as replace() left them.
class_codes = {'good': 0, 'bad': 1}
credit['class'] = credit['class'].map(lambda label: class_codes.get(label, label))

## One hot encoding

In [29]:
# One-hot encoding helper: the source column is replaced by its dummy columns.

def onehotencode(df, feat):
    """Return a new frame in which ``feat`` is swapped for one-hot columns.

    The dummy columns are named ``<feat>_<value>`` and are appended after the
    remaining columns. ``df`` itself is left untouched.
    """
    remaining = df.drop(columns=feat)
    dummies = pd.get_dummies(df[feat], prefix=feat)
    return pd.concat([remaining, dummies], axis=1)

In [30]:
# Low-cardinality features that are expanded into dummy columns.
ohc = ['housing', 'other_payment_plans', 'foreign_worker']

for feature_name in ohc:
    credit = onehotencode(credit, feature_name)

credit.columns

Index(['credit_history', 'purpose', 'personal_status', 'class',
       'housing_'for free'', 'housing_own', 'housing_rent',
       'other_payment_plans_bank', 'other_payment_plans_none',
       'other_payment_plans_stores', 'foreign_worker_no',
       'foreign_worker_yes'],
      dtype='object')

## Converting to means

In [31]:
# Mean (target) encoding: the column keeps its name but its values are
# overwritten with per-category target means.

def meanencode(df, feat, target='class'):
    """Return a copy of ``df`` with ``feat`` replaced by per-category target means.

    Each row's value in ``feat`` becomes the mean of ``target`` over all rows
    that share the same original ``feat`` value. The column is overwritten in
    the returned copy, not dropped, and ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``feat`` and ``target``.
    feat : str
        Categorical column to encode.
    target : str, default 'class'
        Numeric column whose group means are used as the encoding.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``.

    NOTE(review): the means are computed from every row passed in, so calling
    this before a train/test split lets target information leak into the
    feature -- confirm that this is intended.
    """
    new_df = df.copy()
    # Select the target as a Series so a Series, not a one-column DataFrame,
    # is assigned back to the feature column.
    new_df[feat] = df.groupby(feat)[target].transform('mean')

    return new_df

In [32]:
# Features whose categories are replaced by the mean of the target.
means = ['personal_status', 'credit_history']

for feature_name in means:
    credit = meanencode(credit, feature_name)

credit.columns

Index(['credit_history', 'purpose', 'personal_status', 'class',
       'housing_'for free'', 'housing_own', 'housing_rent',
       'other_payment_plans_bank', 'other_payment_plans_none',
       'other_payment_plans_stores', 'foreign_worker_no',
       'foreign_worker_yes'],
      dtype='object')

## Purpose column

In [33]:
# Working with reason data
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
# List the distinct purpose labels before splitting them into words.
pd.unique(credit['purpose'])

array(['buy_radio_tv', 'education', 'buy_furniture_equipment',
       'buy_new_car', 'buy_used_car', 'business',
       'buy_domestic_appliance', 'repairs', 'other', 'retraining'],
      dtype=object)

In [35]:
# Turn each purpose label into space-separated words
# ('buy_new_car' -> 'buy new car') so CountVectorizer can tokenise it into
# one count column per word.
vec = CountVectorizer()

l1 = lambda x: x.replace('_', ' ')
credit['purpose'] = credit['purpose'].map(l1)

word_counts = vec.fit_transform(credit['purpose'])
dummy_matrix = word_counts.toarray()

text = pd.DataFrame(data=dummy_matrix, columns=vec.get_feature_names_out())
text.head()

Unnamed: 0,appliance,business,buy,car,domestic,education,equipment,furniture,new,other,radio,repairs,retraining,tv,used
0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0


In [36]:
# Score every purpose-word column against the target with chi2 and keep
# the three best-scoring words.
x = text
y = credit['class']

selector = SelectKBest(chi2, k=3).fit(x, y)
which = selector.get_support()
feat = x.columns.to_numpy()[which]
print('these are the top 3 string features by chi2:')
print(feat)

these are the top 3 string features by chi2:
['radio' 'tv' 'used']


In [37]:
# Swap the raw purpose text for the three selected word-count columns.
# NOTE(review): among the purpose labels, 'radio' and 'tv' only occur
# together (in 'buy radio tv'), so purpose_radio and purpose_tv carry
# identical values -- consider keeping just one of them.
credit = credit.drop(columns='purpose').assign(
    purpose_radio=text['radio'],
    purpose_tv=text['tv'],
    purpose_used=text['used'],
)
credit.columns

Index(['credit_history', 'personal_status', 'class', 'housing_'for free'',
       'housing_own', 'housing_rent', 'other_payment_plans_bank',
       'other_payment_plans_none', 'other_payment_plans_stores',
       'foreign_worker_no', 'foreign_worker_yes', 'purpose_radio',
       'purpose_tv', 'purpose_used'],
      dtype='object')

In [38]:
# Persist the encoded categorical features. The output folder is created
# first so that a fresh checkout without a data/ directory does not fail here.
os.makedirs('data', exist_ok=True)
credit.to_csv('data/data_categorical.csv', index=False)