In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from sklearn.utils import resample

In [11]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)

In [14]:
categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv', low_memory=False)

# 1. Preparation

### 1.1 Look critically at the dtypes of numerical and categorical columns and make changes where appropriate.

In [32]:
# numerical

In [22]:
numerical['INCOME'] = numerical['INCOME'].astype(int)
numerical['WEALTH2'] = numerical['WEALTH2'].astype(int)
numerical['MSA'] = numerical['MSA'].astype(int)
numerical['RAMNTALL'] = numerical['RAMNTALL'].astype(int)
numerical['MAXRAMNT'] = numerical['MAXRAMNT'].astype(int)
numerical['TARGET_D'] = numerical['TARGET_D'].astype(int)
numerical['CLUSTER2'] = numerical['CLUSTER2'].astype(int)

In [65]:
# categorical

### 1.2 Concatenate numerical and categorical back together again for your X dataframe.

In [28]:
data = pd.concat([numerical, categorical], axis=1)

In [34]:
# data

### 1.3 Designate the TargetB as y.

In [36]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)

### 1.4 Split the data into a training set and a test set.

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 1.5 Split further into train_num and train_cat.  Also test_num and test_cat.

In [41]:
train_num = X_train.select_dtypes(include='number')
train_cat = X_train.select_dtypes(include='object')

test_num = X_test.select_dtypes(include='number')
test_cat = X_test.select_dtypes(include='object')

### 1.6 Scale the features either by using MinMax Scaler or a Standard Scaler. (train_num, test_num)

In [43]:
minmax_fit = MinMaxScaler().fit(train_num)

In [46]:
train_num_trans = minmax_fit.transform(train_num)
test_num_trans = minmax_fit.transform(test_num)

train_num_trans = pd.DataFrame(train_num_trans, columns=train_num.columns)
test_num_trans = pd.DataFrame(test_num_trans, columns=train_num.columns)

### 1.7 Encode the categorical features using One-Hot Encoding or Ordinal Encoding. 

In [66]:
onehot_fit = OneHotEncoder(drop='first', handle_unknown='ignore').fit(train_cat)

In [67]:
train_cat_enc = onehot_fit.transform(train_cat).toarray()
train_cat_enc = pd.DataFrame(train_cat_enc, columns=onehot_fit.get_feature_names_out(input_features=train_cat.columns))

test_cat_enc = onehot_fit.transform(test_cat).toarray()
test_cat_enc = pd.DataFrame(test_cat_enc, columns=onehot_fit.get_feature_names_out(input_features=train_cat.columns))



### 1.8 Re-concatenate train_num and train_cat as X_train as well as test_num and test_cat as X_test

In [69]:
X_train = pd.concat([train_cat_enc, train_num_trans], axis=1)
X_test = pd.concat([test_cat_enc, test_num_trans], axis=1)

In [72]:
# X_train.shape

In [73]:
# X_test.shape

### 1.9 Fit a logistic regression model on the training data.

In [77]:
model = LogisticRegression().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [78]:
y_pred = model.predict(X_test)

In [79]:
accuracy_score(y_test, y_pred)

0.9793009484881832

> The accuracy score is pretty high, although we shouldn´t rely just on this metric.

# 2. Imbalance

### 2.1 Check for the imbalance.

In [84]:
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [87]:
category_0 = data[data['TARGET_B'] == 0]
category_1 = data[data['TARGET_B'] == 1]

### 2.2 Strategy 1: Downsampling

In [88]:
category_0_undersampled = resample(category_0,
                                   replace=False,
                                   n_samples = len(category_1))

In [89]:
data_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)

In [91]:
y = data_downsampled['TARGET_B']
X = data_downsampled.drop(['TARGET_B'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_num = X_train.select_dtypes(include='number')
train_cat = X_train.select_dtypes(include='object')

test_num = X_test.select_dtypes(include='number')
test_cat = X_test.select_dtypes(include='object')

minmax_fit = MinMaxScaler().fit(train_num)

train_num_trans = minmax_fit.transform(train_num)
test_num_trans = minmax_fit.transform(test_num)

train_num_trans = pd.DataFrame(train_num_trans, columns=train_num.columns)
test_num_trans = pd.DataFrame(test_num_trans, columns=train_num.columns)

onehot_fit = OneHotEncoder(drop='first', handle_unknown='ignore').fit(train_cat)

train_cat_enc = onehot_fit.transform(train_cat).toarray()
train_cat_enc = pd.DataFrame(train_cat_enc, columns=onehot_fit.get_feature_names_out(input_features=train_cat.columns))

test_cat_enc = onehot_fit.transform(test_cat).toarray()
test_cat_enc = pd.DataFrame(test_cat_enc, columns=onehot_fit.get_feature_names_out(input_features=train_cat.columns))

X_train = pd.concat([train_cat_enc, train_num_trans], axis=1)
X_test = pd.concat([test_cat_enc, test_num_trans], axis=1)

model = LogisticRegression().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9081527347781218

In [92]:
y_pred = model.predict(X_test)

In [93]:
accuracy_score(y_test, y_pred)

0.9081527347781218

### 2.3 Strategy 2: Upsampling

In [94]:
category_1_oversampled = resample(category_1,
                                  replace=True,
                                  n_samples = len(category_0))

In [95]:
data_upsampled = pd.concat([category_0, category_1_oversampled], axis=0)

In [96]:
y = data_upsampled['TARGET_B']
X = data_upsampled.drop(['TARGET_B'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_num = X_train.select_dtypes(include='number')
train_cat = X_train.select_dtypes(include='object')

test_num = X_test.select_dtypes(include='number')
test_cat = X_test.select_dtypes(include='object')

minmax_fit = MinMaxScaler().fit(train_num)

train_num_trans = minmax_fit.transform(train_num)
test_num_trans = minmax_fit.transform(test_num)

train_num_trans = pd.DataFrame(train_num_trans, columns=train_num.columns)
test_num_trans = pd.DataFrame(test_num_trans, columns=train_num.columns)

onehot_fit = OneHotEncoder(drop='first', handle_unknown='ignore').fit(train_cat)

train_cat_enc = onehot_fit.transform(train_cat).toarray()
train_cat_enc = pd.DataFrame(train_cat_enc, columns=onehot_fit.get_feature_names_out(input_features=train_cat.columns))

test_cat_enc = onehot_fit.transform(test_cat).toarray()
test_cat_enc = pd.DataFrame(test_cat_enc, columns=onehot_fit.get_feature_names_out(input_features=train_cat.columns))

X_train = pd.concat([train_cat_enc, train_num_trans], axis=1)
X_test = pd.concat([test_cat_enc, test_num_trans], axis=1)

model = LogisticRegression().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [97]:
y_pred = model.predict(X_test)

In [98]:
accuracy_score(y_test, y_pred)

0.9950314673733024