In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder

import statsmodels.api as sm

# Logistic Regression model
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


import sklearn
from sklearn.utils import resample

In [2]:
# # ROWS (all, or only some of them)
# pd.set_option('display.max_rows', None) # Showing all rows.
# pd.reset_option('display.max_rows') # Showing only some rows. 

**Begin the Modeling here**
- Look critically at the dtypes of numerical and categorical columns and make changes where appropriate.
- Concatenate numerical and categorical back together again for your X dataframe.  Designate the TargetB as y.
  - Split the data into a training set and a test set.
  - Split further into train_num and train_cat.  Also test_num and test_cat.
  - Scale the features either by using MinMax Scaler or a Standard Scaler. (train_num, test_num)
  - Encode the categorical features using One-Hot Encoding or Ordinal Encoding.  (train_cat, test_cat)
      - **fit** only on train data, transform both train and test
      - again re-concatenate train_num and train_cat as X_train as well as test_num and test_cat as X_test
  - Fit a logistic regression (classification) model on the training data.
  - Check the accuracy on the test data.

**Note**: So far we have not balanced the data.

Managing imbalance in the dataset

- Check for the imbalance.
- Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.
- Each time fit the model and see how the accuracy of the model has changed.

#### Importing data

In [3]:
numerical = pd.read_csv('numerical.csv')
numerical.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,60.0,5,9,0,0,39,34,18,10,...,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39
1,1,46.0,6,9,16,0,15,55,11,6,...,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,...,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60
3,0,70.0,1,4,2,0,23,14,31,3,...,7,2.0,11.0,10.0,9,6.8125,172556,1,4,41
4,0,78.0,3,2,60,1,28,9,53,26,...,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26


In [4]:
targets = pd.read_csv('target.csv')
targets.head()

Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


In [5]:
categorical = pd.read_csv('categorical.csv')
categorical.head()

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,...,37,12,92,8,94,2,95,12,89,11
1,CA,14,H,M,3,L,G,A,S,1,...,52,2,93,10,95,12,95,12,93,10
2,NC,43,U,M,3,L,E,C,R,2,...,0,2,91,11,92,7,95,12,90,1
3,CA,44,U,F,3,L,E,C,R,2,...,28,1,87,11,94,11,95,12,87,2
4,FL,16,H,F,3,L,F,A,S,2,...,20,1,93,10,96,1,96,1,79,3


#### Checking dtypes

In [6]:
categorical.dtypes

STATE           object
CLUSTER          int64
HOMEOWNR        object
GENDER          object
DATASRCE         int64
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B         int64
ODATEW_YR        int64
ODATEW_MM        int64
DOB_YR           int64
DOB_MM           int64
MINRDATE_YR      int64
MINRDATE_MM      int64
MAXRDATE_YR      int64
MAXRDATE_MM      int64
LASTDATE_YR      int64
LASTDATE_MM      int64
FIRSTDATE_YR     int64
FIRSTDATE_MM     int64
dtype: object

In [7]:
categorical["DATASRCE"].value_counts()
# This feature should be expressed as object. 
# It should be converted to object.

DATASRCE
3    64829
2    23455
1     7128
Name: count, dtype: int64

In [8]:
categorical["DATASRCE"] = categorical["DATASRCE"].astype(str)

In [9]:
categorical["DOB_YR"].value_counts()
# These features are expressed as integers. The birt_date, so age could have also a meaning here. Bigger age could mean, that someone can make bigger donation.
# All features connected to date I will keep as numbers (int or float), to avoid onehotencoding later one (these will create too many features)


DOB_YR
0     23661
20     1942
30     1923
48     1912
50     1890
      ...  
7         1
89        1
9         1
85        1
83        1
Name: count, Length: 96, dtype: int64

In [10]:
pd.set_option('display.max_rows', None) # Showing all rows.
numerical.dtypes

TCODE         int64
AGE         float64
INCOME        int64
WEALTH1       int64
HIT           int64
MALEMILI      int64
MALEVET       int64
VIETVETS      int64
WWIIVETS      int64
LOCALGOV      int64
STATEGOV      int64
FEDGOV        int64
WEALTH2       int64
POP901        int64
POP902        int64
POP903        int64
POP90C1       int64
POP90C2       int64
POP90C3       int64
POP90C4       int64
POP90C5       int64
ETH1          int64
ETH2          int64
ETH3          int64
ETH4          int64
ETH5          int64
ETH6          int64
ETH7          int64
ETH8          int64
ETH9          int64
ETH10         int64
ETH11         int64
ETH12         int64
ETH13         int64
ETH14         int64
ETH15         int64
ETH16         int64
AGE901        int64
AGE902        int64
AGE903        int64
AGE904        int64
AGE905        int64
AGE906        int64
AGE907        int64
CHIL1         int64
CHIL2         int64
CHIL3         int64
AGEC1         int64
AGEC2         int64
AGEC3         int64


In [11]:
pd.reset_option('display.max_rows') # Showing only some rows. 

In [12]:
# All features in numerical are integer or float.
# Even they are categorical, but have dtype integer or float, I will not convert dtype, as this will create too much additional features for the future model.

In [13]:
numerical.columns[numerical.dtypes == object]

Index([], dtype='object')

In [14]:
# All numerical columns are numerical (no column has dtype object)
# WEALTH 
# INCOME should be consideres as a categorical value, because there are only very few values here

# are actually interpreted as categorical features. although having dtype as integer. 
# But I decide not to change the dtype, becasue this way we would have need 

In [15]:
numerical["INCOME"].value_counts()

INCOME
5    36737
2    13114
4    12732
1     9022
3     8558
6     7778
7     7471
Name: count, dtype: int64

In [16]:
numerical["WEALTH1"].value_counts()

WEALTH1
9    52317
8     6793
7     6198
6     5825
5     5280
4     4810
3     4237
2     4085
1     3454
0     2413
Name: count, dtype: int64

In [17]:
numerical.columns

Index(['TCODE', 'AGE', 'INCOME', 'WEALTH1', 'HIT', 'MALEMILI', 'MALEVET',
       'VIETVETS', 'WWIIVETS', 'LOCALGOV',
       ...
       'CARDGIFT', 'MINRAMNT', 'MAXRAMNT', 'LASTGIFT', 'TIMELAG', 'AVGGIFT',
       'CONTROLN', 'HPHONE_D', 'RFA_2F', 'CLUSTER2'],
      dtype='object', length=315)

In [18]:
categorical["DOMAIN_B"].value_counts()

DOMAIN_B
2    48356
1    28498
3    16754
4     1804
Name: count, dtype: int64

In [19]:
categorical['DOMAIN_B'].replace({1: 4, 2: 3, 3: 2, 4: 1}, inplace=True)

In [20]:
categorical["DOMAIN_B"].value_counts()

DOMAIN_B
3    48356
4    28498
2    16754
1     1804
Name: count, dtype: int64

In [21]:
# Checking potential features, which are not numerical, but actually could be categorical

# numer_values =[]
# for i in numerical.columns:
#     unique_values = numerical[i].nunique()
#     if unique_values <=10: 
#         numer_values.append(i)

# numer_values

# this above code check the number of unique values of each 
# numerical columns and when the unique values are less than or equal to 10 , we will convert it into category

In [22]:
# Those columns should be treated as categorical data, not numerical.
# data type of those features should be converted to object 

In [23]:
# numerical["INCOME"] = numerical["INCOME"].astype(str)
# numerical["WEALTH1"] = numerical["WEALTH1"].astype(str)
# numerical["WEALTH2"] = numerical["WEALTH2"].astype(str)
# numerical["MHUC2"] = numerical["MHUC2"].astype(str)
# numerical["HPHONE_D"] = numerical["HPHONE_D"].astype(str)
# numerical["RFA_2F"] = numerical["RFA_2F"].astype(str)

In [24]:
# numerical.columns[numerical.dtypes == object]

#### Concatenate numerical and categorical back together again for your X dataframe. Designate the TargetB as y

In [25]:
X = pd.concat((numerical,categorical),axis=1)

In [26]:
X.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,0,60.0,5,9,0,0,39,34,18,10,...,37,12,92,8,94,2,95,12,89,11
1,1,46.0,6,9,16,0,15,55,11,6,...,52,2,93,10,95,12,95,12,93,10
2,1,61.611649,3,1,2,0,20,29,33,6,...,0,2,91,11,92,7,95,12,90,1
3,0,70.0,1,4,2,0,23,14,31,3,...,28,1,87,11,94,11,95,12,87,2
4,0,78.0,3,2,60,1,28,9,53,26,...,20,1,93,10,96,1,96,1,79,3


In [27]:
y = targets.drop(columns=['TARGET_D'])

In [28]:
y.head()

Unnamed: 0,TARGET_B
0,0
1,0
2,0
3,0
4,0


#### Split the data into a training set and a test set

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [30]:
X_train.shape

(71559, 337)

#### Split further into train_num and train_cat. Also test_num and test_cat.

In [31]:
# numerical/categorical on train set
X_train_num = X_train.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(include = object)

# numerical/categorical on test set
X_test_num = X_test.select_dtypes(include = np.number)
X_test_cat = X_test.select_dtypes(include = object)

In [32]:
display(X_train_num.shape)
display(X_train_cat.shape)

(71559, 329)

(71559, 8)

#### Scale the features either by using MinMax Scaler or a Standard Scaler. (train_num, test_num)

In [33]:
# MinMaxScaler
transformer = MinMaxScaler().fit(X_train_num)

In [34]:
X_train_normalized = transformer.transform(X_train_num)
df_X_train_normalized = pd.DataFrame(X_train_normalized, columns=X_train_num.columns)
df_X_train_normalized.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,0.0,0.670103,0.5,0.0,0.004149,0.0,0.292929,0.151515,0.484848,0.080808,...,0.329897,0.0,0.863636,0.818182,0.888889,0.545455,0.5,0.181818,0.979167,0.818182
1,0.0,0.624862,0.666667,1.0,0.0,0.010101,0.373737,0.333333,0.333333,0.080808,...,0.0,0.090909,0.545455,0.727273,0.833333,0.636364,0.0,1.0,0.895833,0.727273
2,1.7e-05,0.443299,0.666667,0.444444,0.112033,0.0,0.323232,0.393939,0.40404,0.050505,...,0.556701,0.0,0.909091,0.0,0.888889,1.0,0.0,1.0,0.989583,0.0
3,0.0,0.43299,0.833333,1.0,0.053942,0.010101,0.454545,0.373737,0.111111,0.050505,...,0.556701,0.545455,0.772727,0.181818,0.944444,0.0,0.5,0.0,0.90625,0.272727
4,3.4e-05,0.845361,0.666667,0.444444,0.016598,0.0,0.464646,0.262626,0.515152,0.141414,...,0.154639,0.090909,0.545455,0.909091,0.833333,0.727273,0.0,0.909091,0.895833,1.0


In [35]:
X_test_normalized = transformer.transform(X_test_num)
df_X_test_normalized = pd.DataFrame(X_test_normalized, columns=X_test_num.columns)
df_X_test_normalized.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,0.000483,0.659794,0.833333,1.0,0.0,0.0,0.242424,0.424242,0.222222,0.090909,...,0.329897,0.636364,0.727273,0.818182,0.888889,0.909091,0.0,0.909091,0.947917,0.727273
1,1.7e-05,0.587629,0.666667,1.0,0.0,0.0,0.171717,0.282828,0.424242,0.10101,...,0.412371,0.363636,0.681818,0.636364,0.888889,0.0,0.0,0.272727,0.90625,0.272727
2,1.7e-05,0.463918,1.0,0.888889,0.004149,0.0,0.282828,0.444444,0.222222,0.020202,...,0.536082,0.181818,0.727273,0.818182,0.888889,0.818182,0.0,0.818182,0.947917,0.818182
3,0.0,0.624862,0.666667,1.0,0.0,0.0,0.333333,0.373737,0.494949,0.080808,...,0.0,0.090909,0.727273,0.090909,0.555556,0.090909,0.5,0.090909,0.895833,0.909091
4,0.0,0.463918,0.333333,0.222222,0.0,0.494949,0.161616,0.323232,0.030303,0.090909,...,0.536082,0.0,0.954545,0.090909,0.944444,0.090909,0.5,0.090909,1.0,0.090909


#### Encode the categorical features using One-Hot Encoding or Ordinal Encoding. (train_cat, test_cat)

In [36]:
X_train_cat

Unnamed: 0,STATE,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A
4398,TX,H,F,3,L,F,D,R
90538,other,H,F,3,L,F,D,T
16147,WI,H,M,3,L,F,C,T
65008,other,H,F,3,L,F,B,T
52515,WA,H,M,3,L,E,C,C
...,...,...,...,...,...,...,...,...
21243,WI,H,F,3,L,E,B,S
45891,GA,U,F,3,L,F,D,T
42613,other,H,M,3,L,D,D,R
43567,FL,H,M,3,L,G,C,T


In [37]:
X_train_cat["RFA_2R"].unique()

# This feature should be dropped from train and test

array(['L'], dtype=object)

In [38]:
X_train_cat = X_train_cat.drop(["RFA_2R"], axis=1)
X_test_cat = X_test_cat.drop(["RFA_2R"], axis=1)

In [39]:
X_train_cat.shape

(71559, 7)

In [40]:
X_test_cat.shape

(23853, 7)

In [41]:
# OneHot Encoding

In [42]:
encoder = OneHotEncoder(drop='first').fit(X_train_cat)

In [43]:
# Getting names of columns to be able to label features
cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)

In [44]:
# Running encoder on a X_train_cat
X_train_cat_encoded = encoder.transform(X_train_cat).toarray()
df_X_train_cat_encoded = pd.DataFrame(X_train_cat_encoded, columns=cols)
df_X_train_cat_encoded.head()

Unnamed: 0,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Running encoder on a X_test_cat
X_test_cat_encoded = encoder.transform(X_test_cat).toarray()
df_X_test_cat_encoded = pd.DataFrame(X_test_cat_encoded, columns=cols)
df_X_test_cat_encoded.head()

Unnamed: 0,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Re-concatenate train_num and train_cat as X_train as well as test_num and test_cat as X_test

In [47]:
X_train_transformed = pd.concat([df_X_train_normalized, df_X_train_cat_encoded], axis=1) 

X_train_transformed.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.670103,0.5,0.0,0.004149,0.0,0.292929,0.151515,0.484848,0.080808,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.624862,0.666667,1.0,0.0,0.010101,0.373737,0.333333,0.333333,0.080808,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.7e-05,0.443299,0.666667,0.444444,0.112033,0.0,0.323232,0.393939,0.40404,0.050505,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.43299,0.833333,1.0,0.053942,0.010101,0.454545,0.373737,0.111111,0.050505,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3.4e-05,0.845361,0.666667,0.444444,0.016598,0.0,0.464646,0.262626,0.515152,0.141414,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [49]:
X_test_transformed = pd.concat([df_X_test_normalized, df_X_test_cat_encoded], axis=1)
X_test_transformed.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000483,0.659794,0.833333,1.0,0.0,0.0,0.242424,0.424242,0.222222,0.090909,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.7e-05,0.587629,0.666667,1.0,0.0,0.0,0.171717,0.282828,0.424242,0.10101,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.7e-05,0.463918,1.0,0.888889,0.004149,0.0,0.282828,0.444444,0.222222,0.020202,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.624862,0.666667,1.0,0.0,0.0,0.333333,0.373737,0.494949,0.080808,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.463918,0.333333,0.222222,0.0,0.494949,0.161616,0.323232,0.030303,0.090909,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Fit a logistic regression (classification) model on the training data

In [50]:
# Creating a Logistic model
LR = LogisticRegression(random_state=0, solver='lbfgs')

In [51]:
LR.fit(X_train_transformed, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Check the accuracy on the test data.

In [52]:
pred = LR.predict(X_test_transformed)

In [53]:
LR.score(X_test_transformed, y_test) # accuracy

0.9474279964784303

In [54]:
print("precision: ",precision_score(y_test,pred))  # Not that important
print("recall: ",recall_score(y_test,pred))  # Should be possibly high
print("f1: ",f1_score(y_test,pred))

precision:  0.0
recall:  0.0
f1:  0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred)

array([[22599,     0],
       [ 1254,     0]], dtype=int64)

### Managing imbalance in the dataset

#### Check for the imbalance.

In [56]:
targets['TARGET_B'].value_counts()

TARGET_B
0    90569
1     4843
Name: count, dtype: int64

In [None]:
# we see the TARGET_B is very imbalanced

### Resampling

#### Downsampling

In [75]:
X_train_transformed.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.670103,0.5,0.0,0.004149,0.0,0.292929,0.151515,0.484848,0.080808,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.624862,0.666667,1.0,0.0,0.010101,0.373737,0.333333,0.333333,0.080808,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.7e-05,0.443299,0.666667,0.444444,0.112033,0.0,0.323232,0.393939,0.40404,0.050505,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.43299,0.833333,1.0,0.053942,0.010101,0.454545,0.373737,0.111111,0.050505,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3.4e-05,0.845361,0.666667,0.444444,0.016598,0.0,0.464646,0.262626,0.515152,0.141414,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [78]:
y_train = y_train.reset_index(drop=True)

In [79]:
y_train

Unnamed: 0,TARGET_B
0,0
1,0
2,0
3,0
4,0
...,...
71554,0
71555,0
71556,1
71557,0


In [80]:
data_b = pd.concat([X_train_transformed, y_train], axis=1)
data_b

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,0.000000,0.670103,0.500000,0.000000,0.004149,0.000000,0.292929,0.151515,0.484848,0.080808,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
1,0.000000,0.624862,0.666667,1.000000,0.000000,0.010101,0.373737,0.333333,0.333333,0.080808,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
2,0.000017,0.443299,0.666667,0.444444,0.112033,0.000000,0.323232,0.393939,0.404040,0.050505,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
3,0.000000,0.432990,0.833333,1.000000,0.053942,0.010101,0.454545,0.373737,0.111111,0.050505,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,0.000034,0.845361,0.666667,0.444444,0.016598,0.000000,0.464646,0.262626,0.515152,0.141414,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71554,0.000000,0.711340,0.333333,1.000000,0.020747,0.020202,0.373737,0.101010,0.323232,0.262626,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
71555,0.000034,0.670103,0.333333,0.666667,0.000000,0.000000,0.363636,0.111111,0.626263,0.040404,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
71556,0.000017,0.624862,0.666667,1.000000,0.000000,0.000000,0.353535,0.292929,0.424242,0.171717,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
71557,0.000017,0.773196,0.666667,0.333333,0.004149,0.000000,0.595960,0.222222,0.313131,0.030303,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0


In [81]:
category_0 = data_b[data_b['TARGET_B'] == 0]
category_1 = data_b[data_b['TARGET_B'] == 1]

In [82]:
category_0_undersampled = resample(category_0,
                                   replace=False,                    # replace=False means all rows will be unique
                                   n_samples = len(category_1))      # how many we need, so the same as category_1

In [83]:
print(category_0_undersampled.shape)
print(category_1.shape)

(3589, 356)
(3589, 356)


In [84]:
data_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)

In [85]:
data_downsampled['TARGET_B'].value_counts()

TARGET_B
0    3589
1    3589
Name: count, dtype: int64

In [86]:
X_d = data_downsampled.drop(['TARGET_B'], axis=1)
y_d = data_downsampled['TARGET_B']

In [87]:
X_d.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
38855,3.4e-05,0.624862,0.0,0.222222,0.0,0.010101,0.30303,0.393939,0.262626,0.010101,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
23712,1.7e-05,0.762887,0.5,0.888889,0.008299,0.0,0.373737,0.40404,0.323232,0.131313,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
47047,0.0,0.268041,0.666667,1.0,0.0,0.0,0.393939,0.282828,0.626263,0.151515,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
39123,0.000483,0.624862,0.666667,0.666667,0.0,0.010101,0.272727,0.444444,0.121212,0.050505,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
19192,0.0,0.597938,1.0,0.777778,0.0,0.0,0.343434,0.070707,0.636364,0.10101,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [89]:
y_d

38855    0
23712    0
47047    0
39123    0
19192    0
        ..
71491    1
71506    1
71530    1
71531    1
71556    1
Name: TARGET_B, Length: 7178, dtype: int64

In [93]:
# Creating a Logistic model for downs
LR_d = LogisticRegression(random_state=0, solver='lbfgs')

In [94]:
LR_d.fit(X_d, y_d)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [95]:
pred_d = LR_d.predict(X_test_transformed)

In [96]:
LR_d.score(X_test_transformed, y_test) # accuracy

0.5943487192386702

In [97]:
print("precision: ",precision_score(y_test,pred_d))  # Not that important
print("recall: ",recall_score(y_test,pred_d))  # Should be possibly high
print("f1: ",f1_score(y_test,pred_d))

precision:  0.06986721144024514
recall:  0.5454545454545454
f1:  0.12386816370880117


In [98]:
confusion_matrix(y_test, pred_d)

array([[13493,  9106],
       [  570,   684]], dtype=int64)