# Chapter 3 Regression with imputation

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df = pd.read_csv('pva97nk_ch3_result1.csv') #This data has NaN values.
df.head()

Unnamed: 0,TargetB,GiftCnt36,GiftCntAll,GiftCntCard36,GiftCntCardAll,GiftAvgLast,GiftAvg36,GiftAvgAll,GiftAvgCard36,GiftTimeLast,GiftTimeFirst,PromCnt12,PromCnt36,PromCntAll,PromCntCard12,PromCntCard36,PromCntCardAll,StatusCat96NK,StatusCatStarAll,DemCluster,DemAge,DemGender,DemHomeOwner,DemMedHomeValue,DemPctVeterans,DemMedIncome
0,0.0,2.0,4.0,1.0,3.0,17.0,13.5,9.25,17.0,21.0,66.0,8.0,17.0,26.0,3.0,8.0,13.0,1,0.0,0,,1,2,0.0,0.0,
1,0.0,1.0,8.0,0.0,3.0,20.0,20.0,15.88,,26.0,92.0,14.0,35.0,79.0,5.0,5.0,24.0,1,0.0,2,67.0,1,2,186800.0,85.0,
2,1.0,6.0,41.0,3.0,20.0,6.0,5.17,3.73,5.0,18.0,111.0,12.0,23.0,51.0,5.0,11.0,22.0,2,1.0,0,,2,2,87600.0,36.0,38750.0
3,1.0,3.0,12.0,3.0,8.0,10.0,8.67,8.5,8.67,9.0,93.0,14.0,22.0,44.0,2.0,6.0,16.0,5,1.0,0,,2,2,139200.0,27.0,38942.0
4,0.0,1.0,1.0,1.0,1.0,20.0,20.0,20.0,20.0,21.0,21.0,10.0,15.0,13.0,4.0,7.0,6.0,3,0.0,3,53.0,2,2,168100.0,37.0,71509.0


In [2]:
df.isnull().sum()

TargetB                0
GiftCnt36              0
GiftCntAll             0
GiftCntCard36          0
GiftCntCardAll         0
GiftAvgLast            0
GiftAvg36              0
GiftAvgAll             0
GiftAvgCard36       1780
GiftTimeLast           0
GiftTimeFirst          0
PromCnt12              0
PromCnt36              0
PromCntAll             0
PromCntCard12          0
PromCntCard36          0
PromCntCardAll         0
StatusCat96NK          0
StatusCatStarAll       0
DemCluster             0
DemAge              2407
DemGender              0
DemHomeOwner           0
DemMedHomeValue        0
DemPctVeterans         0
DemMedIncome        2357
dtype: int64

In [3]:
# Finding the columns which have null values.
df.isna().any()[lambda x: x]

GiftAvgCard36    True
DemAge           True
DemMedIncome     True
dtype: bool

# Making Imputation Indicators Manually (P 4-21)
This process is too ineffienct to do manually. 
I only recorded the codes for manual imputation for your information.
### Please note that automatic imputation by SimpleImputer is given after this part.

In [4]:
# Making imputation Indicator columns
cols = ['GiftAvgCard36','DemAge','DemMedIncome']
dfm_total = df[cols].isnull().astype(int).add_suffix('_indicator')
dfm_total.head()

Unnamed: 0,GiftAvgCard36_indicator,DemAge_indicator,DemMedIncome_indicator
0,0,1,1
1,1,0,1
2,0,1,0
3,0,1,0
4,0,0,0


In [5]:
dfm_total['GiftAvgCard36_indicator'].value_counts(dropna=False)

0    7906
1    1780
Name: GiftAvgCard36_indicator, dtype: int64

In [6]:
dfm_total['DemAge_indicator'].value_counts(dropna=False)

0    7279
1    2407
Name: DemAge_indicator, dtype: int64

In [7]:
dfm_total['DemMedIncome_indicator'].value_counts(dropna=False)

0    7329
1    2357
Name: DemMedIncome_indicator, dtype: int64

# Manual checking for variables which have null values

In [8]:
for col in df.columns: 
    print(col)

TargetB
GiftCnt36
GiftCntAll
GiftCntCard36
GiftCntCardAll
GiftAvgLast
GiftAvg36
GiftAvgAll
GiftAvgCard36
GiftTimeLast
GiftTimeFirst
PromCnt12
PromCnt36
PromCntAll
PromCntCard12
PromCntCard36
PromCntCardAll
StatusCat96NK
StatusCatStarAll
DemCluster
DemAge
DemGender
DemHomeOwner
DemMedHomeValue
DemPctVeterans
DemMedIncome


In [9]:
df['GiftAvgCard36'].describe()

count    7906.000000
mean       14.224431
std        10.022710
min         1.330000
25%         8.670000
50%        12.500000
75%        18.000000
max       260.000000
Name: GiftAvgCard36, dtype: float64

In [10]:
df['DemAge'].describe()

count    7279.000000
mean       59.150845
std        16.516400
min        -0.000000
25%        47.000000
50%        60.000000
75%        73.000000
max        87.000000
Name: DemAge, dtype: float64

In [11]:
df['DemMedIncome'].describe()

count      7329.000000
mean      53513.457361
std       19805.168339
min        2499.000000
25%       40389.000000
50%       48699.000000
75%       62385.000000
max      200001.000000
Name: DemMedIncome, dtype: float64

In [12]:
df1= df
df1['GiftAvgCard36']= df1['GiftAvgCard36'].fillna(14.22)
df1['DemAge']= df1['DemAge'].fillna(59.15)
df1['DemMedIncome']= df1['DemMedIncome'].fillna(53513.46)

In [13]:
# Concatenating df and dfm_total
dfu = pd.concat([df1, dfm_total], axis = 1)
dfu.head()

Unnamed: 0,TargetB,GiftCnt36,GiftCntAll,GiftCntCard36,GiftCntCardAll,GiftAvgLast,GiftAvg36,GiftAvgAll,GiftAvgCard36,GiftTimeLast,GiftTimeFirst,PromCnt12,PromCnt36,PromCntAll,PromCntCard12,PromCntCard36,PromCntCardAll,StatusCat96NK,StatusCatStarAll,DemCluster,DemAge,DemGender,DemHomeOwner,DemMedHomeValue,DemPctVeterans,DemMedIncome,GiftAvgCard36_indicator,DemAge_indicator,DemMedIncome_indicator
0,0.0,2.0,4.0,1.0,3.0,17.0,13.5,9.25,17.0,21.0,66.0,8.0,17.0,26.0,3.0,8.0,13.0,1,0.0,0,59.15,1,2,0.0,0.0,53513.46,0,1,1
1,0.0,1.0,8.0,0.0,3.0,20.0,20.0,15.88,14.22,26.0,92.0,14.0,35.0,79.0,5.0,5.0,24.0,1,0.0,2,67.0,1,2,186800.0,85.0,53513.46,1,0,1
2,1.0,6.0,41.0,3.0,20.0,6.0,5.17,3.73,5.0,18.0,111.0,12.0,23.0,51.0,5.0,11.0,22.0,2,1.0,0,59.15,2,2,87600.0,36.0,38750.0,0,1,0
3,1.0,3.0,12.0,3.0,8.0,10.0,8.67,8.5,8.67,9.0,93.0,14.0,22.0,44.0,2.0,6.0,16.0,5,1.0,0,59.15,2,2,139200.0,27.0,38942.0,0,1,0
4,0.0,1.0,1.0,1.0,1.0,20.0,20.0,20.0,20.0,21.0,21.0,10.0,15.0,13.0,4.0,7.0,6.0,3,0.0,3,53.0,2,2,168100.0,37.0,71509.0,0,0,0


In [14]:
df.shape

(9686, 26)

In [15]:
dfu.shape # The number of columns was increased from 26 to 29 with three imputation indicator variables.

(9686, 29)

# Until now we made dataframe dfu which have variables  imputated and imputation indicator variables.
If you use dfu instead of df in the following procesure, then manual imputation data is input
in the data partition and (logistic) regresson.
### However, I recommend strongly to use the following SimpleImputer function which is consice and correct in every sense.

# Data Partition and (Automatic) Imputation

In [16]:
### We reloaded the origial dataframe which has NaN values.
df = pd.read_csv('pva97nk_ch3_result1.csv') #This data has NaN values. It is needed for imputation.

# 5:5 data partition
data = df.drop(['TargetB'], axis=1)
target = df['TargetB']

import sklearn
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    data, target, test_size=0.5, random_state=42) 

# Imputation with creating imputation indicator variables

from sklearn.impute import SimpleImputer
imp= SimpleImputer(strategy = 'mean', add_indicator=True)  
X_train2= imp.fit_transform(X_train) 
X_test2= imp.fit_transform(X_test) # Please don't forget this line. Both X_train and X_test should be fit_transfomred.
X_train2 

array([[ 2.,  2.,  1., ...,  0.,  0.,  0.],
       [ 3.,  3.,  1., ...,  0.,  0.,  0.],
       [ 2.,  2.,  2., ...,  0.,  0.,  0.],
       ...,
       [ 6., 16.,  2., ...,  0.,  0.,  0.],
       [ 2.,  2.,  0., ...,  1.,  0.,  0.],
       [ 2.,  3.,  2., ...,  0.,  0.,  0.]])

In [17]:
X_test2

array([[ 4., 16.,  1., ...,  0.,  0.,  1.],
       [ 6., 28.,  4., ...,  0.,  1.,  1.],
       [ 1.,  1.,  1., ...,  0.,  1.,  1.],
       ...,
       [ 3.,  4.,  1., ...,  0.,  1.,  1.],
       [ 2.,  6.,  0., ...,  1.,  0.,  0.],
       [ 5.,  8.,  3., ...,  0.,  1.,  0.]])

In [18]:
### This part is optional. But it is needed in the next stage.
# As you can see in the above, it is not that easy to check the imputation values by screen display.
# Thus, I made the X_train array into X_traindf dataframe.
# Then, it is easy to check values of each column.

X_traindf= pd.DataFrame(X_train2, columns=['GiftCnt36','GiftCntAll','GiftCntCard36','GiftCntCardAll',\
                                           'GiftAvgLast','GiftAvg36','GiftAvgAll','GiftAvgCard36',\
                                           'GiftTimeLast','GiftTimeFirst','PromCnt12','PromCnt36',\
                                           'PromCntAll','PromCntCard12','PromCntCard36','PromCntCardAll',\
                                           'StatusCat96NK','StatusCatStarAll','DemCluster','DemAge','DemGender',\
                                           'DemHomeOwner','DemMedHomeValue','DemPctVeterans','DemMedIncome',\
                                           'GiftAvgCard36_indicator','DemAge_indicator','DemMedIncome_indicator'])
X_traindf.head()

Unnamed: 0,GiftCnt36,GiftCntAll,GiftCntCard36,GiftCntCardAll,GiftAvgLast,GiftAvg36,GiftAvgAll,GiftAvgCard36,GiftTimeLast,GiftTimeFirst,PromCnt12,PromCnt36,PromCntAll,PromCntCard12,PromCntCard36,PromCntCardAll,StatusCat96NK,StatusCatStarAll,DemCluster,DemAge,DemGender,DemHomeOwner,DemMedHomeValue,DemPctVeterans,DemMedIncome,GiftAvgCard36_indicator,DemAge_indicator,DemMedIncome_indicator
0,2.0,2.0,1.0,1.0,15.0,15.0,15.0,15.0,15.0,29.0,6.0,14.0,13.0,3.0,8.0,7.0,1.0,0.0,0.0,66.0,2.0,1.0,93100.0,46.0,54225.0,0.0,0.0,0.0
1,3.0,3.0,1.0,1.0,35.0,43.33,43.33,20.0,16.0,31.0,16.0,31.0,28.0,6.0,13.0,11.0,1.0,0.0,2.0,79.0,1.0,2.0,92900.0,33.0,50707.0,0.0,0.0,0.0
2,2.0,2.0,2.0,2.0,15.0,15.0,15.0,15.0,20.0,28.0,10.0,23.0,22.0,4.0,10.0,9.0,1.0,0.0,2.0,64.0,1.0,1.0,67700.0,40.0,53851.0,0.0,0.0,0.0
3,5.0,32.0,1.0,15.0,5.0,5.4,3.06,5.0,15.0,114.0,11.0,31.0,78.0,5.0,14.0,32.0,2.0,1.0,4.0,74.0,1.0,2.0,52300.0,32.0,45069.0,0.0,0.0,0.0
4,8.0,15.0,7.0,10.0,7.0,6.5,5.2,6.71,16.0,105.0,14.0,32.0,51.0,6.0,15.0,21.0,2.0,1.0,1.0,50.0,1.0,1.0,102600.0,31.0,74255.0,0.0,0.0,0.0


In [19]:
# This is optional. But it is needed in the next stage.
X_testdf= pd.DataFrame(X_test2, columns=['GiftCnt36','GiftCntAll','GiftCntCard36','GiftCntCardAll',\
                                           'GiftAvgLast','GiftAvg36','GiftAvgAll','GiftAvgCard36',\
                                           'GiftTimeLast','GiftTimeFirst','PromCnt12','PromCnt36',\
                                           'PromCntAll','PromCntCard12','PromCntCard36','PromCntCardAll',\
                                           'StatusCat96NK','StatusCatStarAll','DemCluster','DemAge','DemGender',\
                                           'DemHomeOwner','DemMedHomeValue','DemPctVeterans','DemMedIncome',\
                                           'GiftAvgCard36_indicator','DemAge_indicator','DemMedIncome_indicator'])
X_testdf.head()

Unnamed: 0,GiftCnt36,GiftCntAll,GiftCntCard36,GiftCntCardAll,GiftAvgLast,GiftAvg36,GiftAvgAll,GiftAvgCard36,GiftTimeLast,GiftTimeFirst,PromCnt12,PromCnt36,PromCntAll,PromCntCard12,PromCntCard36,PromCntCardAll,StatusCat96NK,StatusCatStarAll,DemCluster,DemAge,DemGender,DemHomeOwner,DemMedHomeValue,DemPctVeterans,DemMedIncome,GiftAvgCard36_indicator,DemAge_indicator,DemMedIncome_indicator
0,4.0,16.0,1.0,8.0,15.0,13.5,9.88,12.0,19.0,62.0,13.0,35.0,55.0,6.0,17.0,22.0,2.0,1.0,2.0,57.0,1.0,1.0,323300.0,0.0,53151.335148,0.0,0.0,1.0
1,6.0,28.0,4.0,14.0,35.0,34.33,20.75,35.0,24.0,92.0,12.0,34.0,73.0,6.0,17.0,28.0,2.0,1.0,3.0,58.925055,1.0,2.0,34500.0,27.0,53151.335148,0.0,1.0,1.0
2,1.0,1.0,1.0,1.0,25.0,25.0,25.0,25.0,24.0,24.0,10.0,20.0,19.0,4.0,8.0,7.0,4.0,0.0,1.0,58.925055,1.0,2.0,500000.0,31.0,53151.335148,0.0,1.0,1.0
3,2.0,4.0,2.0,4.0,20.0,12.5,17.5,12.5,24.0,89.0,9.0,27.0,52.0,4.0,11.0,19.0,1.0,1.0,1.0,55.0,2.0,1.0,155000.0,32.0,58976.0,0.0,0.0,0.0
4,2.0,10.0,2.0,6.0,14.0,14.5,11.7,14.5,16.0,65.0,13.0,34.0,54.0,5.0,15.0,22.0,1.0,1.0,1.0,54.0,2.0,1.0,143800.0,37.0,77155.0,0.0,0.0,0.0


In [20]:
X_traindf['GiftAvgCard36'].value_counts(dropna=False)
### We can see that 885 NaN values were changed into the mean value of train data set (14.280359)

14.280359     885
15.000000     444
10.000000     443
20.000000     377
25.000000     219
5.000000      188
11.000000     115
12.000000     103
8.000000       80
7.500000       79
12.500000      78
6.000000       76
16.000000      74
9.000000       65
13.000000      63
7.000000       58
14.000000      53
30.000000      43
17.000000      35
17.500000      32
10.500000      31
18.000000      29
6.670000       28
21.000000      27
10.670000      27
22.500000      27
23.000000      26
4.000000       25
6.500000       25
13.500000      25
8.500000       25
50.000000      24
13.330000      22
5.500000       21
8.330000       21
11.500000      19
19.000000      18
5.330000       17
11.330000      17
8.670000       16
3.000000       16
5.670000       16
11.670000      16
14.500000      15
7.670000       15
7.330000       15
16.500000      14
6.330000       14
4.500000       14
10.330000      13
9.500000       13
27.500000      12
4.330000       12
9.330000       11
35.000000      11
26.000000 

In [21]:
df['GiftAvgCard36'].value_counts(dropna=False)
### Please note that 1780 NaNs are rouply into train set and test set by 5:5.
### Thus, 885 NaNs in the above result matches.

NaN       1780
15.00      862
10.00      854
20.00      736
25.00      424
5.00       398
11.00      238
12.00      198
8.00       162
12.50      156
16.00      153
13.00      145
7.50       139
6.00       135
7.00       127
9.00       123
14.00      115
17.50       80
17.00       80
30.00       76
18.00       76
6.67        66
8.50        62
10.50       59
22.50       57
10.67       55
5.50        50
50.00       49
4.00        49
6.50        46
9.50        45
23.00       44
13.50       44
8.33        43
11.67       42
21.00       41
5.67        41
3.00        40
7.33        39
13.33       37
19.00       35
8.67        34
11.50       34
4.50        34
14.50       31
11.33       31
7.67        28
6.33        27
15.50       27
9.67        25
5.33        24
9.33        24
10.33       24
16.67       21
35.00       21
26.00       20
13.67       20
4.33        20
16.50       19
4.67        18
8.75        18
12.67       18
5.40        17
3.67        17
6.25        17
3.50        17
15.67     

# (Logistic) Regression (P 4-23)

In [22]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train2, y_train ) 
print ("Logreg Training set score:{:.5f}".format(logreg.score(X_train2, y_train))) 
print ("Logreg Test set score:{:.5f}".format(logreg.score(X_test2, y_test))) 

#note that X_train2 is usued here. not X_train.
#note that X_test2 is usued here. not X_test.

###score --> the coefficient determinant of R^2

Logreg Training set score:0.52075
Logreg Test set score:0.53046


In [23]:
print ("model.coef_:", logreg.coef_ ) 
print ("model.intercept_:", logreg.intercept_) 

model.coef_: [[ 1.22566360e-09  3.95480880e-09  8.63717150e-10  2.13583126e-09
  -4.62622927e-09 -4.66539568e-09 -3.53642309e-09 -4.02360144e-09
  -2.64788804e-09  9.36283059e-09  1.82169847e-10  8.25646596e-10
   5.41688067e-09 -9.26992867e-12  1.19945851e-09  2.07422577e-09
  -2.49473172e-11  2.28970581e-10 -2.28644570e-10 -5.51280623e-10
  -6.20422553e-11 -1.03573136e-10  1.18044360e-06 -7.89272026e-10
  -1.64411396e-06 -1.23051592e-10 -5.29914804e-11 -1.79169476e-11]]
model.intercept_: [-4.89411372e-11]


# Selecting input model => Stepwise (p4-31)
Python Scikit-learn does not provide stepwise selection. It provides similar algorighm like backward selection.
If you want to conduct Regression with stepwise selection, please search the proper Python package.
# Here we just followed the variable selection result from the textbook (p 4-34).

In [24]:
### Making stepwise result into train set.
X_traindf_sw= X_traindf[['GiftCnt36','GiftTimeLast','DemMedHomeValue','GiftAvgAll','StatusCat96NK','DemPctVeterans',\
                  'GiftAvgCard36_indicator', 'DemAge_indicator']]
#X_traindf_sw.head()
X_train_sw= X_traindf_sw.values 
X_train_sw

### Making stepwise result into test set.
X_testdf_sw= X_testdf[['GiftCnt36','GiftTimeLast','DemMedHomeValue','GiftAvgAll','StatusCat96NK','DemPctVeterans',\
                  'GiftAvgCard36_indicator', 'DemAge_indicator']]
#X_testdf_sw.head()
X_test_sw= X_testdf_sw.values 
X_test_sw

array([[4.000e+00, 1.900e+01, 3.233e+05, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [6.000e+00, 2.400e+01, 3.450e+04, ..., 2.700e+01, 0.000e+00,
        1.000e+00],
       [1.000e+00, 2.400e+01, 5.000e+05, ..., 3.100e+01, 0.000e+00,
        1.000e+00],
       ...,
       [3.000e+00, 1.800e+01, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [2.000e+00, 1.800e+01, 6.090e+04, ..., 4.200e+01, 1.000e+00,
        0.000e+00],
       [5.000e+00, 1.700e+01, 5.460e+04, ..., 2.700e+01, 0.000e+00,
        1.000e+00]])

In [25]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train_sw, y_train ) 
print ("Logreg Training set score:{:.5f}".format(logreg.score(X_train_sw, y_train))) 
print ("Logreg Test set score:{:.5f}".format(logreg.score(X_test_sw, y_test))) 

#note that X_train_sw is usued here. 
#note that X_test_sw is usued here. 

###score --> the coefficient determinant of R^2

Logreg Training set score:0.51043
Logreg Test set score:0.49350


# Transforming Inputs (P 4-53)
#Variables need to be log-transformed according to (P 4-59):
GiftAvg36, GiftAvgAll, GiftAvgCard36, GiftAvgLast, GiftCnt36, GiftCntAll, GiftCntCard36, GiftCntCardAll

In [26]:
df = pd.read_csv('pva97nk_ch3_result1.csv') #This data has NaN values.

for col in df.columns: 
    print(col)

TargetB
GiftCnt36
GiftCntAll
GiftCntCard36
GiftCntCardAll
GiftAvgLast
GiftAvg36
GiftAvgAll
GiftAvgCard36
GiftTimeLast
GiftTimeFirst
PromCnt12
PromCnt36
PromCntAll
PromCntCard12
PromCntCard36
PromCntCardAll
StatusCat96NK
StatusCatStarAll
DemCluster
DemAge
DemGender
DemHomeOwner
DemMedHomeValue
DemPctVeterans
DemMedIncome


In [27]:
df1= df

df1= df1.transform(lambda x: x + 1) # added by 1 accordign to (P 4-60)
df1= df1.apply(np.log)
df1.head()

Unnamed: 0,TargetB,GiftCnt36,GiftCntAll,GiftCntCard36,GiftCntCardAll,GiftAvgLast,GiftAvg36,GiftAvgAll,GiftAvgCard36,GiftTimeLast,GiftTimeFirst,PromCnt12,PromCnt36,PromCntAll,PromCntCard12,PromCntCard36,PromCntCardAll,StatusCat96NK,StatusCatStarAll,DemCluster,DemAge,DemGender,DemHomeOwner,DemMedHomeValue,DemPctVeterans,DemMedIncome
0,0.0,1.098612,1.609438,0.693147,1.386294,2.890372,2.674149,2.327278,2.890372,3.091042,4.204693,2.197225,2.890372,3.295837,1.386294,2.197225,2.639057,0.693147,0.0,0.0,,0.693147,1.098612,0.0,0.0,
1,0.0,0.693147,2.197225,0.0,1.386294,3.044522,3.044522,2.826129,,3.295837,4.532599,2.70805,3.583519,4.382027,1.791759,1.791759,3.218876,0.693147,0.0,1.098612,4.219508,0.693147,1.098612,12.137799,4.454347,
2,0.693147,1.94591,3.73767,1.386294,3.044522,1.94591,1.819699,1.553925,1.791759,2.944439,4.718499,2.564949,3.178054,3.951244,1.791759,2.484907,3.135494,1.098612,0.693147,0.0,,1.098612,1.098612,11.380548,3.610918,10.564912
3,0.693147,1.386294,2.564949,1.386294,2.197225,2.397895,2.269028,2.251292,2.269028,2.302585,4.543295,2.70805,3.135494,3.806662,1.098612,1.94591,2.833213,1.791759,0.693147,0.0,,1.098612,1.098612,11.843674,3.332205,10.569854
4,0.0,0.693147,0.693147,0.693147,0.693147,3.044522,3.044522,3.044522,3.044522,3.091042,3.091042,2.397895,2.772589,2.639057,1.609438,2.079442,1.94591,1.386294,0.0,1.386294,3.988984,1.098612,1.098612,12.03232,3.637586,11.177593


In [29]:
# Making the original dataframe without 8 values
df_except_8_values= df.drop(['GiftAvg36', 'GiftAvgAll', 'GiftAvgCard36', 'GiftAvgLast', 'GiftCnt36',\
                             'GiftCntAll', 'GiftCntCard36', 'GiftCntCardAll'], axis=1)

# Making the log-valued dataframe with 8 values
df1_log_8_values= df1[['GiftAvg36', 'GiftAvgAll', 'GiftAvgCard36', 'GiftAvgLast', 'GiftCnt36',\
                       'GiftCntAll', 'GiftCntCard36', 'GiftCntCardAll']]

# Concatenating these two dataframes.
dfu2= pd.concat([df_except_8_values, df1_log_8_values], axis = 1)
dfu2.head()

Unnamed: 0,TargetB,GiftTimeLast,GiftTimeFirst,PromCnt12,PromCnt36,PromCntAll,PromCntCard12,PromCntCard36,PromCntCardAll,StatusCat96NK,StatusCatStarAll,DemCluster,DemAge,DemGender,DemHomeOwner,DemMedHomeValue,DemPctVeterans,DemMedIncome,GiftAvg36,GiftAvgAll,GiftAvgCard36,GiftAvgLast,GiftCnt36,GiftCntAll,GiftCntCard36,GiftCntCardAll
0,0.0,21.0,66.0,8.0,17.0,26.0,3.0,8.0,13.0,1,0.0,0,,1,2,0.0,0.0,,2.674149,2.327278,2.890372,2.890372,1.098612,1.609438,0.693147,1.386294
1,0.0,26.0,92.0,14.0,35.0,79.0,5.0,5.0,24.0,1,0.0,2,67.0,1,2,186800.0,85.0,,3.044522,2.826129,,3.044522,0.693147,2.197225,0.0,1.386294
2,1.0,18.0,111.0,12.0,23.0,51.0,5.0,11.0,22.0,2,1.0,0,,2,2,87600.0,36.0,38750.0,1.819699,1.553925,1.791759,1.94591,1.94591,3.73767,1.386294,3.044522
3,1.0,9.0,93.0,14.0,22.0,44.0,2.0,6.0,16.0,5,1.0,0,,2,2,139200.0,27.0,38942.0,2.269028,2.251292,2.269028,2.397895,1.386294,2.564949,1.386294,2.197225
4,0.0,21.0,21.0,10.0,15.0,13.0,4.0,7.0,6.0,3,0.0,3,53.0,2,2,168100.0,37.0,71509.0,3.044522,3.044522,3.044522,3.044522,0.693147,0.693147,0.693147,0.693147


# Selected model variables accoding to (P 4-60):
DemMedHomeValue, GiftTimeLast, GiftAvgAll(log form), GiftCnt36(Log form)

In [30]:
### Making stepwise result into train set.
dfu2_sw= dfu2[['TargetB','DemMedHomeValue', 'GiftTimeLast', 'GiftAvgAll', 'GiftCnt36']] 
dfu2_sw.head()
### Please make sure that 'TargetB' is included in this datafrmae

Unnamed: 0,TargetB,DemMedHomeValue,GiftTimeLast,GiftAvgAll,GiftCnt36
0,0.0,0.0,21.0,2.327278,1.098612
1,0.0,186800.0,26.0,2.826129,0.693147
2,1.0,87600.0,18.0,1.553925,1.94591
3,1.0,139200.0,9.0,2.251292,1.386294
4,0.0,168100.0,21.0,3.044522,0.693147


In [31]:
# 5:5 data partition
data = dfu2_sw.drop(['TargetB'], axis=1) # Note that we use dfu2_sw dataframe here.
target = dfu2_sw['TargetB']

import sklearn
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    data, target, test_size=0.5, random_state=42) 

# Imputation with creating imputation variables

from sklearn.impute import SimpleImputer
imp= SimpleImputer(strategy = 'mean', add_indicator=True) 
X_train2= imp.fit_transform(X_train) 
X_test2= imp.fit_transform(X_test) # Please don't forget this line. Both X_train and X_test should be fit_transfomred.
X_train2 

array([[9.31000000e+04, 1.50000000e+01, 2.77258872e+00, 1.09861229e+00],
       [9.29000000e+04, 1.60000000e+01, 3.79166165e+00, 1.38629436e+00],
       [6.77000000e+04, 2.00000000e+01, 2.77258872e+00, 1.09861229e+00],
       ...,
       [1.53700000e+05, 8.00000000e+00, 2.48989419e+00, 1.94591015e+00],
       [6.36000000e+04, 1.80000000e+01, 2.60268969e+00, 1.09861229e+00],
       [1.86400000e+05, 2.60000000e+01, 2.96165829e+00, 1.09861229e+00]])

In [32]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train2, y_train ) 
print ("Logreg Training set score:{:.5f}".format(logreg.score(X_train2, y_train))) 
print ("Logreg Test set score:{:.5f}".format(logreg.score(X_test2, y_test))) 
###score --> the coefficient determinant of R^2

Logreg Training set score:0.51043
Logreg Test set score:0.49350


# Recording Categorical Inputs (P 4-66)
FYI, We aleady changed StatusCat96NK variables A, S, F, N, E, and L into 1, 2, 3, 4, 5, and 6 each
in the "Pva97nk_step2 Decision Tree (Ch3).ipynb file". (i.e., A -> 1, S -> 2, and etc.)
It is because Python decision tree model needs all values of variables to be numeric.

In [33]:
# By the book (P 4-66), Category S should be changed into category A. Similarly F into N, and E into L.
# Which means, in our changed values, 2 should be changed into 1. Similarly 3 into 4, and 5 into 6.

dfu2['StatusCat96NK'].value_counts(dropna=False) 

1    5826
2    2365
3     660
4     574
5     227
6      34
Name: StatusCat96NK, dtype: int64

In [34]:
dfu2['StatusCat96NK']= dfu2['StatusCat96NK'].replace(2,1).replace(4,3).replace(6,5)
dfu2['StatusCat96NK'].value_counts(dropna=False) 

### After this processure, all you have to do is conducting data partition, imputation, 
### and (logistic) regression as before. 
### The following process will be shown in the "Pva97nk_step4 Neural Network (Ch 5).ipynb" file.

1    8191
3    1234
5     261
Name: StatusCat96NK, dtype: int64

# Very important!!!
Please note that, in SAS E Miner, the data partition node comes in the first place.
Meanshile, in Python, data partition comes only before the impute node. 
Which means that, in Python, transform and replacement should be conducted before data partition.