In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import sklearn.metrics as met
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import RFE

from scipy import stats

## Importing Data

In [3]:
behaviour = pd.read_csv('CustomerBehaviorData.csv')
consumption = pd.read_csv('CreditConsumptionData.csv')
customer = pd.read_csv('CustomerDemographics.csv')

### Cleaning Customer df

In [4]:
#finding NaN values in customer df
customer.isna().sum()

customer_id                     0
account_type                    1
gender                          1
age                             0
income                          1
emp_tenure_years                0
tenure_with_bank                0
region_code                     1
net_banking_flag                0
avg_days_between_transaction    3
dtype: int64

In [5]:
#Droppping observations having NaN values
customer.dropna(inplace=True)
customer.isna().sum()

customer_id                     0
account_type                    0
gender                          0
age                             0
income                          0
emp_tenure_years                0
tenure_with_bank                0
region_code                     0
net_banking_flag                0
avg_days_between_transaction    0
dtype: int64

### Cleaning behaviour df

In [6]:
behaviour.isna().sum()

customer_id              0
cc_cons_apr              0
dc_cons_apr              0
cc_cons_may              1
dc_cons_may              0
cc_cons_jun              0
dc_cons_jun              1
cc_count_apr             1
cc_count_may             0
cc_count_jun             0
dc_count_apr             0
dc_count_may             0
dc_count_jun             0
card_lim                 0
personal_loan_active     0
vehicle_loan_active      0
personal_loan_closed     1
vehicle_loan_closed      0
investment_1             0
investment_2             0
investment_3             2
investment_4             0
debit_amount_apr         0
credit_amount_apr        0
debit_count_apr          1
credit_count_apr         0
max_credit_amount_apr    0
debit_amount_may         0
credit_amount_may        0
credit_count_may         0
debit_count_may          0
max_credit_amount_may    0
debit_amount_jun         0
credit_amount_jun        0
credit_count_jun         0
debit_count_jun          0
max_credit_amount_jun    0
l

In [7]:
#Droppping observations having NaN values
behaviour.dropna(inplace=True)
behaviour.isna().sum()

customer_id              0
cc_cons_apr              0
dc_cons_apr              0
cc_cons_may              0
dc_cons_may              0
cc_cons_jun              0
dc_cons_jun              0
cc_count_apr             0
cc_count_may             0
cc_count_jun             0
dc_count_apr             0
dc_count_may             0
dc_count_jun             0
card_lim                 0
personal_loan_active     0
vehicle_loan_active      0
personal_loan_closed     0
vehicle_loan_closed      0
investment_1             0
investment_2             0
investment_3             0
investment_4             0
debit_amount_apr         0
credit_amount_apr        0
debit_count_apr          0
credit_count_apr         0
max_credit_amount_apr    0
debit_amount_may         0
credit_amount_may        0
credit_count_may         0
debit_count_may          0
max_credit_amount_may    0
debit_amount_jun         0
credit_amount_jun        0
credit_count_jun         0
debit_count_jun          0
max_credit_amount_jun    0
l

### Cleaning consumption df

In [8]:
consumption.isna().sum() # no NaN values except the ones we have to predict

customer_id          0
cc_cons_target    5000
dtype: int64

### Merging the dataframes 
- now we will merge all three dataframes on the basis of customer_id.
- first we will merge customer and behaviour dataframes as customer_be
- next we will merge the cosumption dataframe with the custoemr_be df and store the resultant dataframe in **final**

In [9]:
customer_behaviour = pd.merge(left=customer, right=behaviour, on='customer_id')
final = pd.merge(left=customer_behaviour, right=consumption, on='customer_id')
final

Unnamed: 0,customer_id,account_type,gender,age,income,emp_tenure_years,tenure_with_bank,region_code,net_banking_flag,avg_days_between_transaction,...,debit_count_may,max_credit_amount_may,debit_amount_jun,credit_amount_jun,credit_count_jun,debit_count_jun,max_credit_amount_jun,loan_enq,emi_active,cc_cons_target
0,19427,current,M,63,MEDIUM,30.1,10,628.0,1,5.0,...,14,20770.0,44884.90,369000.75,40,96,46088.0,Y,2646.72,
1,16150,current,M,36,MEDIUM,14.4,10,656.0,0,12.0,...,63,78627.0,91073.84,243182.32,7,12,17953.0,Y,5469.79,
2,11749,current,F,28,MEDIUM,4.8,10,314.0,1,13.0,...,82,1260.0,96552.00,35467.00,16,42,41121.0,Y,7207.85,
3,11635,current,M,32,MEDIUM,9.6,2,614.0,1,19.0,...,78,23332.0,18250.00,87204.35,2,10,32003.0,Y,591.34,
4,8908,current,M,32,HIGH,12.0,7,750.0,1,18.0,...,171,41672.0,83525.75,83992.00,21,12,9626.0,Y,2621.39,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19983,1270,current,F,66,MEDIUM,32.2,6,354.0,1,7.0,...,8,24438.0,153772.21,31284.00,3,38,42872.0,Y,6334.20,
19984,15992,current,M,53,MEDIUM,19.8,5,809.0,0,14.0,...,11,213000.0,18098.47,93108.00,43,11,36747.0,Y,102216.02,
19985,7081,current,F,66,MEDIUM,18.4,1,466.0,0,10.0,...,171,34327.0,32198.63,46800.00,3,38,18810.0,Y,1930.19,
19986,6821,current,M,32,LOW,6.0,8,619.0,0,17.0,...,82,51929.0,27334.14,20201.00,47,80,11443.0,Y,24499.91,


### Profile report
After importing the data, the first tasks is to detect and treat outliers and balnk values. For that reason we will first generate a profile report of our dataframe by using pandas profiling package.\
Some info about the profiling package from their [website](https://pandas-profiling.github.io/pandas-profiling/docs/master/index.html):
pandas_profiling extends the pandas DataFrame with df.profile_report() for quick data analysis.For each column the following statistics - if relevant for the column type - are presented in an interactive HTML report:
- Type inference: detect the types of columns in a dataframe.
- Essentials: type, unique values, missing values
- Quantile statistics like minimum value, Q1, median, Q3, maximum, range, interquartile range
- Descriptive statistics like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness
- Most frequent values
- Histogram
- Correlations highlighting of highly correlated variables, Spearman, Pearson and Kendall matrices
- Missing values matrix, count, heatmap and dendrogram of missing values
- Text analysis learn about categories (Uppercase, Space), scripts (Latin, Cyrillic) and blocks (ASCII) of text data.
- File and Image analysis extract file sizes, creation dates and dimensions and scan for truncated images or those containing EXIF information.



#### Generating profile report


In [35]:
profile_report = ProfileReport(final)
profile_report.to_file("your_report.html")

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=62.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




#### Tasks after analyzing pandas report



- Treat outliers


- Missing values treatment


- Remove constant columns as those are of no use to the learning algo.
    > - ***personal_loan_active***  has constant value "1"
    > - ***vehicle_loan_active***  has constant value "1"
    > - ***personal_loan_closed***  has constant value "1.0"
    > - ***vehicle_loan_closed***  has constant value "1"
    > - ***loan_enq***  has constant value "True"
    

- Remove columns having low variance



- transform columns having skewed observations
   > - ***dc_cons_may*** is highly skewed (γ1 = 23.64870391)
   > - ***credit_amount_apr*** is highly skewed (γ1 = 32.14621581)
   > - ***max_credit_amount_apr*** is highly skewed (γ1 = 47.23251791)
   > - ***credit_amount_may*** is highly skewed (γ1 = 22.1395434)
   > - ***max_credit_amount_may*** is highly skewed (γ1 = 35.18401822)
   > - ***debit_amount_jun*** is highly skewed (γ1 = 67.58004653)
   > - ***max_credit_amount_jun*** is highly skewed (γ1 = 21.54480362)
   


In [39]:
final.drop(labels='customer_id',axis=1,inplace=True)

In [47]:
final

Unnamed: 0,account_type,gender,age,income,emp_tenure_years,tenure_with_bank,region_code,net_banking_flag,avg_days_between_transaction,cc_cons_apr,...,debit_count_may,max_credit_amount_may,debit_amount_jun,credit_amount_jun,credit_count_jun,debit_count_jun,max_credit_amount_jun,loan_enq,emi_active,cc_cons_target
0,current,M,63,MEDIUM,30.1,10,628.0,1,5.0,7998.48,...,14,20770.0,44884.90,369000.75,40,96,46088.0,Y,2646.72,
1,current,M,36,MEDIUM,14.4,10,656.0,0,12.0,16479.64,...,63,78627.0,91073.84,243182.32,7,12,17953.0,Y,5469.79,
2,current,F,28,MEDIUM,4.8,10,314.0,1,13.0,29272.03,...,82,1260.0,96552.00,35467.00,16,42,41121.0,Y,7207.85,
3,current,M,32,MEDIUM,9.6,2,614.0,1,19.0,9662.31,...,78,23332.0,18250.00,87204.35,2,10,32003.0,Y,591.34,
4,current,M,32,HIGH,12.0,7,750.0,1,18.0,13239.86,...,171,41672.0,83525.75,83992.00,21,12,9626.0,Y,2621.39,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19983,current,F,66,MEDIUM,32.2,6,354.0,1,7.0,4611.00,...,8,24438.0,153772.21,31284.00,3,38,42872.0,Y,6334.20,
19984,current,M,53,MEDIUM,19.8,5,809.0,0,14.0,9545.00,...,11,213000.0,18098.47,93108.00,43,11,36747.0,Y,102216.02,
19985,current,F,66,MEDIUM,18.4,1,466.0,0,10.0,2028.93,...,171,34327.0,32198.63,46800.00,3,38,18810.0,Y,1930.19,
19986,current,M,32,LOW,6.0,8,619.0,0,17.0,11456.23,...,82,51929.0,27334.14,20201.00,47,80,11443.0,Y,24499.91,


### Splitting the dataframe in df_predict and df_train for the data to predict and data to train on respectively

Splitting the rows into df_predict and df_train. df_predict contains cc_cons_target having NA values. These are the customers that we have to predit consumption for.

df_train contains all rows that have a numerical value in cc_cons_target. this df will train out liner regression algo

In [48]:
#filtering all the rows from final that contain NaN values in cc_cons_target and storing those rows in df_predict.
df_predict = final[final.cc_cons_target.isna()]
#dropping the NaN values of cc_cons_target 
df_predict.drop(labels='cc_cons_target',axis=1,inplace=True)

# droppping  all the rows that are in the df_predict dataframe and storing the result in df_train
df_train = final.drop(df_predict.index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [49]:
df_train.shape

(14995, 48)

### Encoding categorical variables of final_train

In [50]:
#using pd.get_dummies to encode categorical variables in continunous variables and replace the categorical variables.
df_train = pd.get_dummies(df_train)
df_train.loc[:,['account_type_current','account_type_saving','gender_F','gender_M','income_HIGH','income_LOW','income_MEDIUM','loan_enq_Y']]

Unnamed: 0,account_type_current,account_type_saving,gender_F,gender_M,income_HIGH,income_LOW,income_MEDIUM,loan_enq_Y
104,1,0,0,1,0,0,1,1
105,1,0,0,1,1,0,0,1
106,1,0,0,1,0,0,1,1
107,1,0,1,0,0,0,1,1
108,1,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...
15094,1,0,1,0,0,1,0,1
15095,1,0,0,1,0,0,1,1
15096,1,0,0,1,0,0,1,1
15097,1,0,0,1,0,0,1,1


### Outlier Removal

To remove outliers we use scipy.stats.mstats.winsorize function.

Winsorizing or winsorization is the transformation of statistics by limiting extreme values in the statistical data to reduce the effect of possibly spurious outliers. It is named after the engineer-turned-biostatistician Charles P. Winsor (1895–1951)


In [51]:
#outlier removal with limits:
#     lower limit: 5% of range
#     upper limit: 95% of range
from scipy.stats import mstats

for x in df_train.columns:
    df_train[x] = mstats.winsorize(df_train[x], limits=[0.05, 0.05])

## applying the same aboe steps of missing value removal and outlier imputation to df_predict

In [52]:
df_predict = pd.get_dummies(df_predict)
df_predict.loc[:,['account_type_current','account_type_saving','gender_F','gender_M','income_HIGH','income_LOW','income_MEDIUM','loan_enq_Y']]

Unnamed: 0,account_type_current,account_type_saving,gender_F,gender_M,income_HIGH,income_LOW,income_MEDIUM,loan_enq_Y
0,1,0,0,1,0,0,1,1
1,1,0,0,1,0,0,1,1
2,1,0,1,0,0,0,1,1
3,1,0,0,1,0,0,1,1
4,1,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...
19983,1,0,1,0,0,0,1,1
19984,1,0,0,1,0,0,1,1
19985,1,0,1,0,0,0,1,1
19986,1,0,0,1,0,1,0,1


In [53]:
for x in df_predict.columns:
    df_predict[x] = mstats.winsorize(df_predict[x], limits=[0.05, 0.05])

### Univariate analysis

In [54]:
df_train

Unnamed: 0,age,emp_tenure_years,tenure_with_bank,region_code,net_banking_flag,avg_days_between_transaction,cc_cons_apr,dc_cons_apr,cc_cons_may,dc_cons_may,...,emi_active,cc_cons_target,account_type_current,account_type_saving,gender_F,gender_M,income_HIGH,income_LOW,income_MEDIUM,loan_enq_Y
104,35,15.0,1,708.0,0,17.0,24893.00,378.00,10288.00,25509.00,...,1674.09,20014.0,1,0,0,1,0,0,1,1
105,35,15.0,6,249.0,0,14.0,18941.62,966.00,20672.00,410.40,...,13043.34,10173.0,1,0,0,1,1,0,0,1
106,55,24.5,1,802.0,1,3.0,5678.87,2724.00,1964.50,3933.11,...,25375.27,16095.0,1,0,0,1,0,0,1,1
107,29,4.5,6,867.0,1,4.0,30489.50,1236.00,12609.88,9138.14,...,3544.33,7707.0,1,0,1,0,0,0,1,1
108,28,4.9,3,937.0,1,11.0,2328.22,1597.54,19979.75,1045.85,...,5026.50,130263.0,1,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15094,53,13.2,9,653.0,1,20.0,10443.76,1218.85,7252.24,3873.00,...,36788.75,4263.0,1,0,1,0,0,1,0,1
15095,44,2.4,9,535.0,1,2.0,29890.33,6308.00,16025.50,7610.00,...,4762.10,11019.0,1,0,0,1,0,0,1,1
15096,37,3.4,9,464.0,1,8.0,4148.98,948.74,19360.26,494.00,...,2877.33,310023.0,1,0,0,1,0,0,1,1
15097,66,40.0,10,536.0,1,3.0,60315.20,7191.05,44341.13,25509.00,...,13006.84,28813.0,1,0,0,1,0,0,1,1


## Implementing a linear regression model without outlier removal, with skewed data and constant columns

In [56]:
train_cols_x = ['age', 'emp_tenure_years', 'tenure_with_bank','region_code', 
              'net_banking_flag', 'avg_days_between_transaction','cc_cons_apr', 'dc_cons_apr', 
              'cc_cons_may', 'dc_cons_may','cc_cons_jun', 'dc_cons_jun', 'cc_count_apr', 'cc_count_may',
              'cc_count_jun', 'dc_count_apr', 'dc_count_may', 'dc_count_jun','card_lim', 'personal_loan_active', 
              'vehicle_loan_active','personal_loan_closed', 'vehicle_loan_closed', 'investment_1','investment_2', 
              'investment_3', 'investment_4', 'debit_amount_apr','credit_amount_apr', 'debit_count_apr', 
              'credit_count_apr','max_credit_amount_apr', 'debit_amount_may', 'credit_amount_may','credit_count_may', 
              'debit_count_may', 'max_credit_amount_may','debit_amount_jun', 'credit_amount_jun', 'credit_count_jun',
              'debit_count_jun', 'max_credit_amount_jun', 'emi_active','account_type_current', 'account_type_saving',
              'gender_F', 'gender_M', 'income_HIGH', 'income_LOW', 'income_MEDIUM','loan_enq_Y']



train_cols_y = ['cc_cons_target']

### Splitting x and y teach data into train and test data for model building and model testing

In [57]:
x_train, x_test, y_train, y_test = train_test_split(df_train[train_cols_x], df_train[train_cols_y], test_size=0.25, random_state=72)

### RFE and selecting features

In [75]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)

lr = LinearRegression()
rfe = RFE(estimator=regressor, n_features_to_select=25)
rfe.fit(x_train, y_train)


# selecting features given by RFE
f = rfe.get_support(1) #the most important features
x_train_select = x_train[x_train.columns[f]] # final features`
x_train_select.head(2)

Unnamed: 0,region_code,cc_cons_apr,dc_cons_apr,cc_cons_may,dc_cons_may,cc_cons_jun,dc_cons_jun,cc_count_jun,dc_count_jun,card_lim,...,credit_amount_apr,debit_count_apr,max_credit_amount_apr,debit_amount_may,credit_amount_may,max_credit_amount_may,debit_amount_jun,credit_amount_jun,max_credit_amount_jun,emi_active
8517,799.0,1796.44,11955.0,6132.03,686.0,19843.0,4207.0,50,54,100000,...,20025.0,3.0,74499.0,19458.93,71831.0,16187.0,7195.0,84169.0,63642.0,1040.5
665,685.0,1239.4,376.0,12828.0,4336.0,12295.0,3074.0,44,1,533000,...,72412.5,42.0,146872.0,73295.0,42748.0,28595.0,21434.93,84127.0,15630.0,7997.95


In [76]:
#selecting the same features in test data
x_test_select = x_test[x_test.columns[f]]
x_test_select.head(2)

Unnamed: 0,region_code,cc_cons_apr,dc_cons_apr,cc_cons_may,dc_cons_may,cc_cons_jun,dc_cons_jun,cc_count_jun,dc_count_jun,card_lim,...,credit_amount_apr,debit_count_apr,max_credit_amount_apr,debit_amount_may,credit_amount_may,max_credit_amount_may,debit_amount_jun,credit_amount_jun,max_credit_amount_jun,emi_active
6853,867.0,13510.56,13230.0,15093.42,4051.0,3157.2,5333.6,4,80,327000,...,54279.0,69.0,94812.0,49713.75,36644.25,80735.0,216728.01,112465.0,48012.11,4271.29
1323,249.0,60315.2,2376.54,18101.06,25509.0,4654.33,1355.0,27,79,510000,...,45647.0,10.0,46446.0,36316.98,184001.77,33082.0,25539.25,86020.0,12059.0,6141.61


### Applying liner regresssion on selected features

In [77]:
# training the linear regression model on x_train_select and y_train 
lr.fit(x_train_select, y_train)

# predicting the values of x_train_select and storing them in y_hat_train
y_hat_train = lr.predict(x_train_select)

### Checking the accuracy of the train data predictions

In [78]:
#This is a value between [0 = no-fit] and [1 = perfect fit ]
r_sq = met.r2_score(y_pred=y_hat_train,y_true= y_train)
print('The accuracy of the model in r^2 :',r_sq)


# #mae
# mae = met.mean_absolute_error(y_pred=y_hat_train,y_true= y_train)
# print('The accuracy of the model in mae :',mae)


# #MAPE
# y_diff = y_hat_train - y_train
# y_diff  = np.abs(y_diff)/np.max(y_train)#/ np.maximum(np.abs(y_diff))
# print('The accuracy of the model in mape:',np.average(y_diff)*100,'%')



The accuracy of the model in r^2 : 0.003234374716892252


### Predicting the values for test dataset and checking the accuracy of the model

In [33]:
y_hat_test = lr.predict(x_test_select)

# checking the accuracy of test_data predictions
r_sq = met.r2_score(y_pred=y_hat_test, y_true= y_test)
print('The accuracy of the model in r^2 :',r_sq)



# mae = met.mean_absolute_error(y_pred=y_hat_test,y_true= y_test)

# # #mape calculations
# y_diff = y_test - y_hat_test
# y_diff  = np.abs(y_diff)/np.abs(y_test)
# print('The accuracy of the model in mape:',np.average(y_diff)*100,'%')

# print('The accuracy of the model in mae :',mae)

The accuracy of the model in r^2 : -0.0075235384800151905
