In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
import sklearn.linear_model
import sklearn.metrics as met
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import RFE

from scipy import stats


from sklearn import linear_model

###1. Importing and cleaning data
#### 1.1 Importing Data

In [2]:
# behaviour = pd.read_csv('CustomerBehaviorData.csv')
# consumption = pd.read_csv('CreditConsumptionData.csv')
# customer = pd.read_csv('CustomerDemographics.csv')


behaviour = pd.read_csv('https://github.com/lalit-kumr/credit_card_consumption/blob/main/CustomerBehaviorData.csv?raw=true')
consumption = pd.read_csv('https://github.com/lalit-kumr/credit_card_consumption/raw/main/CreditConsumptionData.csv')
customer = pd.read_csv('https://github.com/lalit-kumr/credit_card_consumption/raw/main/CustomerDemographics.csv')

####1.2 Cleaning Customer df

In [3]:
#finding NaN values in customer df
# customer.isna().sum()
#Droppping observations having NaN values
customer.dropna(inplace=True)
# customer.isna().sum()

####1.3 Cleaning behaviour df

In [4]:
#Droppping observations having NaN values
behaviour.dropna(inplace=True)
# behaviour.isna().sum()

####1.4 Cleaning consumption df

In [5]:
consumption.isna().sum() # no NaN values except the ones we have to predict

customer_id          0
cc_cons_target    5000
dtype: int64

###2. Merging and cleaning the dataframes



####2.1 Merging df 

In [6]:
# - now we will merge all three dataframes on the basis of customer_id.
# - first we will merge customer and behaviour dataframes as customer_be
# - next we will merge the cosumption dataframe with the custoemr_be df and store the resultant dataframe in **final**


customer_behaviour = pd.merge(left=customer, right=behaviour, on='customer_id')
final = pd.merge(left=customer_behaviour, right=consumption, on='customer_id')

####2.2 summing features and removing orignal features

In [7]:
#Total investments
final['investment'] = final['investment_1']+final['investment_2']+final['investment_3']+final['investment_4']
#Total debit amount
final["Total_debit_amount"]=final['debit_amount_apr']+final['debit_amount_may']+final['debit_amount_jun']
#Total Credit Amount
final["Total_credit_amount"]=final['credit_amount_apr']+final['credit_amount_may']+final['credit_amount_jun']
#Total Max Credit amount
final["Total_max_credit_acmout"]=final['max_credit_amount_apr']+final['max_credit_amount_may']+final['max_credit_amount_jun']
#Total Active Loan
final["Totat_active_loan"]= final['personal_loan_active'] + final['vehicle_loan_active']
#Toatal Closed Loan
final["Total_closed_loan"]= final['personal_loan_closed'] + final['vehicle_loan_closed']
#Credit Card expenditure total
final['creditcard_exp']= final['cc_cons_apr'] + final['cc_cons_may'] + final['cc_cons_jun'] 
#Debit Card expenditure total
final['debitcard_exp']= final['dc_cons_apr'] + final['dc_cons_may'] + final['dc_cons_jun']
#credit card use count total
final['cc_count']=final['cc_count_apr']+final['cc_count_may']+final['cc_count_jun']
# debit card use count total
final['dc_count']=final['dc_count_apr']+final['dc_count_may']+final['dc_count_jun']
#debit payments count total
final['debit_count']=final['debit_count_apr']+ final['debit_count_may']+ final['debit_count_jun']
#credit payments count total
final['credit_count']=final['credit_count_apr']+ final['credit_count_may']+ final['credit_count_jun']

extra = ['investment_1','investment_2','investment_3','investment_4',
         'debit_amount_apr','debit_amount_may','debit_amount_jun',
         'credit_amount_apr','credit_amount_may','credit_amount_jun',
         'max_credit_amount_apr','max_credit_amount_may','max_credit_amount_jun',
         'personal_loan_active','vehicle_loan_active',
         'personal_loan_closed','vehicle_loan_closed',
         'cc_cons_apr','cc_cons_may','cc_cons_jun',
         'dc_cons_apr','dc_cons_may','dc_cons_jun',
         'cc_count_apr','cc_count_may','cc_count_jun',
         'dc_count_apr','dc_count_may','dc_count_jun',
         'debit_count_apr','debit_count_may','debit_count_jun',
         'credit_count_apr','credit_count_may','credit_count_jun'
         ]

final.drop(labels=extra,axis=1,inplace=True)
# final.head(3)


####2.3 Encoding categorical variables

In [8]:
#using pd.get_dummies to encode categorical variables in continunous variables and replace the categorical variables.
final = pd.get_dummies(final)


###3. EDA
####3.1 Profile report
After importing the data, the first tasks is to detect and treat outliers and balnk values. For that reason we will first generate a profile report of our dataframe by using pandas profiling package.

In [9]:
# profile_report = ProfileReport(final)
# profile_report.to_widgets()

#### 3.2 Data. describe()

In [10]:
final.describe()

Unnamed: 0,customer_id,age,emp_tenure_years,tenure_with_bank,region_code,net_banking_flag,avg_days_between_transaction,card_lim,emi_active,cc_cons_target,investment,Total_debit_amount,Total_credit_amount,Total_max_credit_acmout,Totat_active_loan,Total_closed_loan,creditcard_exp,debitcard_exp,cc_count,dc_count,debit_count,credit_count,account_type_current,account_type_saving,gender_F,gender_M,income_HIGH,income_LOW,income_MEDIUM,loan_enq_Y
count,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,14995.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0,19988.0
mean,9998.746048,47.676506,15.21713,5.496248,598.972333,0.717881,11.043776,278360.162297,15403.78,55156.748249,872114.1,212036.8,241451.2,165510.6,2.0,2.0,46178.68,22943.995663,116.61672,61.619272,121.744397,43.079398,0.846208,0.153792,0.135631,0.864369,0.080598,0.280568,0.638833,1.0
std,5773.52094,30.367037,20.486393,2.870521,220.11094,0.450042,5.480507,180779.745948,120286.9,97680.466478,2969239.0,287800.9,296102.7,316014.6,0.0,0.0,41340.46,26634.982245,119.988169,38.972461,62.444733,34.111134,0.360759,0.360759,0.342405,0.342405,0.272224,0.449288,0.480351,0.0
min,0.0,25.0,0.5,1.0,123.0,0.0,2.0,0.0,0.0,0.0,-5196.8,8443.59,1177.5,2727.91,2.0,2.0,1305.0,698.39,3.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,4998.75,33.0,5.1,3.0,424.0,0.0,6.0,124000.0,1545.628,6768.0,138929.7,108591.6,121165.9,82257.65,2.0,2.0,21768.87,9635.75,55.0,29.0,76.0,15.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
50%,9998.5,40.0,9.9,6.0,623.0,1.0,11.0,272000.0,3934.66,13363.0,273426.7,161178.7,181415.7,116607.7,2.0,2.0,35155.39,15955.5,90.0,58.0,114.0,36.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
75%,14997.25,53.0,18.0,8.0,799.0,1.0,16.0,401000.0,9376.39,20103.0,584829.1,244002.9,276712.8,175301.5,2.0,2.0,56672.71,26897.5925,134.0,87.0,159.0,58.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
max,19999.0,223.0,203.0,10.0,974.0,1.0,20.0,1000000.0,7447125.0,408382.0,66571570.0,26557130.0,10895300.0,12087490.0,2.0,2.0,1161694.0,932712.45,1249.0,229.0,424.0,230.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


###4. Data Cleaning-2

####4.1 Outlier Removal
To remove outliers we use scipy.stats.mstats.winsorize function.


In [11]:
#outlier removal with limits:
#     lower limit: 5% of range
#     upper limit: 95% of range
from scipy.stats import mstats

for x in final.columns:
    final[x] = mstats.winsorize(final[x], limits=[0.05, 0.05])

####4.2 Splitting the dataframe in df_predict and df_train for the data to predict and data to train on respectively

Splitting the rows into df_predict and df_train. df_predict contains cc_cons_target having NA values. These are the customers that we have to predit consumption for.

df_train contains all rows that have a numerical value in cc_cons_target. this df will train out liner regression algo

In [12]:
#filtering all the rows from final that contain NaN values in cc_cons_target and storing those rows in df_predict.
df_predict = final[final.cc_cons_target.isna()]
#dropping the NaN values of cc_cons_target 
df_predict.drop(labels='cc_cons_target',axis=1,inplace=True)

# droppping  all the rows that are in the df_predict dataframe and storing the result in df_train
df_train = final.drop(df_predict.index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


####4.3 Standardization

###5. VIF calculations and removing multicollinear features

In [None]:
#R^2 value is determined to find out how well an independent variable is described by the other independent variables. 
#A high value of R^2 means that the variable is highly correlated with the other variables.
#So, the closer the R^2 value to 1, the higher the value of VIF and
# the higher the multicollinearity with the particular independent variable.


# vif = 1/(1-r^2)


# VIF starts at 1 and has no upper limit
# VIF = 1, no correlation between the independent variable and the other variables
# VIF exceeding 5 or 10 indicates high multicollinearity between this independent variable and the others




# Import library for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)


X = df_train.iloc[:,:-1]
calc_vif(X)



#Fixing Multicollinearity

# #Dropping one of the correlated features will help in bringing down the multicollinearity between correlated features:
# X = df.drop(['Age','Salary'],axis=1)
# calc_vif(X)

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


Unnamed: 0,variables,VIF
0,customer_id,1.0013
1,age,2.001917
2,emp_tenure_years,2.00324
3,tenure_with_bank,1.001844
4,region_code,1.002175
5,net_banking_flag,1.001491
6,avg_days_between_transaction,1.001309
7,card_lim,1.000973
8,emi_active,1.002171
9,cc_cons_target,1.001473


####5.2 removing columns having high vif values

In [None]:
#dropping 'account_type_saving','gender_F','income_HIGH','emp_tenure_years','income_LOW','customer_id'
df_train.drop(labels=['account_type_saving','gender_F','income_HIGH','emp_tenure_years','income_LOW','customer_id']
              ,axis=1,inplace=True)
df_predict.drop(labels=['account_type_saving','gender_F','income_HIGH','emp_tenure_years','income_LOW','customer_id']
              ,axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
##df_train.columns.difference(['cc_cons_target'])

###6 Splitting x and y teach data into train and test data for model building and model testing

In [13]:
df_train_x = df_train[df_train.columns.difference(['cc_cons_target'])]
df_train_y = df_train[['cc_cons_target']]


x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y, test_size=0.25, random_state=72)

###7. Scaling Data


In [14]:
# from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler


# scaler = StandardScaler()
# X = scaler.fit_transform(x_train)
# x_train = pd.DataFrame(X, columns=x_train.columns)

# x_train

# # sqrt of whole dataset
# np.log(x_train)


# # for x in x_train.columns:
# #   if x_train[x].mean() > 1000:
# #     x_train[x]=np.log(x_train[x])

y_train = np.log(y_train)
y_test = np.log(y_test)

###9. Implementing a linear regression model

####9.1 RFE and selecting features

In [16]:
from sklearn.tree import DecisionTreeRegressor
dt_reg = DecisionTreeRegressor(random_state=0)

# lr = LinearRegression()
rfe = RFE(estimator=dt_reg, n_features_to_select=5)
rfe.fit(x_train, y_train)

rfe.ranking_


# selecting features given by RFE
f = rfe.get_support(1) #the most important features
x_train_select = x_train[x_train.columns[f]] # final features`
x_train_select.head(2)

  y = column_or_1d(y, warn=True)


Unnamed: 0,Total_max_credit_acmout,creditcard_exp,customer_id,emi_active,investment
8517,154328.0,27771.47,17352,1040.5,1509658.14
665,198320.0,26058.0,16740,7997.95,607037.55


In [17]:
#selecting the same features in test data
x_test_select = x_test[x_test.columns[f]]
x_test_select.head(2)

Unnamed: 0,Total_max_credit_acmout,creditcard_exp,customer_id,emi_active,investment
6853,223559.11,31761.18,10825,4271.29,205673.25
1323,91587.0,105645.91,999,6141.61,409703.0


####9.2 OLS regression

In [18]:
### OLS regression
import statsmodels.api as sm
# y_train = y_train.reindex(x_train.index)
mod = sm.OLS(y_train,x_train_select)

res = mod.fit()

print(res.summary())



#best ols results with np.log(y_train) and rfe with DTreg features = 10
#best ols results with np.log(y_train) and rfe with DTreg features = 15

  import pandas.util.testing as tm


                                 OLS Regression Results                                
Dep. Variable:         cc_cons_target   R-squared (uncentered):                   0.884
Model:                            OLS   Adj. R-squared (uncentered):              0.884
Method:                 Least Squares   F-statistic:                          1.721e+04
Date:                Wed, 25 Aug 2021   Prob (F-statistic):                        0.00
Time:                        08:00:43   Log-Likelihood:                         -29514.
No. Observations:               11246   AIC:                                  5.904e+04
Df Residuals:                   11241   BIC:                                  5.907e+04
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------

####9.3 using ols regresssion to predit values

In [20]:
### Predicting the values for test dataset and checking the accuracy of the model
res = mod.predict(x_test_select,y_test)

ValueError: ignored

### Applying liner regresssion on selected features

In [27]:
# from sklearn.linear_model import LassoCV
# reg = LassoCV(cv=5, random_state=0).fit(x_train_select, y_train)
# reg.score(x_train_select, y_train)

# reg.predict(X[:1,])


In [29]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# training the linear regression model on x_train_select and y_train 
lr.fit(x_train_select, y_train.iloc[:,0])

# predicting the values of x_train_select and storing them in y_hat_train
y_hat_train = lr.predict(x_train_select)

## Checking the accuracy of the train data predictions
#This is a value between [0 = no-fit] and [1 = perfect fit ]
r_sq = met.r2_score(y_pred=y_hat_train,y_true= y_train)
print('The accuracy of the model in r^2 :',r_sq)


#mae
mae = met.mean_absolute_error(y_pred=y_hat_train,y_true= y_train)
print('The accuracy of the model in mae :',mae)


# #MAPE
# y_diff = y_hat_train - y_train
# # y_diff  = np.abs(y_diff)/np.max(y_train)#/ np.maximum(np.abs(y_diff))

# maperror = (np.sum(np.abs((y_hat_train - y_train))/y_train))/y_train.count()*100
# print('The accuracy of the model in mape:',maperror,'%')



The accuracy of the model in r^2 : 0.0007538792572816533
The accuracy of the model in mae : 1.094816471251946


In [30]:
lr.coef_

array([-1.93540677e-07,  9.10613645e-07,  3.79532899e-07,  1.29710847e-06,
        3.44354190e-08])

In [31]:
np.exp(y_hat_train)

array([16677.84074831, 16146.71399885, 16176.78588837, ...,
       15777.61595979, 15962.11492846, 15448.80592087])

### Predicting the values for test dataset and checking the accuracy of the model

In [None]:
y_hat_test = lr.predict(x_test_select)

# checking the accuracy of test_data predictions
r_sq = met.r2_score(y_pred=y_hat_test, y_true= y_test)
print('The accuracy of the model in r^2 :',r_sq)

met.mean_squared_error(y_pred=y_hat_test, y_true= y_test)

# mae = met.mean_absolute_error(y_pred=y_hat_test,y_true= y_test)

# # #mape calculations
# y_diff = y_test - y_hat_test
# y_diff  = np.abs(y_diff)/np.abs(y_test)
# print('The accuracy of the model in mape:',np.average(y_diff)*100,'%')

# print('The accuracy of the model in mae :',mae)

The accuracy of the model in r^2 : -147519.1373818071


331906.703219666