In [1]:
# importing the necessary libraries.
import pandas as pd
import numpy as np

In [2]:
# reading the raw dataset
df = pd.read_csv("Dataset/CSV_Ms_2.csv")

In [3]:
# initializing an temp blank dataframe 
dftemp = pd.DataFrame()

In [4]:
# printing the head of the loan recommend dataset which is in the form of dataframe.
df.head()

Unnamed: 0,Contact__c,Min_IT_Loan_ID__c,Opp_Number__c,Id,AccountID,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Purpose_of_Loan__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Payment_Frequency__c,StageName,Applicant Age
0,0032x00000d4m0bAAA,917086.0,5147774.0,0062x00000Dr7o6AAB,0012x00000IMHksAAH,8.0,6.0,Vehicle Expenses,10.0,2050.0,19.143,Fortnightly,Loan Paid,40.0
1,0032x00000d2Py7AAE,908169.0,5061612.0,0062x00000DYwSGAA1,0012x00000SIDYKAA5,10.0,5.0,Vehicle Expenses,8.0,2050.0,16.0,Fortnightly,Loan Paid,27.0
2,0032x00000bqYxSAAU,914285.0,5122056.0,0062x00000DZt4hAAD,0012x00000lIh7OAAS,4.0,1.0,Travel Expenses,15.0,2050.0,15.571,Weekly,Loan Paid,60.0
3,0032x00000bqYxSAAU,929185.0,5274428.0,0062x00000DswhfAAB,0012x00000lIh7OAAS,4.0,1.0,Travel Expenses,14.0,600.0,13.714,Weekly,Loan Paid,60.0
4,0032x00000bdwB7AAI,915382.0,5135477.0,0062x00000Da7IOAAZ,0012x00000leWAwAAM,4.0,0.0,Medical Expenses,6.0,400.0,6.429,Weekly,Loan Paid,31.0


In [5]:
# printing the describe able metric of the dataframe to get few insights about the data
df.describe()

Unnamed: 0,Min_IT_Loan_ID__c,Opp_Number__c,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Applicant Age
count,53300.0,53300.0,53300.0,28851.0,53300.0,53300.0,53300.0,53300.0
mean,1571893.0,4296241.0,9.820769,7.227687,9.316642,1228.546904,13.318978,40.045629
std,1647458.0,1653601.0,7.949194,8.728887,6.85119,1146.742773,7.184554,12.608519
min,444007.0,1527829.0,4.0,0.0,1.0,100.0,1.714,0.0
25%,646958.0,2705498.0,5.0,2.0,5.0,400.0,8.429,30.0
50%,905321.0,4609885.0,7.0,4.0,7.0,800.0,11.429,38.0
75%,1041374.0,5553187.0,11.0,9.0,12.0,2050.0,16.143,49.0
max,5177052.0,7622300.0,99.0,71.0,40.0,5000.0,109.714,88.0


In [6]:
# printing the shape of dataframe to get the count of all rows and columns respectively
df.shape

(53313, 14)

### convert the data into a canonical format


It is important to convert the data into a canonical format. We would like to consider data for only those users for this recommendation engine who have taken at least 3 loans and serviced them to completion. Also, we would form data for only those Loan-IDs that have been given at least 5 times. This will ensure that we are working with some amount of data in the system and our recommendation will not be plagued with cold-start problem for our machine model and also, we have some active valid interactions as the basis for this intelligence that we are building.

In [7]:
# consider data for only those :  
#     --> users taken at least 3 loans & serviced them to completion.  
#     --> Loan-IDs that have been given at least 5 times
df = df[df["Num_Of_Loans_Paid__c"]>=3]
df = df[df["Number_Of_Loans_Granted__c"]>=5]

In [9]:
# checking for the possibility of null values if null values exists in dataset then we need to handle them 
df.isnull().sum()

Contact__c                    0
Min_IT_Loan_ID__c             0
Opp_Number__c                 0
Id                            0
AccountID                     0
Number_Of_Loans_Granted__c    0
Num_Of_Loans_Paid__c          0
Purpose_of_Loan__c            0
Total_Repayments__c           0
Amount                        0
Term_in_Weeks__c              0
Payment_Frequency__c          0
StageName                     0
Applicant Age                 0
dtype: int64

In [10]:
# filling NA values with its mean if they exists

# df['Total_Repayments__c'].fillna(value=df['Total_Repayments__c'].mean(), inplace=True)

In [11]:
# describing the dataframe metrices after thansformation (if exists for null values)
df.describe()

Unnamed: 0,Min_IT_Loan_ID__c,Opp_Number__c,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Applicant Age
count,17871.0,17871.0,17871.0,17871.0,17871.0,17871.0,17871.0,17871.0
mean,1492464.0,5729668.0,13.599519,10.960439,12.55425,1917.114319,18.063637,41.749874
std,1400638.0,980270.4,9.473607,9.267936,7.987312,1392.967851,7.28873,12.000956
min,766657.0,3816623.0,5.0,3.0,1.0,100.0,2.429,18.0
25%,894518.0,4851910.0,7.0,4.0,7.0,500.0,13.714,32.0
50%,996778.0,5532011.0,10.0,7.0,10.0,2050.0,16.143,40.0
75%,1046523.0,6758814.0,17.0,15.0,16.0,2050.0,23.714,50.0
max,5177052.0,7622300.0,74.0,71.0,40.0,5000.0,109.714,87.0


In [12]:
# we need to convert the alpha-numeric AccountID and LoanId into the numeric form because the ALS model only works
#     with int values of loans and users ID.

#### As our usersID were converted in past also so to consider those users we are applying an technique so that we can use those same usersID's

In [None]:
# reading the mapper files of loan and users created with the past Datasheets.
mapper_account_df = pd.read_csv("MappingFiles/AccountMapperDf.csv")
mapper_loans_df = pd.read_csv("MappingFiles/LoanMapperDf.csv")

In [13]:
# userID & LoanID of the prev datasheets
prev_max_account_ID = max(mapper_account_df['accountIndex'])
prev_max_loan_ID = max(mapper_loans_df['loanIndex'])

In [14]:
# extracting out the mapperobj of Loan & users so to map them with current datasheet.
AccountMapperobj = {ele:index for ele,index in zip(mapper_account_df["accountId"],mapper_account_df["accountIndex"]) }
LoanMapperobj = {ele:index for ele,index in zip(mapper_loans_df["LoanId"],mapper_loans_df["loanIndex"]) }

In [15]:
# applying mapper to dataframe by respective ID's
df['userId'] = df['AccountID'].map(AccountMapperobj)
df['loanId'] = df['Id'].map(LoanMapperobj)

In [16]:
# now we know that few users are not in the datasheet we have mapped in the past so we are extract them out of the DF.
#     then we will apply the same mapping technique for those users 
df_nan_userId = df[df['userId'].isnull()]
df_nan_loanId = df[df['loanId'].isnull()]

In [17]:
# creating or generating the userId and LoanId for the rest of the users
AccountMapperobjTmp = {ele: count for count, ele in enumerate(sorted(list(set(df_nan_userId['AccountID']))), prev_max_account_ID+1) }
LoanMapperobjTmp =    {ele: count for count, ele in enumerate(sorted(list(set(df_nan_loanId['Id']))), prev_max_loan_ID+1) }

# now updating the Account mapper object by adding the newly generated userId also  updating the Loan mapper object by adding the newly generated loanId
AccountMapperobj.update(AccountMapperobjTmp)
LoanMapperobj.update(LoanMapperobjTmp)

In [18]:
# creating the dataframe of userId(alpha-numeric) and their unique userIds 
accId,accindex = [],[]
[(accId.append(key),accindex.append(val)) for key,val in AccountMapperobj.items()]
AccountMapperDf = pd.DataFrame({"accountId":accId,
                                "accountIndex":accindex})

# creating the dataframe of loanId(alpha-numeric) and their unique loanIds 
lonId,lonindex = [],[]
[(lonId.append(key),lonindex.append(val)) for key,val in LoanMapperobj.items()]
LoanMapperDf = pd.DataFrame({"LoanId":lonId,
                                "loanIndex":lonindex})

In [19]:
# saving the loanId and userID so that these loanId and userID could be furter used for same user as our data increases.
import os
# File path
file_path,loan_file_path = r"MappingFiles\AccountMapperDf.csv",r"MappingFiles\LoanMapperDf.csv"

# Check if the file exists
if os.path.exists(file_path) and os.path.exists(loan_file_path):
    # Delete the existing file
    os.remove(file_path)
    os.remove(loan_file_path)
    print(f"File '{file_path} and {loan_file_path}' deleted.")

AccountMapperDf.to_csv(file_path,index=False)
LoanMapperDf.to_csv(loan_file_path,index=False)

print(f"New files '{file_path}' and '{loan_file_path}' created.")

File 'MappingFiles\AccountMapperDf.csv and MappingFiles\LoanMapperDf.csv' deleted.
New files 'MappingFiles\AccountMapperDf.csv' and 'MappingFiles\LoanMapperDf.csv' created.


In [20]:
# applying the accountID and loanID for the current dataset.
df["userId"] = df['AccountID'].map(AccountMapperobj)
df["loanId"] = df['Id'].map(LoanMapperobj)

In [21]:
# after apply the mapper onto the dataframe preview of dataset
df.head()

Unnamed: 0,Contact__c,Min_IT_Loan_ID__c,Opp_Number__c,Id,AccountID,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Purpose_of_Loan__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Payment_Frequency__c,StageName,Applicant Age,userId,loanId
0,0032x00000d4m0bAAA,917086.0,5147774.0,0062x00000Dr7o6AAB,0012x00000IMHksAAH,8.0,6.0,Vehicle Expenses,10.0,2050.0,19.143,Fortnightly,Loan Paid,40.0,6583,9879
1,0032x00000d2Py7AAE,908169.0,5061612.0,0062x00000DYwSGAA1,0012x00000SIDYKAA5,10.0,5.0,Vehicle Expenses,8.0,2050.0,16.0,Fortnightly,Loan Paid,27.0,2461,2562
259,0032x00000Y6ucAAAR,1012126.0,6283714.0,0062x00000EbLV1AAN,0012x00000iIC8JAAW,8.0,3.0,Vehicle Expenses,14.0,2050.0,14.286,Weekly,Loan Paid,36.0,10138,12888
288,0032x00000Y6X7yAAF,854077.0,4601912.0,0062x00000D9rBtAAJ,0012x00000iHpCKAA0,13.0,6.0,Home Maintenance & Repairs,16.0,2050.0,15.857,Weekly,Loan Paid,46.0,2880,5429
289,0032x00000Y6X7yAAF,1006191.0,6164466.0,0062x00000ENIsOAAX,0012x00000iHpCKAA0,13.0,7.0,Other Expenses,25.0,2500.0,25.857,Weekly,Loan Paid,46.0,2880,12760


we are working with some amount of data in the system and our recommendation will not be 
plagued with cold-start problem for our machine model and also, we have some active valid interactions as the basis for this intelligence 
that we are building.
Based on the above description, we can create a representation of user-loan interaction matrix using cumulative data.


Cumulative Data Option: Each cell signifying how many times it has been taken. It will be a sum total of all the successful and unsuccessful 
servicing of that specific loan. A negative number would indicate it was served unsuccessfully more than it was served successfully. 

So, if a loan was taken 5 times with 4 of them being serviced successfully and 1 as unsuccessful loan then the overall count should indicate “3” in that particular cell (Count = 1 + 1 + 1 + 1 + (-1) = 3).  

We will have a “NaN” for not having been taken at all if there is no data for a specific user-loan interaction.

In [23]:
# Applying the Cumulative Data Option:sum of all the successful subtraction unsuccessful servicing of specific loan.
df['count'] = (df['Num_Of_Loans_Paid__c'] - (df['Number_Of_Loans_Granted__c'] - df['Num_Of_Loans_Paid__c']))

In [24]:
# transforming the count also termed as ratings
df['count'] = df['count'].apply(lambda x: 0 if x<0 else x)

In [25]:
# creating an function to perform the transformation of count column
#     if count <1                  ==> 0
#     if count >= 1 and count<=10  ==> 1
#     if count >10 and count <100  ==> int(str(x)[0])+1  [43:4, 79:7, 91:9]
#     if count >100                ==> 10
# so by this transformation 0 to 10

def applyRangefunc(x):
    if x<=10 and x>=1:
        return 1
    elif x<1:
        return 0
    elif x>10 and x<100:
        return int(str(x)[0])+1
    elif x>=100:
        return 10
    else:
        return x

In [26]:
# Run this cell only once because if this is runned twice it will convert to false ratings
df['count'] = df['count'].apply(lambda x: applyRangefunc(x))

In [27]:
# so checking our transformation of count column so it is in the range of 0 to 10
set(df['count'])

{0, 1, 2, 3, 4, 5, 6, 7}

In [29]:
# for the making of clusters we are classfying the payment_frequencies into a list 
Payment_Frequency_list = list(set(df['Payment_Frequency__c']))

In [30]:
Payment_Frequency_list

['Monthly', 'Fortnightly', 'Weekly']

<h4>Creating the clusters with the specific range of Amount and total_repayment</h4>

In [31]:
# The clusters making according to the Loan Amount and total repayment 
#     (these range of the cluster is been decided with the help of tablue clusters feature)
Loan_classify_method = [
    {"Cluster" : 1,"ClusterInfo": {"MinAmount": 1150,  "MaxAmount":2250, "Total_repayment_min":2 ,  "Total_repayment_max":10 }},
    {"Cluster" : 2,"ClusterInfo": {"MinAmount": 1200,  "MaxAmount":2300, "Total_repayment_min":11,  "Total_repayment_max":19 }},
    {"Cluster" : 3,"ClusterInfo": {"MinAmount": 100 ,  "MaxAmount":1350, "Total_repayment_min":8 ,  "Total_repayment_max":18 }},
    {"Cluster" : 4,"ClusterInfo": {"MinAmount": 100 ,  "MaxAmount":1200, "Total_repayment_min":1 ,  "Total_repayment_max":8  }},
    {"Cluster" : 5,"ClusterInfo": {"MinAmount": 1600,  "MaxAmount":2000, "Total_repayment_min":20,  "Total_repayment_max":26 }},    
    {"Cluster" : 6,"ClusterInfo": {"MinAmount": 450 ,  "MaxAmount":2500, "Total_repayment_min":33,  "Total_repayment_max":40 }},
    {"Cluster" : 7,"ClusterInfo": {"MinAmount": 4400,  "MaxAmount":5000, "Total_repayment_min":12,  "Total_repayment_max":20 }},
    {"Cluster" : 8,"ClusterInfo": {"MinAmount": 2600,  "MaxAmount":3500, "Total_repayment_min":19,  "Total_repayment_max":28 }},
    {"Cluster" : 9,"ClusterInfo": {"MinAmount": 2300,  "MaxAmount":3250, "Total_repayment_min":2 ,  "Total_repayment_max":10 }},
    {"Cluster" : 10,"ClusterInfo":{"MinAmount": 250 ,  "MaxAmount":1550, "Total_repayment_min":18,  "Total_repayment_max":32 }},
    {"Cluster" : 11,"ClusterInfo":{"MinAmount": 4400,  "MaxAmount":5000, "Total_repayment_min":2 ,  "Total_repayment_max":11 }},
    {"Cluster" : 12,"ClusterInfo":{"MinAmount": 1500,  "MaxAmount":2500, "Total_repayment_min":27,  "Total_repayment_max":32 }},
    {"Cluster" : 13,"ClusterInfo":{"MinAmount": 2400,  "MaxAmount":3350, "Total_repayment_min":11,  "Total_repayment_max":19 }},                       
    {"Cluster" : 14,"ClusterInfo":{"MinAmount": 3300,  "MaxAmount":4400, "Total_repayment_min":2 ,  "Total_repayment_max":12 }},
    {"Cluster" : 15,"ClusterInfo":{"MinAmount": 3300,  "MaxAmount":4250, "Total_repayment_min":13,  "Total_repayment_max":20 }},
    {"Cluster" : 16,"ClusterInfo":{"MinAmount": 4450,  "MaxAmount":5000, "Total_repayment_min":21,  "Total_repayment_max":30 }},
    {"Cluster" : 17,"ClusterInfo":{"MinAmount": 3500,  "MaxAmount":4400, "Total_repayment_min":21,  "Total_repayment_max":30 }},
    {"Cluster" : 18,"ClusterInfo":{"MinAmount": 2500,  "MaxAmount":3400, "Total_repayment_min":28,  "Total_repayment_max":36 }},                       
    {"Cluster" : 19,"ClusterInfo":{"MinAmount": 4000,  "MaxAmount":5000, "Total_repayment_min":35,  "Total_repayment_max":38 }},
    {"Cluster" : 20,"ClusterInfo":{"MinAmount": 3100,  "MaxAmount":3950, "Total_repayment_min":35,  "Total_repayment_max":36 }},
]

In [33]:
# applying the clusters to the dataframe.
LoanID = 1
for cluster in Loan_classify_method:
    cluster_id = cluster['Cluster'] 
    cluster_info_minAmount,cluster_info_maxAmount,cluster_info_total_replayment_min,cluster_info_total_replayment_max = cluster['ClusterInfo']['MinAmount'], cluster['ClusterInfo']['MaxAmount'], cluster['ClusterInfo']['Total_repayment_min'], cluster['ClusterInfo']['Total_repayment_max']
    df2 = pd.DataFrame()
    df2 = df[(((df['Amount']>=cluster_info_minAmount) & (df['Amount']<=cluster_info_maxAmount)) & ((df['Total_Repayments__c']>=cluster_info_total_replayment_min) & (df['Total_Repayments__c']<=cluster_info_total_replayment_max)))].copy()
    for pfl in range(len(Payment_Frequency_list)):
        df4 = df2[(df2['Payment_Frequency__c']==Payment_Frequency_list[pfl])].copy()
        df4['LoanIdFormat'] = LoanID
        LoanID += 1
        dftemp = pd.concat([dftemp, df4], ignore_index=True)

In [34]:
# printing few rows of the temp dataframe
dftemp.head()

Unnamed: 0,Contact__c,Min_IT_Loan_ID__c,Opp_Number__c,Id,AccountID,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Purpose_of_Loan__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Payment_Frequency__c,StageName,Applicant Age,userId,loanId,count,LoanIdFormat
0,0032x00000XaCMZAA3,1014484.0,6327397.0,0062x00000Ec3OhAAJ,0012x00000hYeJ7AAK,6.0,3.0,Living Expenses,4.0,1250.0,16.0,Monthly,Loan Paid,39.0,10132,12931,0,1
1,0030K00001MaUFQQA3,945258.0,5519027.0,0062x00000E3omCAAR,0010K00001cQWDjQAO,47.0,46.0,Travel Expenses,5.0,2050.0,20.143,Monthly,Loan Paid,43.0,37,10736,5,1
2,0030K00001MaUFQQA3,919931.0,5180314.0,0062x00000DrdLCAAZ,0010K00001cQWDjQAO,47.0,45.0,Travel Expenses,4.0,2050.0,16.429,Monthly,Loan Paid,43.0,37,2729,5,1
3,0030K00001SCJMgQAP,939812.0,5431995.0,0062x00000DVGTYAA5,0010K00001hTzwHQAS,46.0,44.0,Living Expenses,5.0,2050.0,19.857,Monthly,Loan Paid,36.0,146,2281,5,1
4,0030K00001MaUFQQA3,911555.0,5098387.0,0062x00000DZWmGAAX,0010K00001cQWDjQAO,47.0,44.0,Home Maintenance & Repairs,2.0,2000.0,5.0,Monthly,Loan Paid,43.0,37,2608,5,1


In [35]:
# printing the description of the columns of temp dataframe.
dftemp.describe()

Unnamed: 0,Min_IT_Loan_ID__c,Opp_Number__c,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Applicant Age,userId,loanId,count,LoanIdFormat
count,17079.0,17079.0,17079.0,17079.0,17079.0,17079.0,17079.0,17079.0,17079.0,17079.0,17079.0,17079.0
mean,1506709.0,5783584.0,13.641138,11.025001,11.395222,1857.020317,17.399922,41.878037,4859.770244,8275.21383,1.382107,14.728731
std,1411302.0,1000568.0,9.490985,9.270974,7.47251,1450.436335,7.282157,12.035892,3105.444245,3919.747101,0.966543,13.210238
min,766657.0,3816623.0,5.0,3.0,1.0,100.0,2.429,18.0,1.0,3.0,0.0,1.0
25%,896084.5,4855035.0,7.0,4.0,7.0,500.0,13.429,32.0,2122.0,4493.5,1.0,6.0
50%,998967.0,5809178.0,10.0,7.0,9.0,2000.0,15.857,41.0,4644.0,8090.0,1.0,11.0
75%,1048209.0,6784469.0,17.0,15.0,15.0,2050.0,21.714,50.0,7602.5,12303.5,2.0,20.0
max,5177052.0,7622300.0,74.0,71.0,40.0,5000.0,109.714,87.0,10143.0,13902.0,7.0,60.0


In [43]:
# exporting the temp dataframe to an CSV file to use for creating the recommendation model.
dftemp.to_csv("df_created_.csv",index=False)