In [1]:
import pandas as pd
import numpy as np

In [2]:
# reading the raw dataset
df = pd.read_csv("Dataset/Loan_recommend.csv")

In [3]:
# initializing an temp blank dataframe 
dftemp = pd.DataFrame()

In [4]:
# printing the head of the loan recommend dataset which is in the form of dataframe.
df.head()

Unnamed: 0,Contact__c,Min_IT_Loan_ID__c,Opp_Number__c,Id,AccountID,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Purpose_of_Loan__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Payment_Frequency__c,StageName
0,0032x00000d2Py7AAE,835652,4434431,0062x00000D2E3CAAV,0012x00000SIDYKAA5,10,4.0,Living Expenses,7,200,13.857,Fortnightly,Loan Paid
1,0032x00000bqYxSAAU,846129,4529720,0062x00000D8MwOAAV,0012x00000lIh7OAAS,4,0.0,Event - Holiday - Accommodation,3,500,6.571,Fortnightly,Loan Paid
2,0032x00000WjhLpAAJ,851865,4582822,0062x00000D9S6fAAF,0012x00000gmjS2AAI,4,0.0,One-off purchase,6,1200,6.143,Weekly,Loan Paid
3,0032x00000WjgarAAB,5149616,4581851,0062x00000D9QbVAAV,0012x00000gmie2AAA,5,0.0,Living Expenses,3,200,5.571,Fortnightly,Loan Paid
4,0032x00000WjgWaAAJ,851686,4581757,0062x00000D9QRzAAN,0012x00000gmiaiAAA,4,0.0,Wedding,2,700,7.143,Monthly,Loan Paid


In [5]:
# Dropping columns because these columns are of no use for use.
# we only need columns like userId, loanId, Amount, Repayment_time, etc.
# StageName ==>  all the StageName are same i.e. "Loan Paid".
df.drop(["Contact__c", "Min_IT_Loan_ID__c", "Opp_Number__c","StageName"], axis=1,inplace=True)

In [6]:
# printing the describe able metric of the dataframe to get few insights about the data
df.describe()

Unnamed: 0,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Total_Repayments__c,Amount,Term_in_Weeks__c
count,20000.0,12371.0,20000.0,20000.0,20000.0
mean,15.8129,8.208956,7.8854,983.5525,11.396368
std,15.148973,11.936401,5.02299,847.369758,5.624189
min,4.0,0.0,2.0,100.0,2.857
25%,4.0,1.0,5.0,400.0,6.429
50%,7.0,2.0,6.0,600.0,10.143
75%,31.0,10.0,10.0,1300.0,14.143
max,99.0,71.0,40.0,5000.0,66.857


In [7]:
df.head()

Unnamed: 0,Id,AccountID,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Purpose_of_Loan__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Payment_Frequency__c
0,0062x00000D2E3CAAV,0012x00000SIDYKAA5,10,4.0,Living Expenses,7,200,13.857,Fortnightly
1,0062x00000D8MwOAAV,0012x00000lIh7OAAS,4,0.0,Event - Holiday - Accommodation,3,500,6.571,Fortnightly
2,0062x00000D9S6fAAF,0012x00000gmjS2AAI,4,0.0,One-off purchase,6,1200,6.143,Weekly
3,0062x00000D9QbVAAV,0012x00000gmie2AAA,5,0.0,Living Expenses,3,200,5.571,Fortnightly
4,0062x00000D9QRzAAN,0012x00000gmiaiAAA,4,0.0,Wedding,2,700,7.143,Monthly


### Converting the data into canonical foramt


It is important to convert the data into a canonical format. We would like to consider data for only those users for this recommendation engine who have taken at least 3 loans and serviced them to completion. Also, we would form data for only those Loan-IDs that have been given at least 5 times. This will ensure that we are working with some amount of data in the system and our recommendation will not be plagued with cold-start problem for our machine model and also, we have some active valid interactions as the basis for this intelligence that we are building.

In [11]:
# consider data for only those :  
#     --> users taken at least 3 loans & serviced them to completion.  
df = df[df["Num_Of_Loans_Paid__c"]>=3]

In [12]:
# checking for the possibility of null values if null values exists in dataset then we need to handle them 
df.isnull().sum()

Id                            0
AccountID                     0
Number_Of_Loans_Granted__c    0
Num_Of_Loans_Paid__c          0
Purpose_of_Loan__c            0
Total_Repayments__c           0
Amount                        0
Term_in_Weeks__c              0
Payment_Frequency__c          0
dtype: int64

In [13]:
# filling NA values with its mean if they exists

# df['Total_Repayments__c'].fillna(value=df['Total_Repayments__c'].mean(), inplace=True)

In [14]:
# describing the dataframe metrices after thansformation (if exists for null values)
df.describe()

Unnamed: 0,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Total_Repayments__c,Amount,Term_in_Weeks__c
count,5450.0,5450.0,5450.0,5450.0,5450.0
mean,20.635046,17.673394,10.330275,1353.229358,14.988388
std,13.329589,12.752674,6.035551,1051.341954,6.2395
min,4.0,3.0,2.0,200.0,4.571
25%,8.0,5.0,7.0,500.0,10.429
50%,18.0,15.0,8.0,1000.0,14.0
75%,31.0,28.0,14.0,2050.0,16.143
max,73.0,71.0,40.0,5000.0,66.857


In [10]:
# we need to convert the alpha-numeric AccountID and LoanId into the numeric form because the ALS model only works
#     with int values of loans and users ID.
AccountMapperobj = {ele: count for count, ele in enumerate(sorted(list(set(df['AccountID']))), 1)}
LoanMapperobj = {ele: count for count, ele in enumerate(sorted(list(set(df['Id']))), 1) }

In [11]:
# creating the dataframe of userId(alpha-numeric) and their unique userIds 
accId,accindex = [],[]
[(accId.append(key),accindex.append(val)) for key,val in AccountMapperobj.items()]
AccountMapperDf = pd.DataFrame({"accountId":accId,
                                "accountIndex":accindex})
# creating the dataframe of loanId(alpha-numeric) and their unique loanIds 
lonId,lonindex = [],[]
[(lonId.append(key),lonindex.append(val)) for key,val in LoanMapperobj.items()]
LoanMapperDf = pd.DataFrame({"LoanId":lonId,
                                "loanIndex":lonindex})

In [12]:
# saving the loanId and userID so that these loanId and userID could be furter used for same user as our data increases.
import os
# File path to save the userID and loanID
file_path,loan_file_path = r"MappingFiles\AccountMapperDf.csv",r"MappingFiles\LoanMapperDf.csv"

# Check if the file exists
if os.path.exists(file_path) and os.path.exists(loan_file_path):
    # Delete the existing file
    os.remove(file_path)
    os.remove(loan_file_path)
    print(f"File '{file_path} and {loan_file_path}' deleted.")

AccountMapperDf.to_csv(file_path,index=False)
LoanMapperDf.to_csv(loan_file_path,index=False)

print(f"New files '{file_path}' and '{loan_file_path}' created.")

File 'MappingFiles\AccountMapperDf.csv and MappingFiles\LoanMapperDf.csv' deleted.
New files 'MappingFiles\AccountMapperDf.csv' and 'MappingFiles\LoanMapperDf.csv' created.


In [13]:
# applying the accountID and loanID for the current dataset. 
df['userId'] = df['AccountID'].map(AccountMapperobj)
df['loanId'] = df['Id'].map(LoanMapperobj)

In [14]:
# after apply the mapper onto the dataframe preview of dataset
df.head(10)

Unnamed: 0,Contact__c,Min_IT_Loan_ID__c,Opp_Number__c,Id,AccountID,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Purpose_of_Loan__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Payment_Frequency__c,StageName,userId,loanId
0,0032x00000d2Py7AAE,835652,4434431,0062x00000D2E3CAAV,0012x00000SIDYKAA5,10,4.0,Living Expenses,7,200,13.857,Fortnightly,Loan Paid,2461,1409
1497,0032x00000U1vcRAAR,843655,4507293,0062x00000D7wReAAJ,0012x00000dbzMmAAI,12,5.0,Medical Expenses,5,300,10.0,Fortnightly,Loan Paid,2788,1546
1587,0032x00000U1DnVAAV,831751,4400020,0062x00000D1c1IAAR,0012x00000dAVT5AAO,13,6.0,Vehicle Expenses,16,5000,16.143,Weekly,Loan Paid,2778,1349
1588,0032x00000U1DnVAAV,838701,4464122,0062x00000D71SmAAJ,0012x00000dAVT5AAO,13,7.0,Medical Expenses,16,5000,15.857,Weekly,Loan Paid,2778,1457
1684,0032x00000TTqjTAAT,846608,4535022,0062x00000D8TpPAAV,0012x00000cLrzcAAC,8,4.0,Vehicle Expenses,10,1000,10.429,Weekly,Loan Paid,2743,1605
1724,0032x00000TTepWAAT,5147349,4472671,0062x00000D7DwRAAV,0012x00000cL1gEAAS,6,4.0,Medical Expenses,5,500,9.857,Fortnightly,Loan Paid,2741,1484
1731,0032x00000TTarXAAT,5146415,4417727,0062x00000D1ul3AAB,0012x00000cKw95AAC,7,4.0,Furniture or Appliances,7,600,13.0,Fortnightly,Loan Paid,2738,1385
1740,0032x00000TTY7BAAX,840683,4478475,0062x00000D7KD5AAN,0012x00000cKtUeAAK,15,6.0,Living Expenses,14,1000,13.714,Weekly,Loan Paid,2736,1495
1757,0032x00000TTP0XAAX,5149468,4575098,0062x00000D9Hp4AAF,0012x00000cKXugAAG,7,4.0,Child Expenses,7,1250,12.571,Fortnightly,Loan Paid,2733,1684
1769,0032x00000TTKiBAAX,5148312,4522205,0062x00000D8EyRAAV,0012x00000cKGcjAAG,7,4.0,Home Maintenance & Repairs,6,300,5.857,Weekly,Loan Paid,2732,1587


we are working with some amount of data in the system and our recommendation will not be 
plagued with cold-start problem for our machine model and also, we have some active valid interactions as the basis for this intelligence 
that we are building.
Based on the above description, we can create a representation of user-loan interaction matrix using cumulative data.


Cumulative Data Option: Each cell signifying how many times it has been taken. It will be a sum total of all the successful and unsuccessful 
servicing of that specific loan. A negative number would indicate it was served unsuccessfully more than it was served successfully. So, if a
loan was taken 5 times with 4 of them being serviced successfully and 1 as unsuccessful loan then the overall count should indicate “3” in that
particular cell (Count = 1 + 1 + 1 + 1 + (-1) = 3).  We will have a “NaN” for not having been taken at all if there is no data for a specific 
user-loan interaction.

In [15]:
# Applying the Cumulative Data Option:sum of all the successful subtraction unsuccessful servicing of specific loan.
df['count'] = (df['Num_Of_Loans_Paid__c'] - (df['Number_Of_Loans_Granted__c'] - df['Num_Of_Loans_Paid__c']))

In [16]:
# Checking all the possible values of count column
set(df['count'])

{-4.0,
 -3.0,
 -2.0,
 -1.0,
 0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 65.0,
 67.0,
 69.0}

In [17]:
# transforming the count also termed as ratings  
df['count'] = df['count'].apply(lambda x: 0 if x<0 else x)

In [19]:
# creating an function to perform the transformation of count column
#     if count <1                  ==> 0
#     if count >= 1 and count<=10  ==> 1
#     if count >10 and count <100  ==> int(str(x)[0])+1  [43:4, 79:7, 91:9]
#     if count >100                ==> 10
# so by this transformation 0 to 10

def applyRangefunc(x):
    if x<=10 and x>=1:
        return 1
    elif x<1:
        return 0
    elif x>10 and x<100:
        return int(str(x)[0])+1
    elif x>=100:
        return 10
    else:
        return x

In [20]:
# Run this cell only once because if this is runned twice it will convert to false ratings
df['count'] = df['count'].apply(lambda x: applyRangefunc(x))
# df['count'] = df['count'].apply(lambda x: 10 if x>100 else x)

In [21]:
# so checking our transformation of count column so it is in the range of 0 to 10
set(df['count'])

{0, 1, 2, 3, 4, 5, 6, 7}

In [23]:
# for the making of clusters we are classfying the payment_frequencies into a list 
Payment_Frequency_list = list(set(df['Payment_Frequency__c']))

In [24]:
# The clusters making according to the Loan Amount and total repayment 
#     (these range of the cluster is been decided with the help of tablue clusters feature)
Loan_classify_method = [
    {"Cluster" : 1,"ClusterInfo":{"MinAmount":1300,"MaxAmount":2500,"Total_repayment_min":3,"Total_repayment_max":11}},
    {"Cluster" : 2,"ClusterInfo":{"MinAmount":200,"MaxAmount":1300,"Total_repayment_min":3,"Total_repayment_max":10}},
    {"Cluster" : 3,"ClusterInfo":{"MinAmount":200,"MaxAmount":1400,"Total_repayment_min":10,"Total_repayment_max":16}},
    {"Cluster" : 4,"ClusterInfo":{"MinAmount":500,"MaxAmount":1300,"Total_repayment_min":18,"Total_repayment_max":26}},
    {"Cluster" : 5,"ClusterInfo":{"MinAmount":1500,"MaxAmount":2600,"Total_repayment_min":19,"Total_repayment_max":26}},    
    {"Cluster" : 6,"ClusterInfo":{"MinAmount":1450,"MaxAmount":2500,"Total_repayment_min":12,"Total_repayment_max":19}},
    {"Cluster" : 7,"ClusterInfo":{"MinAmount":2600,"MaxAmount":3800,"Total_repayment_min":4,"Total_repayment_max":12}},
    {"Cluster" : 8,"ClusterInfo":{"MinAmount":3950,"MaxAmount":5000,"Total_repayment_min":4,"Total_repayment_max":12}},
    {"Cluster" : 9,"ClusterInfo":{"MinAmount":2800,"MaxAmount":4000,"Total_repayment_min":13,"Total_repayment_max":18}},
    {"Cluster" : 10,"ClusterInfo":{"MinAmount":4500,"MaxAmount":5000,"Total_repayment_min":13,"Total_repayment_max":30}},
    {"Cluster" : 11,"ClusterInfo":{"MinAmount":2050,"MaxAmount":2500,"Total_repayment_min":27,"Total_repayment_max":32}},
    {"Cluster" : 12,"ClusterInfo":{"MinAmount":1700,"MaxAmount":4000,"Total_repayment_min":36,"Total_repayment_max":40}},
    {"Cluster" : 13,"ClusterInfo":{"MinAmount":2800,"MaxAmount":4000,"Total_repayment_min":20,"Total_repayment_max":24}},                       
    {"Cluster" : 14,"ClusterInfo":{"MinAmount":3000,"MaxAmount":4000,"Total_repayment_min":30,"Total_repayment_max":36}},
    {"Cluster" : 15,"ClusterInfo":{"MinAmount":600,"MaxAmount":1050,"Total_repayment_min":36,"Total_repayment_max":36}}
]

In [26]:
# applying the clusters to the dataframe.
LoanID = 1
for cluster in Loan_classify_method:
    cluster_id = cluster['Cluster'] 
    cluster_info_minAmount,cluster_info_maxAmount,cluster_info_total_replayment_min,cluster_info_total_replayment_max = cluster['ClusterInfo']['MinAmount'], cluster['ClusterInfo']['MaxAmount'], cluster['ClusterInfo']['Total_repayment_min'], cluster['ClusterInfo']['Total_repayment_max']
    df2 = pd.DataFrame()
    df2 = df[(((df['Amount']>=cluster_info_minAmount) & (df['Amount']<=cluster_info_maxAmount)) & ((df['Total_Repayments__c']>=cluster_info_total_replayment_min) & (df['Total_Repayments__c']<=cluster_info_total_replayment_max)))].copy()
    for pfl in range(len(Payment_Frequency_list)):
        df4 = df2[(df2['Payment_Frequency__c']==Payment_Frequency_list[pfl])].copy()
        df4['LoanIdFormat'] = LoanID
        LoanID += 1
        dftemp = pd.concat([dftemp, df4], ignore_index=True)

In [27]:
# printing few rows of the temp dataframe
dftemp.head()

Unnamed: 0,Contact__c,Min_IT_Loan_ID__c,Opp_Number__c,Id,AccountID,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Purpose_of_Loan__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Payment_Frequency__c,StageName,userId,loanId,count,LoanIdFormat
0,0032800001GwIZJAA3,872195,4736264,0062x00000DGh5bAAD,0012800001Z5quwAAB,50,44.0,Vehicle Expenses,10,2000,9.857,Weekly,Loan Paid,1319,1985,4,1
1,0030K00001vJaohQAC,5155270,4833598,0062x00000DIRQxAAP,0010K000026N8lUQAS,48,46.0,Travel Expenses,10,2000,9.857,Weekly,Loan Paid,797,2147,5,1
2,00328000019urd2AAA,911400,5095196,0062x00000DZUOCAA5,0012800001SDTBIAA5,43,39.0,Living Expenses,6,2000,6.0,Weekly,Loan Paid,1090,2598,4,1
3,0030K000023uM46QAE,5157393,4942438,0062x00000DWsO3AAL,0010K000029H14BQAS,41,37.0,Life Event Expenses,10,1400,10.143,Weekly,Loan Paid,884,2411,4,1
4,0030K00001kOHSFQA4,877335,4776137,0062x00000DHOd7AAH,0010K00001xksnkQAA,39,35.0,Home Maintenance & Repairs,10,2050,10.143,Weekly,Loan Paid,438,2053,4,1


In [28]:
# printing the description of the columns of temp dataframe.
dftemp.describe()

Unnamed: 0,Min_IT_Loan_ID__c,Opp_Number__c,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Total_Repayments__c,Amount,Term_in_Weeks__c,userId,loanId,count,LoanIdFormat
count,5010.0,5010.0,5010.0,5010.0,5010.0,5010.0,5010.0,5010.0,5010.0,5010.0,5010.0
mean,1959020.0,5341132.0,22.779042,19.710579,10.216168,1337.924152,14.75965,1264.946906,2339.320958,2.208982,8.827545
std,1818677.0,1021570.0,12.770361,12.253514,5.706869,1060.988985,5.980024,799.771044,1345.85317,1.223823,7.092704
min,767271.0,3817099.0,6.0,4.0,3.0,200.0,4.571,1.0,1.0,0.0,1.0
25%,842445.0,4376282.0,10.0,7.0,7.0,500.0,10.286,632.0,1152.25,1.0,4.0
50%,1021186.0,4971510.0,27.0,23.0,8.0,1000.0,14.0,1166.5,2362.0,2.0,6.0
75%,1060562.0,6495322.0,32.0,29.0,14.0,2050.0,16.143,1875.75,3537.75,3.0,9.0
max,5175861.0,7395202.0,73.0,71.0,40.0,5000.0,66.857,2880.0,4594.0,7.0,43.0


In [34]:
# exporting the temp dataframe to an CSV file to use for creating the recommendation model.
dftemp.to_csv("df_temp_cluster_rating_condition_data.csv",index=False)