In [1]:
import pandas as pd

## Notebook that puts all the data together: the selected client-information variables, combined with all loans and their defaults

In [2]:
# Read the three input tables: loans, client information and default records.
loans, client, default = (
    pd.read_csv(path)
    for path in (
        '../data/loan_information.csv',
        '../data/client_information_missing.csv',
        '../data/default_data.csv',
    )
)

In [3]:
# Remove the loan columns that are not used in the rest of this notebook.
unused_loan_columns = ['openbalance', 'accstartdate', 'first_month',
                       'last_month', 'searchdate']
loans = loans.drop(columns=unused_loan_columns)

In [4]:
# Read the chosen variable names, one per line.
# The context manager closes the file handle, and `splitlines` plus the
# blank-line filter means the last name is kept whether or not the file ends
# with a newline (slicing off the final element dropped it when it did not).
with open('../data/chosen_variables.txt') as variables_file:
    variables = [name for name in variables_file.read().splitlines() if name]

In [5]:
# Add every client column whose name ends in 'o' to the selection.
# Names already present are skipped: a repeated name would select the same
# column twice from `client`, and re-running this cell would otherwise keep
# appending the same names again.
variables = variables + [
    col for col in client.columns
    if col.endswith('o') and col not in variables
]

In [6]:
# Keep only the selected variables, in the order given by `variables`.
client = client.loc[:, variables]

In [7]:
# Not every loan has a default record. `joined` is a constant marker that is
# only present on rows that find a match once this table is joined to the
# loans, so the unmatched loans can be excluded afterwards.
# `default` is 1 when dval is positive and 0 otherwise (missing dval gives 0).
default = default.assign(
    joined=1,
    default=(default['dval'] > 0).astype('int64'),
).drop(columns=['dval', 'dmon'])

In [8]:
# Attach the default information to each loan, matching on (uid, recordnumber).
# `join` keeps every loan row; loans without a match get NaN in the new columns.
loan_keys = ['uid', 'recordnumber']
default_by_loan = default.set_index(loan_keys)
data = loans.join(default_by_loan, on=loan_keys)

In [9]:
# Keep only the loans that matched a default record, then discard the marker.
has_default_record = data['joined'] == 1
data = data.loc[has_default_record].drop(columns='joined')

In [10]:
# Restrict the loans to the considered period: repayperiod from 12 to 60,
# both bounds included. Rows with a null repayperiod fail the test and are
# dropped as well.
data = data[data['repayperiod'].between(12, 60)]

In [11]:
# Feature: number of loans held by each uid among the rows kept so far
# (counts the non-null recordnumber values per uid).
loans_per_uid = data.groupby('uid')['recordnumber'].count()
total_loans = loans_per_uid.rename('number_of_loans').reset_index()

In [12]:
# recordnumber and repayperiod are not kept in the output table.
data = data.drop(columns=['recordnumber', 'repayperiod'])

In [13]:
# Bring in the client variables and the per-uid loan count, both keyed on uid.
client_by_uid = client.set_index('uid')
loan_count_by_uid = total_loans.set_index('uid')
data = data.join(client_by_uid, on='uid').join(loan_count_by_uid, on='uid')

In [14]:
def standardise(x):
    """
    Standardise a continuous variable: subtract its mean and divide by its
    standard deviation.

    Parameters
    ----------
    x : pandas.Series or pandas.DataFrame
        Values to rescale. A DataFrame is rescaled column by column.

    Returns
    -------
    Same kind of object as ``x``
        ``(x - x.mean ()) / x.std ()``. Where the standard deviation is zero
        or NaN (for example a constant column) the centred values are
        returned unscaled, instead of the NaN/inf a division would produce.
    """
    centred = x - x.mean()
    spread = x.std()

    if isinstance(spread, pd.Series):
        # DataFrame input: one deviation per column. Replace the degenerate
        # ones with 1 so those columns are centred but not divided.
        spread = spread.mask(spread.isna() | (spread == 0), 1.0)
    elif pd.isna(spread) or spread == 0:
        return centred

    return centred / spread

In [15]:
# Standardise the continuous variables: every entry of `variables` after the
# first, except those whose name ends in 'o' or 'c'.
for name in variables[1:]:
    if name[-1] not in ('o', 'c'):
        data[name] = standardise(data[name])

In [16]:
# One-hot encode the variables whose name ends in 'o' or 'c' (skipping the
# first entry of `variables`), with column headings '<variable>_<level>'.
categorical = [c for c in variables[1:] if c.endswith(('o', 'c'))]

# `prefix` builds the '<variable>_<level>' headings and `drop_first` leaves
# out the first level of each variable, replacing the manual renaming and
# `.iloc [:, 1:]` slice. The dtype is pinned so the indicators are written as
# 0/1 whatever the pandas version (pandas >= 2.0 defaults to booleans).
encoded = [
    pd.get_dummies(data[c], prefix=c, drop_first=True, dtype='uint8')
    for c in categorical
]

# Replace the raw columns with their indicators in a single concat rather
# than inserting new columns one at a time. The resulting column order is
# unchanged: remaining columns first, then each variable's indicators in turn.
data = pd.concat([data.drop(columns=categorical)] + encoded, axis=1)

In [17]:
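# Disabled: split the rows by outcome (only used by the commented-out resampling in the next cell).
#default_data = data [data.default == 1]
#clean_data = data [data.default == 0]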

In [18]:
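# Disabled: resampling that stacks the default rows with two full copies of the non-default rows plus a seeded sample of 1330 of them.
#data = pd.concat ([default_data, clean_data, clean_data, clean_data.sample (1330, random_state=0)])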

In [19]:
# Save the combined table; the row index is not written to the file.
output_path = '../data/data.csv'
data.to_csv(output_path, index=False)