## Import libraries

In [1]:
import pandas as pd

## 1. Normalize the loan_lenders table. In the normalized table, each row must have one loan_id and one lender.
### Load dataset

In [2]:
loans_lenders_df = pd.read_csv("additional-kiva-snapshot/loans_lenders.csv")

### Basic data exploration and statistics

In [3]:
loans_lenders_df.head()

Unnamed: 0,loan_id,lenders
0,483693,"muc888, sam4326, camaran3922, lachheb1865, reb..."
1,483738,"muc888, nora3555, williammanashi, barbara5610,..."
2,485000,"muc888, terrystl, richardandsusan8352, sherri4..."
3,486087,"muc888, james5068, rudi5955, daniel9859, don92..."
4,534428,"muc888, niki3008, teresa9174, mike4896, david7..."


In [4]:
loans_lenders_df.tail()

Unnamed: 0,loan_id,lenders
1387427,678999,"michael43411218, carol5987, gooddogg1, chris41..."
1387428,1207353,"rjhoward1986, jeffrey6870, trolltech4460, elys..."
1387429,1206220,"vicky7746, gooddogg1, fairspirit, craig9729960..."
1387430,1206425,"rich6705, sergiiy9766, angela7509, barbara5610..."
1387431,1206486,"alan5175, amy38101311"


How many records are there?

In [5]:
loans_lenders_df.shape

(1387432, 2)

How many NA values are in this dataframe?

In [6]:
loans_lenders_df.isna().sum()

loan_id    0
lenders    0
dtype: int64

Are there duplicated loan_id values?

In [7]:
len(loans_lenders_df['loan_id'].unique())

1387432

That's good, each id is unique. I can now focus on how to normalize the table: the first thing to do will be to split the strings in the lenders column, so that we can have a list of lenders

In [8]:
loans_lenders_df['lenders'] = loans_lenders_df['lenders'].apply(lambda x : x.split(','))

In [9]:
loans_lenders_df = loans_lenders_df.explode('lenders').reset_index(drop=True)

Let's see if everything work as expected:

In [10]:
loans_lenders_df.head()

Unnamed: 0,loan_id,lenders
0,483693,muc888
1,483693,sam4326
2,483693,camaran3922
3,483693,lachheb1865
4,483693,rebecca3499


In [11]:
loans_lenders_df.tail()

Unnamed: 0,loan_id,lenders
28293926,1206425,trogdorfamily7622
28293927,1206425,danny6470
28293928,1206425,don6118
28293929,1206486,alan5175
28293930,1206486,amy38101311


Just for curiosity let's have a look at a random row:

In [12]:
loans_lenders_df.iloc[45]

loan_id           483738
lenders     danhostetler
Name: 45, dtype: object

In [13]:
loans_lenders_df.shape

(28293931, 2)

## 2. For each loan, add a column duration corresponding to the number of days between the disburse time and the planned expiration time. If any of those two dates is missing, also the duration must be missing.

In [14]:
loans_df = pd.read_csv("additional-kiva-snapshot/loans.csv")

In [15]:
loans_df.columns

Index(['loan_id', 'loan_name', 'original_language', 'description',
       'description_translated', 'funded_amount', 'loan_amount', 'status',
       'activity_name', 'sector_name', 'loan_use', 'country_code',
       'country_name', 'town_name', 'currency_policy',
       'currency_exchange_coverage_rate', 'currency', 'partner_id',
       'posted_time', 'planned_expiration_time', 'disburse_time',
       'raised_time', 'lender_term', 'num_lenders_total',
       'num_journal_entries', 'num_bulk_entries', 'tags', 'borrower_genders',
       'borrower_pictured', 'repayment_interval', 'distribution_model'],
      dtype='object')

In [16]:
loans_df.head()

Unnamed: 0,loan_id,loan_name,original_language,description,description_translated,funded_amount,loan_amount,status,activity_name,sector_name,...,raised_time,lender_term,num_lenders_total,num_journal_entries,num_bulk_entries,tags,borrower_genders,borrower_pictured,repayment_interval,distribution_model
0,657307,Aivy,English,"Aivy, 21 years of age, is single and lives in ...",,125.0,125.0,funded,General Store,Retail,...,2014-01-15 04:48:22.000 +0000,7.0,3,2,1,,female,True,irregular,field_partner
1,657259,Idalia Marizza,Spanish,"Doña Idalia, esta casada, tiene 57 años de eda...","Idalia, 57, is married and lives with her husb...",400.0,400.0,funded,Used Clothing,Clothing,...,2014-02-25 06:42:06.000 +0000,8.0,11,2,1,,female,True,monthly,field_partner
2,658010,Aasia,English,Aasia is a 45-year-old married lady and she ha...,,400.0,400.0,funded,General Store,Retail,...,2014-01-24 23:06:18.000 +0000,14.0,16,2,1,"#Woman Owned Biz, #Supporting Family, user_fav...",female,True,monthly,field_partner
3,659347,Gulmira,Russian,"Гулмире 36 лет, замужем, вместе с супругом вос...",Gulmira is 36 years old and married. She and ...,625.0,625.0,funded,Farming,Agriculture,...,2014-01-22 05:29:28.000 +0000,14.0,21,2,1,user_favorite,female,True,monthly,field_partner
4,656933,Ricky\t,English,Ricky is a farmer who currently cultivates his...,,425.0,425.0,funded,Farming,Agriculture,...,2014-01-14 17:29:27.000 +0000,7.0,15,2,1,"#Animals, #Eco-friendly, #Sustainable Ag",male,True,bullet,field_partner


In [17]:
loans_df.tail()

Unnamed: 0,loan_id,loan_name,original_language,description,description_translated,funded_amount,loan_amount,status,activity_name,sector_name,...,raised_time,lender_term,num_lenders_total,num_journal_entries,num_bulk_entries,tags,borrower_genders,borrower_pictured,repayment_interval,distribution_model
1419602,988180,,,,,400.0,400.0,funded,Tailoring,Services,...,2015-12-28 15:44:18.000 +0000,14.0,16,4,2,"#Parent, #Repeat Borrower, #Woman Owned Biz",,,monthly,field_partner
1419603,988213,Perlita,English,"Perlita is 52 years old, married and has three...","Perlita is 52 years old, married and has three...",300.0,300.0,funded,Pigs,Agriculture,...,2015-12-22 10:37:06.000 +0000,14.0,12,1,1,"#Animals, #Elderly, #Repeat Borrower, #Woman O...",female,true,irregular,field_partner
1419604,989109,Okyeso Nyame Group,English,Okyeso Nyame group will begin its third cycle ...,Okyeso Nyame group will begin its third cycle ...,2425.0,2425.0,funded,Bakery,Food,...,2015-12-26 20:24:47.000 +0000,8.0,76,2,1,"user_favorite, #Parent, #Vegan, #Woman Owned B...","female, female, female, male, male, female","true, true, true, true, true, true",irregular,field_partner
1419605,989143,Exequila,English,"Exequila is from San Miguel, Bohol. She is in...","Exequila is from San Miguel, Bohol. She is in...",100.0,100.0,funded,Farming,Agriculture,...,2015-12-06 21:03:57.000 +0000,12.0,3,1,1,,female,true,irregular,field_partner
1419606,989240,Lydia,French,Lydia a 37ans et habite dans une zone rurale. ...,Lydia is 37 years old and lives in a rural are...,175.0,175.0,funded,Sewing,Services,...,2015-12-04 23:17:04.000 +0000,14.0,7,1,1,,female,true,monthly,field_partner


In [18]:
loans_df.describe()

Unnamed: 0,loan_id,funded_amount,loan_amount,currency_exchange_coverage_rate,partner_id,lender_term,num_lenders_total,num_journal_entries,num_bulk_entries
count,1419607.0,1419607.0,1419607.0,1098081.0,1402817.0,1419583.0,1419607.0,1419607.0,1419607.0
mean,723371.3,796.1254,832.2284,0.1163657,149.6207,13.05139,22.25389,1.502054,1.134976
std,415676.6,1034.257,1080.551,0.03699645,87.69345,7.56666,27.7741,0.9903614,0.4950988
min,84.0,0.0,25.0,0.1,1.0,1.0,0.0,1.0,1.0
25%,364216.5,275.0,300.0,0.1,98.0,8.0,8.0,1.0,1.0
50%,724035.0,500.0,500.0,0.1,139.0,12.0,15.0,1.0,1.0
75%,1082972.0,950.0,1000.0,0.1,174.0,14.0,27.0,2.0,1.0
max,1444085.0,100000.0,100000.0,0.2,557.0,195.0,3045.0,48.0,24.0


There are many columns! At the moment only disburse_time and planned_expiration_time seems relevant, therefore it is better to filter the df!

In [19]:
columns_of_interest = ['loan_id', 'disburse_time','planned_expiration_time']

In [20]:
loans_filtered = loans_df[columns_of_interest]

In [21]:
loans_filtered.head()

Unnamed: 0,loan_id,disburse_time,planned_expiration_time
0,657307,2013-12-22 08:00:00.000 +0000,2014-02-14 03:30:06.000 +0000
1,657259,2013-12-20 08:00:00.000 +0000,2014-03-26 22:25:07.000 +0000
2,658010,2014-01-09 08:00:00.000 +0000,2014-02-15 21:10:05.000 +0000
3,659347,2014-01-17 08:00:00.000 +0000,2014-02-21 03:10:02.000 +0000
4,656933,2013-12-17 08:00:00.000 +0000,2014-02-13 06:10:02.000 +0000


In [22]:
loans_filtered.tail()

Unnamed: 0,loan_id,disburse_time,planned_expiration_time
1419602,988180,2015-11-23 08:00:00.000 +0000,2016-01-02 01:00:03.000 +0000
1419603,988213,2015-11-24 08:00:00.000 +0000,2016-01-02 16:40:07.000 +0000
1419604,989109,2015-11-13 08:00:00.000 +0000,2016-01-03 22:20:04.000 +0000
1419605,989143,2015-11-03 08:00:00.000 +0000,2016-01-05 08:50:02.000 +0000
1419606,989240,2015-11-03 08:00:00.000 +0000,2016-01-03 20:50:06.000 +0000


Let's have a look at the two variables:

In [23]:
loans_filtered.disburse_time.describe()

count                           1416794
unique                            75668
top       2017-02-01 08:00:00.000 +0000
freq                               2800
Name: disburse_time, dtype: object

In [24]:
loans_filtered.planned_expiration_time.describe()

count                           1047773
unique                           528035
top       2017-07-20 04:34:06.000 +0000
freq                                 22
Name: planned_expiration_time, dtype: object

They are seen as a generic object from Pandas even though they are dates.
How many NAs are there?

In [25]:
loans_filtered.disburse_time.isna().sum()

2813

In [26]:
loans_filtered.planned_expiration_time.isna().sum()

371834

In [27]:
loans_filtered['diff_expiration_disburse'] = pd.to_datetime(loans_filtered.planned_expiration_time) - pd.to_datetime(loans_filtered.disburse_time)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
loans_filtered.head()

Unnamed: 0,loan_id,disburse_time,planned_expiration_time,diff_expiration_disburse
0,657307,2013-12-22 08:00:00.000 +0000,2014-02-14 03:30:06.000 +0000,53 days 19:30:06
1,657259,2013-12-20 08:00:00.000 +0000,2014-03-26 22:25:07.000 +0000,96 days 14:25:07
2,658010,2014-01-09 08:00:00.000 +0000,2014-02-15 21:10:05.000 +0000,37 days 13:10:05
3,659347,2014-01-17 08:00:00.000 +0000,2014-02-21 03:10:02.000 +0000,34 days 19:10:02
4,656933,2013-12-17 08:00:00.000 +0000,2014-02-13 06:10:02.000 +0000,57 days 22:10:02


In [29]:
loans_filtered.describe()

Unnamed: 0,loan_id,diff_expiration_disburse
count,1419607.0,1044962
mean,723371.3,52 days 02:04:44.926735
std,415676.6,29 days 14:35:07.308709
min,84.0,-138 days +08:24:08
25%,364216.5,42 days 13:50:02
50%,724035.0,52 days 12:00:02
75%,1082972.0,61 days 19:50:01
max,1444085.0,1673 days 07:07:46


In [30]:
loans_filtered.diff_expiration_disburse.isna().sum()

374645

Is the number plausible? It should be less or equal than the number of NAs in the two columns:

In [31]:
loans_filtered.disburse_time.isna().sum() + loans_filtered.planned_expiration_time.isna().sum()

374647

Apparently yes! It means that in 2 scenarios both planned_expiration_time and disburse_time where NAs. Let's see where:

In [32]:
loans_filtered[loans_filtered[['disburse_time', 'planned_expiration_time']].isna().all(axis=1)]

Unnamed: 0,loan_id,disburse_time,planned_expiration_time,diff_expiration_disburse
423734,68814,,,NaT
1129851,71582,,,NaT


To do:
- Add the computed column to the loans_lenders dataframe
## 3. Find the lenders that have funded at least twice.

In [33]:
funding_freq = loans_lenders_df.groupby('lenders').lenders.count()
funding_freq

lenders
 000               39
 00000             39
 0002              70
 00mike00           1
 0101craign0101    71
                   ..
zzanita             2
zzcyna7269          1
zzinnia             1
zzmcfate           56
zzrvmf8538          2
Name: lenders, Length: 1639026, dtype: int64

In [34]:
funding_freq = funding_freq.to_frame()

In [35]:
funding_freq

Unnamed: 0_level_0,lenders
lenders,Unnamed: 1_level_1
000,39
00000,39
0002,70
00mike00,1
0101craign0101,71
...,...
zzanita,2
zzcyna7269,1
zzinnia,1
zzmcfate,56


In [36]:
funding_freq[funding_freq.lenders >= 2]

Unnamed: 0_level_0,lenders
lenders,Unnamed: 1_level_1
000,39
00000,39
0002,70
0101craign0101,71
0132575,4
...,...
zyrorl,3
zzaman,11
zzanita,2
zzmcfate,56


## 4. For each country, compute how many loans have involved that country as borrowers.

In [37]:
loans_df.columns

Index(['loan_id', 'loan_name', 'original_language', 'description',
       'description_translated', 'funded_amount', 'loan_amount', 'status',
       'activity_name', 'sector_name', 'loan_use', 'country_code',
       'country_name', 'town_name', 'currency_policy',
       'currency_exchange_coverage_rate', 'currency', 'partner_id',
       'posted_time', 'planned_expiration_time', 'disburse_time',
       'raised_time', 'lender_term', 'num_lenders_total',
       'num_journal_entries', 'num_bulk_entries', 'tags', 'borrower_genders',
       'borrower_pictured', 'repayment_interval', 'distribution_model'],
      dtype='object')

Let's filter, once again, the loans dataset keeping only the columns of interest:
- loan_id
- country_code
- country_name

In [38]:
loans_filtered['country_code'] = loans_df['country_code']
loans_filtered['country_name'] = loans_df['country_name']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [39]:
loans_filtered.head()

Unnamed: 0,loan_id,disburse_time,planned_expiration_time,diff_expiration_disburse,country_code,country_name
0,657307,2013-12-22 08:00:00.000 +0000,2014-02-14 03:30:06.000 +0000,53 days 19:30:06,PH,Philippines
1,657259,2013-12-20 08:00:00.000 +0000,2014-03-26 22:25:07.000 +0000,96 days 14:25:07,HN,Honduras
2,658010,2014-01-09 08:00:00.000 +0000,2014-02-15 21:10:05.000 +0000,37 days 13:10:05,PK,Pakistan
3,659347,2014-01-17 08:00:00.000 +0000,2014-02-21 03:10:02.000 +0000,34 days 19:10:02,KG,Kyrgyzstan
4,656933,2013-12-17 08:00:00.000 +0000,2014-02-13 06:10:02.000 +0000,57 days 22:10:02,PH,Philippines


In [40]:
loans_filtered.country_name.value_counts()

Philippines         285336
Kenya               143699
Peru                 86000
Cambodia             79701
El Salvador          64037
                     ...  
Uruguay                  1
Botswana                 1
Papua New Guinea         1
Mauritania               1
Canada                   1
Name: country_name, Length: 96, dtype: int64

##  5. For each country, compute the overall amount of money borrowed.

In [41]:
loans_filtered['loan_amount'] = loans_df['loan_amount']
loans_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,loan_id,disburse_time,planned_expiration_time,diff_expiration_disburse,country_code,country_name,loan_amount
0,657307,2013-12-22 08:00:00.000 +0000,2014-02-14 03:30:06.000 +0000,53 days 19:30:06,PH,Philippines,125.0
1,657259,2013-12-20 08:00:00.000 +0000,2014-03-26 22:25:07.000 +0000,96 days 14:25:07,HN,Honduras,400.0
2,658010,2014-01-09 08:00:00.000 +0000,2014-02-15 21:10:05.000 +0000,37 days 13:10:05,PK,Pakistan,400.0
3,659347,2014-01-17 08:00:00.000 +0000,2014-02-21 03:10:02.000 +0000,34 days 19:10:02,KG,Kyrgyzstan,625.0
4,656933,2013-12-17 08:00:00.000 +0000,2014-02-13 06:10:02.000 +0000,57 days 22:10:02,PH,Philippines,425.0


In [42]:
loans_filtered.groupby('country_name').loan_amount.sum()

country_name
Afghanistan        1967950.0
Albania            4307350.0
Armenia           22950475.0
Azerbaijan        14784625.0
Belize              150175.0
                     ...    
Vietnam           24681100.0
Virgin Islands       10000.0
Yemen              3444000.0
Zambia             1978975.0
Zimbabwe           5851875.0
Name: loan_amount, Length: 96, dtype: float64