In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns #Graphs
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the input directory
input_files = os.listdir("../input")
print(input_files)

# Note: anything written to the current directory is kept as notebook output.

# Read the Kiva loans CSV into the main DataFrame used by the cells below
df = pd.read_csv('../input/kiva_loans.csv')

In [None]:
# Overview of the dataset: column summary from info(), then the first three rows
df.info()
df.head(3)

In [None]:
# Scatter plot over every loan: funded_amount on x, loan_amount on y
sns.relplot(
    data=df,
    x='funded_amount',
    y='loan_amount',
    kind='scatter',
    color='#ffa500',
)
plt.show()

# funded_amount is that which is provided by Kiva. So, the expectation would have been that the amount that gets loaned
# is always equal to funded_amount. But that does not seem to be the case.
# 1. It would be important to note this factor and make our model learn this as a subgroup
# 2. Remove the really far off outliers that skew the parameter pair

In [None]:
# Sampling from one of the areas where funded_amount != loan_amount,
# restricted to rows with loan_amount > 0 and funded_amount < 20000
condition1 = (df['loan_amount'] > 0) & (df['funded_amount'] < 20000)
condition2 = (df['loan_amount'] != df['funded_amount'])
df2 = df[condition1 & condition2]

# Visualizing this sample
sns.relplot(kind = 'scatter', x = 'funded_amount', y = 'loan_amount', data = df2, color = '#ffa500')
plt.show()

# Preview of the sample. It has to be the cell's last expression: a bare
# expression in the middle of a cell is evaluated but never displayed.
# Only a handful of rows are shown to keep the output readable.
df2.head(10)

In [None]:
# Reference picture: keep only the rows whose funded_amount and loan_amount
# match exactly, then draw the same scatter plot for that subset
condition1 = df['loan_amount'].eq(df['funded_amount'])
df3 = df.loc[condition1]
sns.relplot(
    data=df3,
    x='funded_amount',
    y='loan_amount',
    kind='scatter',
    color='#ff1500',
)
plt.show()

In [None]:
# Getting Dates out: build df4 = df plus a leading 'Year' column that holds
# the text before the first '-' of the 'date' column (kept as a string).
years = df['date']
years1 = years.str.split('-', expand = True)

# Work on a copy so df itself is left untouched, and insert only the one
# column that is needed. This avoids index-merging the whole dataset with the
# split frame and then dropping / renaming the leftover columns in place.
df4 = df.copy()
df4.insert(0, 'Year', years1[0])

df4[:3]

In [None]:
# Line plot per amount column against Year (no confidence band):
# first funded_amount, then loan_amount, each in its own figure
for amount_col in ('funded_amount', 'loan_amount'):
    sns.relplot(kind='line', x='Year', y=amount_col, data=df4, ci=None)
plt.show()
# So, we know that both funded_amount and loan_amount have gone down from 2014 to 2017

In [None]:
# Strip plot (no jitter) of funded_amount for the Food and Transportation
# sectors only, with points coloured by Year
sns.catplot(
    data=df4,
    kind='strip',
    x='funded_amount',
    y='sector',
    hue='Year',
    order=['Food', 'Transportation'],
    jitter=False,
)

In [None]:
# Horizontal bar plot: one bar per sector summarising funded_amount
sns.catplot(
    data=df4,
    kind='bar',
    x='funded_amount',
    y='sector',
)

In [None]:
# Number of rows (loans) in each sector, drawn as horizontal bars
sns.catplot(
    data=df4,
    kind='count',
    y='sector',
)

In [None]:
# Point plot of funded_amount per sector, with one line per Year
sns.catplot(
    data=df4,
    kind='point',
    x='sector',
    y='funded_amount',
    hue='Year',
)

In [None]:
# Count of rows (loans) per sector, with one facet row per Year.
# catplot is a figure-level function: it always builds its own figure, so a
# figure created beforehand with plt.subplots(figsize=...) stays empty and its
# figsize is never applied. The facet size is set through height / aspect
# instead: each facet is `height` inches tall and `height * aspect` inches wide
# (here 4 x 10).
sns.catplot(kind = 'count', y = 'sector', row = 'Year', data = df4, height = 4, aspect = 2.5)