# Microlending Competitiveness from `google.co.za`

In [None]:
#install dependencies (python 2.7)
#!conda install -n python2 google

In [1]:
import google
from urlparse import urlparse  # easy extraction of hostname from URL
import pandas as pd
import numpy as np
from bokeh.charts import Bar, show
from bokeh.io import output_notebook
output_notebook()

In [3]:
# Smoke test: run one small query against google.co.za and print every URL it yields.
sample_hits = google.search(query='micro loans', tld='co.za', num=5, start=0, stop=1)
for hit_url in sample_hits:
    print(hit_url)

https://en.wikipedia.org/wiki/Microcredit
https://www.izweloans.com/about-us/
https://www.moneyshop.co.za/money/micro-loans/
http://atlasfinance.co.za/about-us/
http://www.finbondmutualbank.co.za/Credit.html


## See which sites are getting the most hits
Using a few search terms off the top of my head.  Could weight them if needed.

In [4]:
# Search phrases used to compare sites (all weighted equally).
queries = [
    'micro loans',
    'instant cash',
    'fast loans',
    'short-term loan',
    'need cash',
    'small loan',
    'micro finance',
    'cheap loan',
]

# Number of results requested per query; also the score given to the first hit.
nperq = 20

In [5]:
# Build the query -> {site: score} structure.
# A site's score for a query is based on its position in the result list:
# the first hit earns `nperq` points, the second `nperq - 1`, and so on.
results = {}
for query in queries:
    gsearch = google.search(query=query, tld='co.za', num=nperq, start=0, stop=1)
    sites = [urlparse(url).hostname for url in gsearch][:nperq]
    site_scores = {}
    for idx, site in enumerate(sites):
        # urlparse() reports hostname as None when the URL has no network
        # location; such entries cannot be attributed to a site, so skip them.
        if site is None:
            continue
        # The same host may appear several times in one result list. Keep only
        # its first (best-ranked) position; a plain dict comprehension would let
        # the last, lowest-scoring occurrence overwrite the earlier one.
        if site not in site_scores:
            site_scores[site] = nperq - idx
    results[query] = site_scores

In [6]:
# Arrange results & enrich a bit.
# Rows are sites, columns are queries; a missing value means the site did not
# appear in that query's results.
df = pd.DataFrame(results)
df['SCORE'] = df.sum(axis=1)  # total score per site across all queries
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# Rebinding instead of inplace=True keeps the cell safe to re-run.
df = df.sort_values('SCORE', ascending=False)
df['RANK'] = range(1, len(df) + 1)
# Zero-padded rank + hostname, e.g. "01 www.example.com", so labels sort by rank.
df['RANKURL'] = df['RANK'].astype(str).str.zfill(2) + ' ' + df.index.to_series()
# Each site must appear exactly once, otherwise label lookups by site are ambiguous.
assert df.index.is_unique
df.head(20)

Unnamed: 0,cheap loan,fast loans,instant cash,micro finance,micro loans,need cash,short-term loan,small loan,SCORE,RANK,RANKURL
www.wonga.co.za,,20.0,18.0,,,,18.0,15.0,71,1,01 www.wonga.co.za
www.wannaloan.co.za,,9.0,11.0,,,3.0,19.0,20.0,62,2,02 www.wannaloan.co.za
www.boodle.co.za,,6.0,16.0,,,,7.0,12.0,41,3,03 www.boodle.co.za
www.kathlegocashloans.co.za,,3.0,20.0,,,,16.0,,39,4,04 www.kathlegocashloans.co.za
www.moneyshop.co.za,,,19.0,,18.0,,,,37,5,05 www.moneyshop.co.za
en.wikipedia.org,,,,17.0,20.0,,,,37,6,06 en.wikipedia.org
extracashloan.co.za,,,17.0,,,19.0,,,36,7,07 extracashloan.co.za
www.sacashloans.co.za,,4.0,5.0,,,14.0,,10.0,33,8,08 www.sacashloans.co.za
za.getbucks.com,,8.0,9.0,,,,15.0,,32,9,09 za.getbucks.com
www.absa.co.za,,,,,,,17.0,14.0,31,10,10 www.absa.co.za


In [15]:
# See the importance of query terms compared to each other:
# sum every query's scores over the ten highest-ranked sites.
top10_query_totals = df.head(10)[queries].sum()
p = Bar(
    top10_query_totals,
    ylabel='Query Contribution to TOP10',
    title='~RELEVANCE: Lower => Less like other queries',
)
show(p);

# TOP 20 Loan Websites
With this simple counting model we can pull the most relevant websites (*equally weighted over queries*)

In [17]:
# Rank and total score of the 20 best-scoring sites.
df[['RANK', 'SCORE']].head(20)

Unnamed: 0,RANK,SCORE
www.wonga.co.za,1,71
www.wannaloan.co.za,2,62
www.boodle.co.za,3,41
www.kathlegocashloans.co.za,4,39
www.moneyshop.co.za,5,37
en.wikipedia.org,6,37
extracashloan.co.za,7,36
www.sacashloans.co.za,8,33
za.getbucks.com,9,32
www.absa.co.za,10,31


In [19]:
# Reshape the top-10 sites to long form (one row per site/query pair) so the
# per-query scores can be drawn as a stacked bar per site.
# The query columns are selected by name rather than by position, so this keeps
# working if the number of queries changes or the column order differs.
dfstack = pd.DataFrame({'SCORE': df.head(10)[queries].fillna(0).stack()})
# After reset_index() the unnamed index levels become the columns
# 'level_0' (site) and 'level_1' (query).
dfstack = dfstack.reset_index()
# Label each row with the zero-padded "rank + hostname" string of its site.
dfstack['RANK SITE'] = df.loc[dfstack['level_0'], 'RANKURL'].values
p = Bar(dfstack, label='RANK SITE', values='SCORE',
        stack='level_1', legend='top_right', title='TOP10 Sites')
show(p);

# Which news sites are most relevant?
Take a look while the code is here.

In [20]:
# Build the query -> {site: score} structure for news results (tpe='nws').
# A site's score for a query is based on its position in the result list:
# the first hit earns `nperq` points, the second `nperq - 1`, and so on.
nws_results = {}
for query in queries:
    gsearch = google.search(query=query, tld='co.za', num=nperq, start=0, stop=1, tpe='nws')
    sites = [urlparse(url).hostname for url in gsearch][:nperq]
    site_scores = {}
    for idx, site in enumerate(sites):
        # urlparse() reports hostname as None when the URL has no network
        # location; such entries cannot be attributed to a site, so skip them.
        if site is None:
            continue
        # The same host may appear several times in one result list. Keep only
        # its first (best-ranked) position; a plain dict comprehension would let
        # the last, lowest-scoring occurrence overwrite the earlier one.
        if site not in site_scores:
            site_scores[site] = nperq - idx
    nws_results[query] = site_scores

In [21]:
# Arrange the news results: rows are sites, columns are queries.
dfnws = pd.DataFrame(nws_results)
dfnws['SCORE'] = dfnws.sum(axis=1)  # total score per site across all queries
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# Rebinding instead of inplace=True keeps the cell safe to re-run.
dfnws = dfnws.sort_values('SCORE', ascending=False)
dfnws['RANK'] = range(1, len(dfnws) + 1)
# Zero-padded rank + hostname, e.g. "01 www.example.com", so labels sort by rank.
dfnws['RANKURL'] = dfnws['RANK'].astype(str).str.zfill(2) + ' ' + dfnws.index.to_series()
dfnws.head(10)[['RANK', 'SCORE']]

Unnamed: 0,RANK,SCORE
www.livemint.com,1,68
www.wsj.com,2,32
rapidcityjournal.com,3,26
www.thehindu.com,4,25
www.newsadvance.com,5,24
allafrica.com,6,24
www.mlive.com,7,20
www.bellanaija.com,8,20
news.asiaone.com,9,20
timesofindia.indiatimes.com,10,20


...Seems a bit India-biased; could narrow the focus to local sites.