# GMPH User Analysis


In [145]:
from datetime import datetime as dt
time = dt.now
import numpy as np
import pandas as pd
import matplotlib as plt
import plotly.offline as py
import plotly.graph_objs as go

import IPython.core.display as di

# Script injected into the output: when the page is NOT the live notebook app (i.e. an HTML export,
# where `body.notebook_app` is absent) it toggles code inputs and prompts so they start hidden.
_HIDE_CODE_SCRIPT = '<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>'

# Button that lets a reader of the HTML export toggle the visibility of code inputs and prompts.
_TOGGLE_CODE_BUTTON = '''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Show / Hide Code</button>'''

di.display_html(_HIDE_CODE_SCRIPT, raw=True)
di.display_html(_TOGGLE_CODE_BUTTON, raw=True)

# Initialise plotly's notebook mode before any figure is drawn.
py.init_notebook_mode(connected=True)


In [135]:
# Load the raw tables. The names on the left are assigned in the same order as the paths below.
users, courses, course_memberships, country_codes = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/users.csv',
        '../Data/courses.csv',
        '../Data/course_memberships.csv',
        '../Data/ISO_country_codes.csv',
    )
)

# No index is set on any of these frames: `users` keeps 'imperial_user_id' as an ordinary column,
# and country_codes is joined later on its two-letter 'alpha-2' column (matched against the
# 'country_cd' column of the user table).

### Courses included in analysis

In [136]:
# Keep only the courses whose launch timestamp is present.
courses = courses.dropna(subset=['course_launch_ts'])

# Course ids selected for the GMPH analysis.
GMPHlist = [
    'HStXPFUPEeijOhIn2e06zA',
    'wYVFrFUOEeiXDgqeSsw0yA',
    '_o2ptFUOEeiN5g6qr8gNVg',
    'HIWhL1UPEeiYKRJKvk-prg',
    'hSFRplRqEeiPZBL0QYL46A',
    'ApVo5lRrEeiGKxInOEegGg',
    'Rtiw71RrEeiuag4cjofrng',
]
GMPHcourses = courses.loc[courses['course_id'].isin(GMPHlist)]

# Display the selected courses ordered by their launch timestamp.
GMPHcourses[['course_name', 'course_launch_ts']].sort_values('course_launch_ts')

Unnamed: 0,course_name,course_launch_ts
11,Survival Analysis in R for Public Health,2019-01-18 19:51:50.908
34,Introduction to Statistics & Data Analysis in ...,2019-01-18 20:33:23.414
24,Linear Regression in R for Public Health,2019-01-18 20:35:31.551
47,Logistic Regression in R for Public Health,2019-01-18 20:37:18.675
10,Measuring Disease in Epidemiology,2019-02-22 17:13:18.88
41,Study Designs in Epidemiology,2019-02-22 17:16:54.621
0,Validity and Bias in Epidemiology,2019-02-22 17:18:56.697


In [140]:
### Pull in additional country information from ISO data so that we have full country name and alpha3 code which is required for the worldview Plotly map
# `totalusers` is the user table joined to course memberships and to the ISO country table
# (inner joins, so rows without a membership or without a matching 'alpha-2' code are dropped).
# `users` is deliberately NOT overwritten: merging an already-merged frame again would create
# suffixed duplicate columns and break the joins below whenever this cell is re-run.
totalusers = (
    users
    .merge(course_memberships, on='imperial_user_id')
    .merge(country_codes, left_on='country_cd', right_on='alpha-2')
)

# Restrict the joined table to memberships of the selected GMPH courses.
GMPHusers = totalusers.merge(GMPHcourses, on='course_id')

# Distinct user counts: across every course, and across the GMPH courses only.
total = totalusers['imperial_user_id'].nunique()
GMPHtotal = GMPHusers['imperial_user_id'].nunique()
print('Total users included in analysis: ' + str(GMPHtotal))

### simply for easy viewing of fields:
# Second row of the joined table as a Series (field name -> value).
pretty = totalusers.iloc[1]

# Per-country non-null counts of every column, for all courses and for GMPH courses.
totalcountries = totalusers.groupby('alpha-3').count()
GMPHcountries = GMPHusers.groupby('alpha-3').count()


Total users included in analysis: 1476


In [70]:
# 134462 all courses
# 2714 GMPH

def countries(df, column, divide_by=None, count_unique=True):
    """Draw an orthographic choropleth of per-country counts of ``column``.

    The frame is grouped by its 'alpha-3' column. The colour value (``z``) of each
    country is the count of ``column`` in that group, and the hover ``text`` is that
    count expressed as a percentage of ``divide_by``, rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'alpha-3' column and ``column``.
    column : str
        Column to count within each country, e.g. 'imperial_user_id'.
    divide_by : number, optional
        Denominator for the hover percentage. When omitted, the notebook-level
        ``total`` is looked up at call time.
    count_unique : bool, default True
        Count distinct values of ``column`` per country, so that an id appearing on
        several rows (for example one row per course membership) is counted once
        and the numerator matches a distinct-user denominator. Pass ``False`` to
        count non-null rows instead.

    Returns
    -------
    Whatever ``py.iplot`` returns for the figure.
    """
    if divide_by is None:
        # Resolve the default when the function is called, not when the cell defining it was
        # executed, so the function can be defined before `total` and never uses a stale value.
        divide_by = total

    grouped = df.groupby('alpha-3')[column]
    counts = grouped.nunique() if count_unique else grouped.count()
    share_pct = (counts / divide_by * 100).round(2)

    data = [dict(
        type='choropleth',
        locations=counts.index.values,
        z=counts,
        text=share_pct,
        colorscale=[
            [0, "rgb(5, 10, 172)"],
            [0.35, "rgb(40, 60, 190)"],
            [0.5, "rgb(70, 100, 245)"],
            [0.6, "rgb(90, 120, 245)"],
            [0.7, "rgb(106, 137, 247)"],
            [1, "rgb(220, 220, 220)"],
        ],
        autocolorscale=False,
        reversescale=True,
        marker=dict(
            line=dict(
                color='rgb(180,180,180)',
                width=0.5,
            ),
        ),
    )]

    layout = dict(
        width=1000,
        height=1000,
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection=dict(type='orthographic'),
        ),
    )

    fig = dict(data=data, layout=layout)
    return py.iplot(fig)



In [117]:
def countries_ratio(df, var1, var2, normaliser):
    """Draw a Mercator choropleth coloured by a scaled ratio of two quantities.

    The colour value of each location is ``(var1 / var2) * normaliser * 100``
    rounded to two decimals. ``df`` is used only for its index, which supplies the
    trace's ``locations`` (presumably ISO alpha-3 codes -- confirm with the caller).

    Returns whatever ``py.iplot`` returns for the figure.
    """
    palette = [
        [0.0, 'rgb(165,0,38)'],
        [0.1111111111111111, 'rgb(215,48,39)'],
        [0.2222222222222222, 'rgb(244,109,67)'],
        [0.3333333333333333, 'rgb(253,174,97)'],
        [0.4444444444444444, 'rgb(254,224,144)'],
        [0.5555555555555556, 'rgb(224,243,248)'],
        [0.6666666666666666, 'rgb(160,160,160)'],
        [1.0, 'rgb(120,120,120)'],
    ]
    border = {'color': 'rgb(180,180,180)', 'width': 0.5}

    trace = {
        'type': 'choropleth',
        'locations': df.index.values,
        'z': round((var1 / var2) * normaliser * 100, 2),
        'colorscale': palette,
        'autocolorscale': False,
        'reversescale': True,
        'marker': {'line': border},
    }

    geo = {
        'showframe': False,
        'showcoastlines': False,
        'projection': {'type': 'mercator'},
    }
    figure = {
        'data': [trace],
        'layout': {'width': 1000, 'height': 1000, 'geo': geo},
    }

    return py.iplot(figure)



# Hover to see total users per country (top) and as % of all GMPH users (bottom)

In [133]:
# World map built from GMPHusers: counts of 'imperial_user_id' per country, with the hover text
# showing each country's count as a percentage of GMPHtotal.
countries(df=GMPHusers, column='imperial_user_id', divide_by=GMPHtotal)