# Example User Analysis

In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
import cufflinks as cf
import plotly.figure_factory as ff
import plotly.tools as tls

name='LA'
 
plotly.tools.set_credentials_file(username='xxxxxxxx', api_key='xxxxxxxxxxxx')


In [2]:
gradebook = pd.read_csv('Data\gradebook.csv', low_memory=False)
users = pd.read_csv('Data\\users.csv')
users = users.set_index('imperial_user_id')
country_codes = pd.read_csv('../../../Data/ISO_country_codes.csv')
country_codes = country_codes.set_index('alpha2')
## this sets the index of  the country_codes table to the two letter code, making it easy to join with the 'country_cd' column in df

In [3]:
### Join the user table to the gradebook so that we can link attainment at each stage of the module to demographics
df = gradebook.join(users, 'Anonymized Coursera ID (imperial_user_id)', how = 'left')

### Strip out unnecessary info
df.columns = df.columns.str.replace('Assessment Grade: ','')

cols = [c for c in df.columns if c.lower()[:10] != 'submission']
df = df[cols]

### Pull in additional country information from ISO data so that we have full country name and alpha3 code which is required for the worldview Plotly map
df =  df.join(country_codes,'country_cd', how = 'left')
total = df['Anonymized Coursera ID (imperial_user_id)'].nunique()
print(total)
### simply for easy viewing of fields:
pretty = df.iloc[1].transpose()
pretty

20979


Anonymized Coursera ID (imperial_user_id)                   00d5cf8bd7552ae649633a2d13fd0871e9c15de6
Solving some simultaneous equations                                                              100
Exploring parameter space                                                                    71.4286
Doing some vector operations                                                                     100
Dot product of vectors                                                                           100
Changing basis                                                                                   100
Linear dependency of a set of vectors                                                            100
Vector operations assessment                                                                     NaN
Using matrices to make transformations                                                           100
Solving linear equations using the inverse matrix                                          

In [4]:
def countries (df, column):
    
    df1 = df.groupby('alpha3').count()

    data = [dict(
            type = 'choropleth',
            locations = df1.index.values,
            z = df1[column],
            text = df1[column]/total*100,
            colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
                [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
            autocolorscale = False,
            reversescale = True,
            marker = dict(
                line = dict (
                    color = 'rgb(180,180,180)',
                    width = 0.5
                ) ),
            colorbar = dict(
                autotick = False),
          ) ]

    layout = dict(
        geo = dict(
            showframe = False,
            showcoastlines = False,
            projection = dict(
                type = 'Mercator'
            )
        )
    )

    fig = dict( data=data, layout=layout )
    plot = py.iplot( fig, validate=False, filename=name + 'learner-world-map2' )
    
    return plot



In [5]:
def countries_ratio (df, column1, column2):
    
    df2 = df.groupby('alpha3').count()
  
    data = [dict(
            type = 'choropleth',
            locations = df2.index.values,
            z = df2[column1]/df2[column2]*100,
            colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
                [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
            autocolorscale = False,
            reversescale = True,
            marker = dict(
                line = dict (
                    color = 'rgb(180,180,180)',
                    width = 0.5
                ) ),
            colorbar = dict(
                autotick = False),
          ) ]

    layout = dict(
        geo = dict(
            showframe = False,
            showcoastlines = False,
            projection = dict(
                type = 'Mercator'
            )
        )
    )

    fig = dict( data=data, layout=layout )
    plot = py.iplot( fig, validate=False, filename=name + 'learner-world-map2' )
    
    return plot



In [6]:
def progress(df, column,x):

    df3 = df.drop(columns = ['Course Grade', 'Course Passed', 'Completed with CC'])
    
    df3 = df3.groupby(column).count()
    df3 = df3[df3['Anonymized Coursera ID (imperial_user_id)'] > total*(x/100)]
#    print (df3.iloc[0])
    
    progress = df3.transpose()
    progress = progress[:-11]

    breakdown = progress.iloc[0]
    print (breakdown)
    
#     plot1 = breakdown.iplot(kind='bar', sharing='private')

    progress = (progress/progress.iloc[0]) * 100
    plot = progress.iplot(kind='line', sharing='private')
    return plot



In [7]:
def learningCurve(df, column, x):
    
#    df3 = df.groupby(column).count()
#    df3 = df3[df3['Anonymized Coursera ID (imperial_user_id)'] > total*(x/100)]
#    df3 = df3.iloc[0]

    df = df.drop(columns = ['Course Grade', 'Course Passed', 'Completed with CC'])
    df = df.groupby(column).mean()
    
    progress = df.transpose()
    progress = progress[:-1]

    breakdown = progress.iloc[1]
    print (breakdown)
    
#     plot1 = breakdown.iplot(kind='bar', sharing='private')

    plot = progress.iplot(kind='line', sharing='private')
    return plot



In [8]:
countries (df, 'Anonymized Coursera ID (imperial_user_id)')

In [9]:
countries_ratio (df, 'Eigenvalues and eigenvectors','Anonymized Coursera ID (imperial_user_id)')

In [10]:
progress(df,'browser_language_cd', 1)

browser_language_cd
en        1924
en-GB     2062
en-IN      502
en-US    11543
en-us      970
es-ES      218
ru-RU      257
zh-CN      743
Name: Anonymized Coursera ID (imperial_user_id), dtype: int64


In [11]:
learningCurve(df,'educational_attainment',1)

educational_attainment
ASSOCIATE_DEGREE                 66.326531
BACHELOR_DEGREE                  78.970719
COLLEGE_NO_DEGREE                79.629630
DOCTORATE_DEGREE                 82.815735
HIGH_SCHOOL_DIPLOMA              73.896104
LESS_THAN_HIGH_SCHOOL_DIPLOMA    71.957672
MASTERS_DEGREE                   77.603687
PROFESSIONAL_DEGREE              81.184669
Name: Exploring parameter space, dtype: float64


In [12]:
progress(df,'reported_or_inferred_gender')

TypeError: progress() missing 1 required positional argument: 'x'

In [None]:
learningCurve(df,'reported_or_inferred_gender')

In [None]:
progress(df,'browser_language_cd',1)

In [None]:
learningCurve(df,'browser_language_cd')