# PCA Initial Analysis

In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
import cufflinks as cf
import plotly.figure_factory as ff
import plotly.tools as tls

 
import os

# Read the Plotly credentials from the environment so that real keys never end up
# in the notebook source or its version history. A missing variable raises KeyError
# here, before any plotting call is attempted.
plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)


In [3]:
# Load the course gradebook (df) and the programming-submission log (df2).
# Forward slashes are accepted on every OS, and they avoid the invalid "\P" / "\p"
# escape sequences that the backslash-separated literals contained.
df = pd.read_csv('Data/PCA-gradebook.csv', low_memory=False)
df2 = pd.read_csv('Data/programming_submissions.csv', low_memory=False)

# Drop 'Completed with CC'; the positional column selection performed later in the
# notebook operates on the frame without this column.
df = df.drop(columns='Completed with CC')

print(df.shape)
print(df2.shape)


(6535, 31)
(6679, 7)


In [4]:
# Remove the 'Assessment Grade: ' prefix from every column header.
df.columns = df.columns.map(lambda header: header.replace('Assessment Grade: ', ''))

In [5]:
# Keep columns 0 and 1 plus every second column from position 3 through 27.
gradebook = df.iloc[:, [0, 1] + list(range(3, 28, 2))]
print(gradebook.shape)
gradebook.head()


(6535, 15)


Unnamed: 0,Anonymized Coursera ID (imperial_user_id),Mean of datasets,Variance of 1D datasets,Covariance matrix of a two-dimensional dataset,Mean/covariance of a dataset + effect of a linear transformation,Dot product,Properties of inner products,General inner products: lengths and distances,Angles between vectors using a non-standard inner product,Inner products and angles,Projection onto a 1-dimensional subspace,Project 3D data onto a 2D subspace,Orthogonal projections,Chain rule practice,PCA
0,00d5cf8bd7552ae649633a2d13fd0871e9c15de6,,,,,,,,,,,,,,
1,0551a9cfaecc2d631359dc2daea932ccb686ea1b,,,,,,,,,,,,,,
2,0aecdf533015d86a0f5def4c5c2a5e8891cd5096,,,,,,,,,,,,,,
3,133626ba4c5f9f15a21f892ee83782f8375b55fd,,,,,,,,,,,,,,
4,16d025f714693a4bc010ce12b9766a36cf92f0f1,,,,,,,,,,,,,,


## Course funnel
Out of a total reach of 21,252 potential learners who have seen the PCA homepage, 30% (6535) started the course. 5% of these people went on to complete the course.

This graph describes the funnel from users seeing the course, to enrolling, to the desired outcome: passing the course.

### We can see from this graph that 91% of those who complete the course will pass

In [7]:
# Funnel stage sizes: page reach -> enrolled -> non-null 'PCA' grade -> passed.
reach = 21252
starters = len(df)
completers = df['PCA'].count()

# Number of rows per 'Course Passed' value; label 1 is read as passed, label 0 as failed.
pass_fail = df['Course Passed'].groupby(df['Course Passed']).count()
passed, failed = pass_fail[1], pass_fail[0]

# Hover text: each stage as a rounded percentage of the (numerator, denominator) pair.
funnel = [
    go.Bar(
        x=["reach", "enrolled learners", "completers", "passed"],
        y=[reach, starters, completers, passed],
        text=[
            round((part / whole) * 100)
            for part, whole in (
                (reach, reach),
                (starters, reach),
                (completers, starters),
                (passed, completers),
            )
        ],
    )
]

# Log-scaled y axis, since the stage sizes differ by orders of magnitude.
layout = {
    'title': 'Course funnel',
    'xaxis': {'title': 'Stage'},
    'yaxis': {'title': 'Total numbers', 'type': 'log', 'autorange': True},
}

fig = go.Figure(data=funnel, layout=layout)
py.iplot(fig, filename='bar-funnel', kind='bar', sharing='private')

## Progression in course

So where are the 95% dropping out between enrolment and completing the course? 

This graph shows the number of people who submitted solutions for each assignment. Only one submission is counted per user, so this measure tells us the number of users who attempted each assignment at least once. Hover over the points to see the total number of students who attempted each assignment (top) and that number as a percentage of all enrolled students (bottom).

Overall, 5% of enrolled students attempt the final assessment. 84% of enrolled users drop off before the first assessment.

### Once learners have completed their 4th assignment they have a 70% chance of completing the course, and therefore a 63% chance of passing the course. 



In [8]:
# Non-null entries per gradebook column, i.e. how many learners have a recorded
# value for each column.
# NOTE(review): gradebook also contains the row-index and learner-ID columns, so the
# first two points are presumably the full enrolment count — confirm this is intended.
progress = gradebook.count()

# Hover text is each count as a rounded percentage of all gradebook rows. The
# denominator is derived from the data instead of a hard-coded row count, so it
# stays correct if the input file changes.
trace2 = go.Scatter(
    x = progress.index.values,
    y = progress,
    mode = 'lines+markers',
    name = 'Number of unique submissions',
    text = round((progress / len(gradebook)) * 100)
)

# Log-scaled y axis so the small late-course counts remain readable.
layout = dict(title = 'Progression',
             xaxis = dict(title = 'Assessments'),
             yaxis = dict(title = 'Number of unique submissions',
                         type = 'log',
                         autorange = True)
             )

fig1 = go.Figure(data=[trace2], layout=layout)
py.iplot(fig1, filename='progression', sharing = 'private')

## Learning curves
The graph below shows the average top grade submitted by learners from two cohorts: those who eventually passed and those who eventually failed. People who eventually failed had a similar learning curve (measured by average score per assignment) compared to those who eventually passed.

The learning curves diverge at the final PCA assignment. Further analysis is required to investigate this.


In [9]:
# To compare the learning curves of learners who passed with those who failed,
# group the gradebook by 'Course Passed' (0 = fail, 1 = pass) and average each column.
# numeric_only=True skips text columns such as the learner ID; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them as older versions did.
# After the transpose, rows are the averaged columns and columns are the two groups.
split_lcurve = df.groupby(['Course Passed']).mean(numeric_only=True).transpose()
#split_lcurve.iplot(kind='line',sharing='private')

In [10]:
# One line per cohort; the first column of split_lcurve is drawn as 'Failed' and the
# second as 'Passed'.
trace0 = go.Scatter(x=split_lcurve.index.values,
                    y=split_lcurve.iloc[:, 0],
                    mode='lines+markers',
                    name='Failed')

trace1 = go.Scatter(x=split_lcurve.index.values,
                    y=split_lcurve.iloc[:, 1],
                    mode='lines+markers',
                    name='Passed')

layout = {
    'title': 'Learning Curves',
    'xaxis': {'title': 'Assessments'},
    'yaxis': {'title': 'Average grade'},
}

# The 'Passed' trace is listed first, so it comes first in the legend.
data = [trace1, trace0]
fig3 = go.Figure(data=data, layout=layout)
py.iplot(fig3, filename='scatter-mode', sharing='private')

## Programming submissions

### 4. PCA

In [11]:
# Number of programming submissions per student for one assignment - how can we
# reduce this number?
# Keep only rows whose assignment id contains the given fragment. regex=False treats
# the fragment as literal text, and na=False excludes rows with a missing id instead
# of producing a mask that cannot be used for indexing.
d = df2[df2['programming_assignment_id'].str.contains("c2AdQRBEeiS7A4CoMRj0A", na=False, regex=False)]

# Non-null submission ids per learner = submissions made by that learner.
d1 = d.groupby('imperial_user_id').count()
x = d1.programming_submission_id

# histnorm='probability' plots the fraction of learners per bin rather than raw counts.
data4 = [go.Histogram(x=x,
                     histnorm='probability')]
# Use a filename that no other plot in this notebook uses: uploading under a shared
# filename overwrites whichever plot was stored there before.
py.iplot(data4, filename='pca-submissions-histogram', sharing = 'private')

In [14]:
# Number of programming submissions per student across every assignment in df2.
d2 = df2.groupby('imperial_user_id').count()
x = d2.programming_submission_id

# histnorm='probability' plots the fraction of learners per bin rather than raw counts.
data5 = [go.Histogram(x=x,
                     histnorm='probability')]
# Use a filename that no other plot in this notebook uses: uploading under a shared
# filename overwrites whichever plot was stored there before.
py.iplot(data5, filename='all-submissions-histogram', sharing = 'private')

## Mean scores for all assignments

In [12]:
# Average each numeric column of the submission log per learner.
# numeric_only=True skips text columns such as the assignment id; pandas >= 2.0 raises
# a TypeError on them instead of silently dropping them as older versions did.
mean_score = df2.groupby('imperial_user_id').mean(numeric_only=True)
x = mean_score.programming_submission_score

# Drop learners without a score before building the distribution plot.
hist_data = [x.dropna()]
group_labels = ['distplot']

fig = ff.create_distplot(hist_data, group_labels)
py.iplot(fig, filename='Basic Distplot',sharing = 'private')

### Link to comments and feedback

https://www.coursera.org/teach/pca-machine-learning/analytics/ratings
    