# ADA - project
# Scala MOOC - Extraction of Learning Patterns

## Introduction

### Project description

bla bla bla

### Dataset description
- Forum Events
    * field1...
    * field2...
- Problem Events
- Video Events
- Metadata

### Team
- Victor
- Thibault
- Louis

### Summary

## Libraries

In [None]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
import math

## Configuration

In [None]:
# Notebook-wide settings. 'courseName' is the file-name prefix of the CSV exports
# that the cells below read from ../data/.
config = dict(courseName='progfun-002')

## Data imports

### User Grades

In [None]:
# Load the User_Grades table and keep only the columns used in this notebook.
# .copy() turns the column selection into an independent frame, so that columns
# added to it later do not trigger pandas' SettingWithCopyWarning.
df_User_Grades = pd.read_csv('../data/' + config['courseName'] + '_User_Grades.csv')
df_User_Grades = df_User_Grades[['SessionUserID', 'Grade', 'AchievementLevel']].copy()
df_User_Grades.head()

### Assignment Events

In [None]:
# Load the problem events table. The literal string 'None' is read as a missing
# value and the Grade column is forced to float64.
problem_events_csv = '../data/' + config['courseName'] + '_Problem_Events_with_Info.csv'
df_Problem_Event = pd.read_csv(problem_events_csv, na_values=['None'], dtype={"Grade": np.float64})
df_Problem_Event.head(1)

In [None]:
# Restrict the problem events to the columns used in the rest of the notebook.
problem_columns = ['SessionUserID', 'Grade', 'TimeStamp', 'ProblemID', 'ProblemType', 'EventType']
df_Problem_Event = df_Problem_Event[problem_columns]
print(df_Problem_Event.shape)
df_Problem_Event.head()

In [None]:
# Keep the "Assignment" problems only and discard every row with a missing value.
is_assignment = df_Problem_Event['ProblemType'] == "Assignment"
df_Assignment_Event = df_Problem_Event[is_assignment].dropna()
print(df_Assignment_Event.shape)
# ProblemType is constant after the filter above, so the column is dropped.
df_Assignment_Event = df_Assignment_Event.drop('ProblemType', axis=1)
df_Assignment_Event.head()

In [None]:
# Add a datetime column, reading TimeStamp as a number of seconds since the epoch.
assignment_dates = pd.to_datetime(df_Assignment_Event['TimeStamp'], unit='s')
df_Assignment_Event['Date'] = assignment_dates
df_Assignment_Event.head()

### Video Events

In [None]:
# Load the video events table; the literal string 'None' is read as a missing value.
video_events_csv = '../data/' + config['courseName'] + '_Video_Events.csv'
df_Video_Event = pd.read_csv(video_events_csv, na_values=['None'])
df_Video_Event.head(1)

In [None]:
# Restrict the video events to the columns used in the rest of the notebook.
# .copy() makes the selection an independent frame, so that adding the Date
# column afterwards does not trigger pandas' SettingWithCopyWarning.
df_Video_Event = df_Video_Event[['SessionUserID', 'TimeStamp', 'VideoID', 'EventType']].copy()
print(df_Video_Event.shape)
df_Video_Event.head()

In [None]:
# Add a datetime column, reading TimeStamp as a number of seconds since the epoch.
video_dates = pd.to_datetime(df_Video_Event['TimeStamp'], unit='s')
df_Video_Event['Date'] = video_dates
df_Video_Event.head()

### Forum Events

In [None]:
# Load the forum events table; the literal string 'None' is read as a missing value.
forum_events_csv = '../data/' + config['courseName'] + '_Forum_Events.csv'
df_Forum_Event = pd.read_csv(forum_events_csv, na_values=['None'])
df_Forum_Event.head(1)

In [None]:
# Inspect the SessionUserID column of the forum events.
# NOTE(review): the original remark read "df_Video_Event does not have SessionUserID";
# it presumably meant that df_Forum_Event has no usable SessionUserID values --
# confirm against the output of this cell.
df_Forum_Event['SessionUserID'].head(10)

In [None]:
# Restrict the forum events to the columns used below; users are identified by
# AccountUserID here instead of SessionUserID.
# .copy() makes the selection an independent frame, so that adding the Date
# column afterwards does not trigger pandas' SettingWithCopyWarning.
df_Forum_Event = df_Forum_Event[['AccountUserID', 'TimeStamp', 'EventType']].copy()
print(df_Forum_Event.shape)
df_Forum_Event.head()

In [None]:
# Add a datetime column, reading TimeStamp as a number of seconds since the epoch.
forum_dates = pd.to_datetime(df_Forum_Event['TimeStamp'], unit='s')
df_Forum_Event['Date'] = forum_dates
df_Forum_Event.head()

In [None]:
# Fortunately, the User_Hash_Mapping table gives the mapping between
# AccountUserID and SessionUserID.
user_mapping_csv = '../data/' + config['courseName'] + '_User_Hash_Mapping.csv'
df_User_Mapping = pd.read_csv(user_mapping_csv, na_values=['None'])
df_User_Mapping = df_User_Mapping[['AccountUserID', 'SessionUserID']]
df_User_Mapping.head()

In [None]:
# Attach a SessionUserID to every forum event through the user mapping.
# merge() performs an inner join by default; the shapes printed before and after
# show how the number of rows is affected.
print(df_Forum_Event.shape)
forum_with_session = df_Forum_Event.merge(df_User_Mapping, on='AccountUserID')
df_Forum_Event = forum_with_session[['SessionUserID', 'TimeStamp', 'EventType', 'Date']]
print(df_Forum_Event.shape)
df_Forum_Event.head()

### Concatenation

In [None]:
# Stack the three event tables into a single frame.
# Each source frame is first given its own consecutive, non-overlapping index
# range so that the concatenated frame ends up with a unique 0..N-1 index.
event_frames = [
    ('assignments', df_Assignment_Event),
    ('videos', df_Video_Event),
    ('forums', df_Forum_Event),
]
lengths = {label: frame.shape[0] for label, frame in event_frames}
print(lengths)

x = y = 0
for label, frame in event_frames:
    # [x, y) is the index range reserved for the current frame.
    x, y = y, y + lengths[label]
    frame.index = range(x, y)

df_All_Event = pd.concat([df_Assignment_Event, df_Video_Event, df_Forum_Event])
print(df_All_Event.shape)
df_All_Event.head()

In [None]:
# Split EventType on '.' into its first part (EventTypeMain) and second part
# (EventTypeDetail). The split is done once with the vectorised .str accessor:
# a missing EventType, or one without a '.', yields NaN instead of raising as
# the per-row x.split('.')[1] would.
event_type_parts = df_All_Event.EventType.str.split('.')
df_All_Event['EventTypeMain'] = event_type_parts.str[0]
df_All_Event['EventTypeDetail'] = event_type_parts.str[1]
df_All_Event.head()

In [None]:
# Number of events for each main event type.
df_All_Event['EventTypeMain'].value_counts()

## Data exploration and cleaning

### User Grades

In [None]:
# Number of users for each achievement level.
successCounts = df_User_Grades['AchievementLevel'].value_counts()
successCounts

In [None]:
# Baseline model: predict the mean normalised grade for every user and report
# the resulting root-mean-square error.
# NOTE(review): dividing by 100 assumes Grade is expressed on a 0-100 scale -- confirm.
normalised_grade = df_User_Grades['Grade'] / 100
mean_grade = normalised_grade.mean()

df_User_Grades['NormalisedGrade'] = normalised_grade
df_User_Grades['Prediction'] = mean_grade
# Squared error of the constant prediction, one value per user.
df_User_Grades['SE'] = (df_User_Grades['NormalisedGrade'] - df_User_Grades['Prediction']) ** 2

{
    "MEAN": mean_grade,
    "RMSE": df_User_Grades['SE'].mean() ** 0.5,
}

### Assignment Events

In [None]:
# Distribution of the grades over all assignment events.
df_Assignment_Event['Grade'].hist(bins=25)

In [None]:
# Number of assignment events for each ProblemID.
df_Assignment_Event['ProblemID'].value_counts()

In [None]:
# Remove the events whose ProblemID is 4 or lower (the original note called
# them "1234"), which seem to be fake homeworks used for testing.
print(df_Assignment_Event.shape)
is_real_homework = df_Assignment_Event['ProblemID'] > 4
df_Assignment_Event = df_Assignment_Event[is_real_homework]
print(df_Assignment_Event.shape)

In [None]:
# Distribution over time of the assignment events with a TimeStamp below 1.37e9.
is_before_cutoff = df_Assignment_Event['TimeStamp'] < 1.37e9
df_Assignment_Event.loc[is_before_cutoff, 'TimeStamp'].hist(bins=50)

In [None]:
# Order the assignment events by user, then by time within each user.
assignment_sort_keys = ['SessionUserID', 'TimeStamp']
df_Assignment_Event_sorted = df_Assignment_Event.sort_values(by=assignment_sort_keys)
df_Assignment_Event_sorted.head(10)

In [None]:
# After sorting, the index no longer follows the row order; replace it with a
# fresh 0..N-1 index (the old one is discarded).
df_Assignment_Event_sorted = df_Assignment_Event_sorted.reset_index(drop=True)
df_Assignment_Event_sorted.head(10)

In [None]:
# Aggregation idea taken from:
# http://stackoverflow.com/questions/19530568/can-pandas-groupby-aggregate-into-a-list-rather-than-sum-mean-etc
# Every column is collapsed into one tuple per (user, problem) pair, so that the
# tuples can be processed with something like apply(func) later on.
assignments_by_user_problem = df_Assignment_Event_sorted.groupby(['SessionUserID', 'ProblemID'])
assignments_by_user_problem.agg(lambda column: tuple(column)).head(19)

### Video Events

In [None]:
# Number of video events for each event type.
# TODO: work out what each of these event types means.
df_Video_Event['EventType'].value_counts()

In [None]:
# Number of video events recorded for each SessionUserID, most active first.
counts = df_Video_Event['SessionUserID'].value_counts()
average_video_events = counts.mean()
print("Average number of video event per student: %f" % average_video_events)
counts.head(10)

In [None]:
# List the distinct video IDs in increasing order.
VideoIdList = np.sort(df_Video_Event.VideoID.unique())
print(VideoIdList)
# Histogram with one bin per integer VideoID. The bin edges must run up to
# max + 1: the last bin is closed on both sides, so stopping the edges at max
# would count the two highest IDs in the same bin.
bins = range(df_Video_Event.VideoID.min(), df_Video_Event.VideoID.max() + 2)
df_Video_Event.VideoID.hist(bins=bins)

In [None]:
# Order the video events by user, then by time, and renumber the rows 0..N-1.
video_sort_keys = ['SessionUserID', 'TimeStamp']
df_Video_Event_sorted = df_Video_Event.sort_values(by=video_sort_keys).reset_index(drop=True)
df_Video_Event_sorted.head(5)

### All Events

In [None]:
# Order all events by user, then by time, and renumber the rows 0..N-1.
all_sort_keys = ['SessionUserID', 'TimeStamp']
df_All_Event_sorted = df_All_Event.sort_values(by=all_sort_keys).reset_index(drop=True)
df_All_Event_sorted.head(10)

In [None]:
# Aggregation idea taken from:
# http://stackoverflow.com/questions/19530568/can-pandas-groupby-aggregate-into-a-list-rather-than-sum-mean-etc
# Every column is collapsed into one tuple per user, so that the tuples can be
# processed with something like apply(func) later on.
# Building Python tuples for every user is slow and only the first 50 users are
# displayed, so the events are restricted to those users *before* grouping.
# groupby orders its keys and skips missing ones, hence the sorted, NaN-free list.
preview_users = sorted(df_All_Event_sorted.SessionUserID.dropna().unique())[:50]
preview_events = df_All_Event_sorted[df_All_Event_sorted.SessionUserID.isin(preview_users)]
preview_events.groupby(['SessionUserID']).agg(lambda x: tuple(x)).head(50)
# TODO: decide how to manipulate the resulting tuples afterwards.