In [1]:
import pandas as pd
import numpy as np
import time
from pyspark.sql import SparkSession

# create spark session
spark=SparkSession.builder.appName('explore').getOrCreate()

# load csv into df
filename = 'learning_traces.csv'
df=spark.read.option("header","true").csv(filename)

# view nicely formatted pandas df
#df.limit(10).toPandas()

### Create dim_users table

In [2]:
# create users table... count unique timestamps to find number of sessions
users = (df.select(['user_id', 'timestamp']).drop_duplicates()).groupBy('user_id').count()
dim_users = users.withColumnRenamed('count', 'number_of_sessions')

# view users sorted by number of sessions
#users.orderBy(users.number_of_sessions.desc()).show()

### Create dim_time table

- time table: timestamp, year, month, day, dayofweek, hour, minute, second

- data is stored as a unix epoch (https://www.epochconverter.com)

In [3]:
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from datetime import datetime
from pyspark.sql import functions as f

# create times table
times = df.select('timestamp')
times = times.withColumnRenamed('timestamp', 'epoch')

# convert epochs to timestamps
timestamp = udf(lambda x: str(datetime.fromtimestamp(int(x))))
times = times.withColumn('timestamp', timestamp(times.epoch))

# extract granular times data, store as new columns
get_hour = udf(lambda x: datetime.fromtimestamp(int(x)).hour)
get_day = udf(lambda x: datetime.fromtimestamp(int(x)).day)
get_week = udf(lambda x: datetime.fromtimestamp(int(x)).isocalendar()[1])
get_month = udf(lambda x: datetime.fromtimestamp(int(x)).month)
get_year = udf(lambda x: datetime.fromtimestamp(int(x)).year)
get_weekday = udf(lambda x: datetime.fromtimestamp(int(x)).weekday())
times = times.withColumn('hour', get_hour(times.epoch))
times = times.withColumn('day', get_day(times.epoch))
times = times.withColumn('week', get_week(times.epoch))
times = times.withColumn('month', get_month(times.epoch))
times = times.withColumn('year', get_year(times.epoch))
dim_times = times.withColumn('weekday', get_weekday(times.epoch))

### Import language reference table

In [4]:
import json

# read language reference table into df
lang_filename = 'language-codes-full_json.json'
lang_df = spark.read.json(lang_filename)

### Form dim_langs table

In [5]:
# find languages that are used
learn_langs = df.select(['learning_language']).drop_duplicates()
ui_langs = df.select(['ui_language']).drop_duplicates()

# concat and drop duplicates
used_list = list(learn_langs.toPandas()['learning_language']) + list(ui_langs.toPandas()['ui_language'])
used_list = list(set(used_list))

# create table via Pandas
dim_langs = spark.createDataFrame(pd.DataFrame({'alpha2_code': used_list}))

# join with reference table
dim_langs = dim_langs.join(lang_df, dim_langs.alpha2_code == lang_df.alpha2).drop('French', 'alpha2', 'alpha3-b', 'alpha3-t')

dim_langs.show()

+-----------+------------------+
|alpha2_code|           English|
+-----------+------------------+
|         en|           English|
|         es|Spanish; Castilian|
|         fr|            French|
|         it|           Italian|
|         de|            German|
|         pt|        Portuguese|
+-----------+------------------+



### Form language pairs table for analysts

In [6]:
# form language pairs table
pairs = df.select(['learning_language', 'ui_language']).drop_duplicates()

# match language pairs alpha2 codes with English language names
langs = pairs.join(lang_df, pairs.learning_language == lang_df.alpha2, 'inner').drop('French', 'alpha2', 'alpha3-b', 'alpha3-t')
langs = langs.withColumnRenamed('English', 'learn_lang_name')
langs = langs.join(lang_df, pairs.ui_language == lang_df.alpha2, 'inner').drop('French', 'alpha2', 'alpha3-b', 'alpha3-t')
langs = langs.withColumnRenamed('English', 'ui_lang_name')

### Form language lookup

In [7]:
#lang_lu = lang_df.select(['alpha2', 'English'])
#lang_lu.show()

### Load lexeme reference table

In [8]:
# load txt file containing breakdown of lexeme codes
lex_filename = 'lexeme_reference.txt'
lex_df = spark.read.text(lex_filename)

# make data ingestible... pyspark not able to infer schema as-is
with open(lex_filename) as f:
    lines = f.readlines()
    f.close()
lex_list = []
for line in lines:
    all_split = line.split()
    lex_list.append([all_split[0], all_split[1], ' '.join(all_split[2:])])

# load txt file to dataframe
from pyspark.sql.types import ArrayType, StructField, StructType, StringType
schema = StructType([
    StructField('code', StringType(), True),
    StructField('type', StringType(), True),
    StructField('description', StringType(), True)
])
lex_df = spark.createDataFrame(lex_list,schema)

# filter word types... we only want parts of speech in dimension table
lex_df = lex_df.filter(lex_df.type == 'POS').select(['code', 'description'])
lex_df = lex_df.withColumnRenamed('description', 'part_of_speech')

### Create dim_words table

In [9]:
import re

# create df
words = df.select(['lexeme_id', 'lexeme_string', 'learning_language'])
words = words.withColumnRenamed('learning_language', 'language')
words = words.drop_duplicates()

# extract granular data, create new cols
get_lemma = udf(lambda x: re.search('/(.*?)<', x)[0][1:-1])
get_surface = udf(lambda x: re.search('(.*?)/', x)[0][0:-1])
get_pos = udf(lambda x: re.search('<(.*?)>', x)[0][1:-1])
words = words.withColumn('lemma', get_lemma(words.lexeme_string))
words = words.withColumn('surface', get_surface(words.lexeme_string))
words = words.withColumn('pos', get_pos(words.lexeme_string))
words = words.drop('lexeme_string')

# look-up part of speech, save to table
dim_words = words.join(lex_df, words.pos == lex_df.code, 'left').drop('code', 'pos')

### Create fact_wordviews table

In [23]:
# create word views dataframe
wv_df = df.select(['timestamp', 'user_id', 'learning_language', 'ui_language',
                   'lexeme_id', 'delta', 'history_seen', 'history_correct', 
                   'session_seen', 'session_correct'])

# lookup learning language
#wv_df = wv_df.join(dim_langs, wv_df.learning_language == dim_langs.alpha2_code, 'inner').drop('alpha2_code', 'learning_language')
#wv_df = wv_df.withColumnRenamed('English', 'learning_language')

# lookup ui language
#wv_df = wv_df.join(dim_langs, wv_df.ui_language == dim_langs.alpha2_code, 'inner').drop('alpha2_code', 'ui_language')
#wv_df = wv_df.withColumnRenamed('English', 'ui_language')

# lookup word name (lemma form)
#wv_df = wv_df.join(words, wv_df.lexeme_id == words.lexeme_id, 'left').drop('lexeme_id', 'language', 'surface', 'part_of_speech')

# calculate statistics

percent_correct = udf(lambda x, y: 100 * round(float(x)/float(y), 2))

wv_df = wv_df.withColumn('session_pct', percent_correct(wv_df.session_correct, wv_df.session_seen))
wv_df = wv_df.withColumn('history_pct', percent_correct(wv_df.history_correct, wv_df.history_seen))

# drop granular
wv_df = wv_df.drop('history_seen', 'history_correct', 'session_seen', 'session_correct')

# epoch to timestamp... could have JOIN'd with times table... but better to transform existing column than big join
timestamp = udf(lambda x: str(datetime.fromtimestamp(int(x))))
fact_wordviews = wv_df.withColumn('timestamp', timestamp(wv_df.timestamp))

# Fact Table

In [24]:
fact_wordviews.limit(10).toPandas()

Unnamed: 0,timestamp,user_id,learning_language,ui_language,lexeme_id,delta,session_pct,history_pct
0,2013-02-28 13:28:01,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,27649635,100.0,67.0
1,2013-02-28 13:28:01,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,27649635,50.0,100.0
2,2013-02-28 13:28:01,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,27649635,100.0,80.0
3,2013-02-28 13:28:01,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,27649635,50.0,83.0
4,2013-02-28 13:28:01,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,27649635,100.0,100.0
5,2013-02-28 13:28:01,u:FO,de,en,56429751fdaedb6e491f4795c770f5a4,27649635,100.0,75.0
6,2013-02-28 13:28:01,u:FO,de,en,1bacf218eaaf9f944e525f7be9b31899,27649635,100.0,100.0
7,2013-02-28 15:07:12,u:dDwF,es,en,73eecb492ca758ddab5371cf7b5cca32,444407,100.0,100.0
8,2013-02-28 15:07:24,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,5963,100.0,75.0
9,2013-02-28 15:07:24,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,5963,75.0,83.0


# Dimension Tables

In [13]:
dim_words.limit(10).toPandas()

Unnamed: 0,lexeme_id,language,lemma,surface,part_of_speech
0,53fea04539ece170abd38bc9463a1f18,fr,demain,demain,
1,eb5deee1fa10b4cdcaea9928669bf4f1,fr,que,que,
2,65d763ca3bbc0b12f476051a7b515681,fr,que,qu',
3,e0a060dbfa9d94237cb1113bbed9595b,fr,que,que,
4,4cfa43d063858472c6ff8ae5eebb881e,fr,que,qu',
5,8bd6d060bb604e17c936418f835d87c8,fr,mon,mon,Determiner
6,cb9b44daeb2fd896731bcdef127db8c3,es,nuestro,nuestros,Determiner
7,192059cfe22cedd84cd400a9b1b47171,es,nuestro,nuestra,Determiner
8,61ed985e99a3bdc4b497a5e0a46b104d,de,viel,vieles,Determiner
9,b3f8a12ba4ee5200897e30a18125d710,pt,aquele,aquela,Determiner


In [14]:
dim_users.limit(10).toPandas()

Unnamed: 0,user_id,number_of_sessions
0,u:bWsA,1
1,u:iC5O,44
2,u:e4vL,1
3,u:czPr,8
4,u:hVlN,26
5,u:inp9,94
6,u:ioKd,205
7,u:cu2y,46
8,u:iL6a,21
9,u:hIc2,2


In [15]:
dim_times.limit(10).toPandas()

Unnamed: 0,epoch,timestamp,hour,day,week,month,year,weekday
0,1362076081,2013-02-28 13:28:01,13,28,9,2,2013,3
1,1362076081,2013-02-28 13:28:01,13,28,9,2,2013,3
2,1362076081,2013-02-28 13:28:01,13,28,9,2,2013,3
3,1362076081,2013-02-28 13:28:01,13,28,9,2,2013,3
4,1362076081,2013-02-28 13:28:01,13,28,9,2,2013,3
5,1362076081,2013-02-28 13:28:01,13,28,9,2,2013,3
6,1362076081,2013-02-28 13:28:01,13,28,9,2,2013,3
7,1362082032,2013-02-28 15:07:12,15,28,9,2,2013,3
8,1362082044,2013-02-28 15:07:24,15,28,9,2,2013,3
9,1362082044,2013-02-28 15:07:24,15,28,9,2,2013,3


In [16]:
dim_langs.limit(10).toPandas()

Unnamed: 0,alpha2_code,English
0,en,English
1,es,Spanish; Castilian
2,fr,French
3,it,Italian
4,de,German
5,pt,Portuguese


# Reference Tables

In [17]:
df.limit(10).toPandas()

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,1.0,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,6,4,2,2
1,0.5,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,4,4,2,1
2,1.0,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,5,4,1,1
3,0.5,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,6,5,2,1
4,1.0,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,4,4,1,1
5,1.0,1362076081,27649635,u:FO,de,en,56429751fdaedb6e491f4795c770f5a4,der/der<det><def><m><sg><nom>,4,3,1,1
6,1.0,1362076081,27649635,u:FO,de,en,1bacf218eaaf9f944e525f7be9b31899,kind/kind<n><nt><sg><nom>,4,4,1,1
7,1.0,1362082032,444407,u:dDwF,es,en,73eecb492ca758ddab5371cf7b5cca32,bajo/bajo<pr>,3,3,1,1
8,1.0,1362082044,5963,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,8,6,6,6
9,0.75,1362082044,5963,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,6,5,4,3


In [18]:
lang_df.limit(10).toPandas()

Unnamed: 0,English,French,alpha2,alpha3-b,alpha3-t
0,Afar,afar,aa,aar,
1,Abkhazian,abkhaze,ab,abk,
2,Achinese,aceh,,ace,
3,Acoli,acoli,,ach,
4,Adangme,adangme,,ada,
5,Adyghe; Adygei,adyghé,,ady,
6,Afro-Asiatic languages,"afro-asiatiques, langues",,afa,
7,Afrihili,afrihili,,afh,
8,Afrikaans,afrikaans,af,afr,
9,Ainu,aïnou,,ain,


In [19]:
lex_df.limit(10).toPandas()

Unnamed: 0,code,part_of_speech
0,adj,Adjective
1,adv,Adverb
2,cnjadv,Adverbial conjunction
3,cnjcoo,Co-ordinating conjunction
4,cnjsub,Subordinating conjunction
5,det,Determiner
6,ij,Interjection
7,n,Noun
8,np,Proper noun
9,num,Numeral


# Analyst Tables

In [20]:
lang_pairs.limit(10).toPandas()

NameError: name 'lang_pairs' is not defined