## Dimensionality Reduction of Knowledge & Exposure Fingerprint Features

- Main aim is to reduce dimensionality of these feature by grouping semantically similar lexemes into clusters that we can aggregate scores into
- Use [CALE-XLLEX](https://huggingface.co/gabrielloiseau/CALE-XLLEX) from Huggingface to generate emeddings
- Add in morphological tags as onehotencoded features
- Cluster using HDBSCAN / k-means

In [1]:
import sys
sys.path.append("..")

%load_ext autoreload
%autoreload 2

#### Load lexeme_dataset

In [2]:
import pandas as pd

dataset_a_path = "/Users/andrea/Documents/duolingo_hack/dataset_a_spacedrepetition.csv"

dataset_a_raw = pd.read_csv(dataset_a_path)
dataset_a_raw.info()

<class 'pandas.DataFrame'>
RangeIndex: 12854226 entries, 0 to 12854225
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   p_recall           float64
 1   timestamp          int64  
 2   delta              int64  
 3   user_id            str    
 4   learning_language  str    
 5   ui_language        str    
 6   lexeme_id          str    
 7   lexeme_string      str    
 8   history_seen       int64  
 9   history_correct    int64  
 10  session_seen       int64  
 11  session_correct    int64  
dtypes: float64(1), int64(6), str(5)
memory usage: 1.9 GB


In [3]:
from src.data.preprocess_lexeme import parse_morph_tags_from_lexeme_string, parse_word_from_lexeme_string
from tqdm import tqdm
tqdm.pandas()

lexeme_df = dataset_a_raw.loc[dataset_a_raw.learning_language=="pt",['lexeme_id', 'lexeme_string']].drop_duplicates().assign(
    word = lambda x: x.lexeme_string.progress_apply(parse_word_from_lexeme_string),
    word_sf = lambda x: x.word.str.replace("^.*/", "", regex=True),
    word_clean = lambda x: x.word.str.replace("/.*$", "", regex=True),
    morph_tags = lambda x: x.lexeme_string.progress_apply(parse_morph_tags_from_lexeme_string)
)

lexeme_df.loc[lexeme_df.word_clean=="<*sf>", "word_clean"] = lexeme_df.loc[lexeme_df.word_clean=="<*sf>", "word_sf"]
lexeme_df.drop(columns="morph_tags").nunique()

100%|██████████| 2815/2815 [00:00<00:00, 626497.17it/s]
100%|██████████| 2815/2815 [00:00<00:00, 466660.04it/s]


lexeme_id        2815
lexeme_string    2815
word             2256
word_sf          1600
word_clean       1992
dtype: int64

In [4]:
## Look at when the same word is duplicated.. why are there different lexeme_ids? Looks like there are different morphological annotation which could be human error but also idk maybe same spelling different meaning like "I read a book" can be both present and past tense

lexeme_df.loc[lexeme_df.duplicated(subset=["word"], keep=False)].sort_values('word') 

Unnamed: 0,lexeme_id,lexeme_string,word,word_sf,word_clean,morph_tags
79107,87a2035d3e8db90a09b6f0e918e27422,<*sf>/abrir<vblex><pri><*pers><*numb>,<*sf>/abrir,abrir,abrir,"[<*sf>, <vblex>, <pri>, <*pers>, <*numb>]"
968967,78a3dc9a89d3269c9cb3908185b72b76,<*sf>/abrir<vblex><pp><*gndr><*numb><@present_...,<*sf>/abrir,abrir,abrir,"[<*sf>, <vblex>, <pp>, <*gndr>, <*numb>, <@pre..."
4097724,aca35a89d2157d6aa3f483d2349d2694,<*sf>/abrir<vblex><cni><*pers><*numb>,<*sf>/abrir,abrir,abrir,"[<*sf>, <vblex>, <cni>, <*pers>, <*numb>]"
5187118,9411c14227976e814c749377bbdd99df,<*sf>/abrir<vblex><ifi><*pers><*numb>,<*sf>/abrir,abrir,abrir,"[<*sf>, <vblex>, <ifi>, <*pers>, <*numb>]"
11676766,618939eca4e920e91a53c9069a9a7c9c,<*sf>/abrir<vblex><fti><*pers><*numb>,<*sf>/abrir,abrir,abrir,"[<*sf>, <vblex>, <fti>, <*pers>, <*numb>]"
...,...,...,...,...,...,...
11080420,c4521713a55acd03761a16885a4d284f,vai/ir<vblex><pri><p3><sg><@future_phrasal>,vai/ir,ir,vai,"[<vblex>, <pri>, <p3>, <sg>, <@future_phrasal>]"
113367,1acf23ffbb788d259351dc5e9cb749f8,vocês/você<prn><tn><p3><mf><pl>,vocês/você,você,vocês,"[<prn>, <tn>, <p3>, <mf>, <pl>]"
9052892,39bb5ac0043764254a436992e7ed181b,vocês/você<prn><tn><p2><mf><pl>,vocês/você,você,vocês,"[<prn>, <tn>, <p2>, <mf>, <pl>]"
913339,835d261ae0439c3956f028e35230e190,voltar/voltar<vblex><inf><@future_phrasal>,voltar/voltar,voltar,voltar,"[<vblex>, <inf>, <@future_phrasal>]"


### Load Dataset B for contextual usage of lexemes

In [5]:
from src.data.load_data import load_datasetB_txt
import os

# merge text files into one, dev + test + train
dataset_b_dir = "/Users/andrea/Documents/duolingo_hack/dataset_b/staple-2020/en_pt"
files = [os.path.join(dataset_b_dir, s) for s in os.listdir(dataset_b_dir) if s.endswith("gold.txt")]
dataset_b_path = "/Users/andrea/Documents/duolingo_hack/dataset_b_pt_gold.txt.csv"

with open(dataset_b_path, "w") as out:
    for f in files:
        with open(f, "r") as inp:
            out.write(inp.read())

dataset_b_raw = load_datasetB_txt(dataset_b_path)

print(dataset_b_raw.shape)
print(dataset_b_raw.nunique())

(654625, 3)
prompt_id        5000
translation    653075
p               49033
dtype: int64


In [6]:
from src.data.preprocess_lexeme import tokens_from_translation
import re

sentences_df = dataset_b_raw.copy().assign(
    token = lambda x: x.translation.progress_apply(tokens_from_translation)
).explode('token')

sentences_df.drop_duplicates(subset=['translation', 'token'], inplace=True)
sentences_df.sort_values("p", inplace=True)
sentences_df.drop_duplicates(subset=['token', 'prompt_id'], keep='last', inplace=True) # keep only most frequently used sentence per translation per token
sentences_df = sentences_df.loc[sentences_df.token.isin(set(lexeme_df.word_clean))]
sentences_df = sentences_df.groupby("token").tail(10).reset_index(drop=True) # limit to max 10 sentences per token

sentences_df['embed_input'] = sentences_df.progress_apply(lambda r: re.sub(f"({r.token})", "<t>\\1</t>", r.translation), axis=1) #tag target token for model
sentences_df = sentences_df.groupby('token')['embed_input'].agg("\n".join).reset_index() ## group sentences into one single input

sentences_df#.head()

100%|██████████| 654625/654625 [00:02<00:00, 288947.87it/s]
100%|██████████| 10807/10807 [00:00<00:00, 43328.49it/s]


Unnamed: 0,token,embed_input
0,a,você está chut<t>a</t>ndo <t>a</t> mes<t>a</t>...
1,abacaxi,"bem, aquilo irá ser um <t>abacaxi</t>."
2,aberto,o estabelecimento está <t>aberto</t>?\no museu...
3,abre,a gente <t>abre</t> os nossos prédios à comuni...
4,abril,ambos os aniversários são em <t>abril</t>?
...,...,...
1634,última,não deixo a pesquisa até a <t>última</t> hora....
1635,último,eles viveram na áfrica no <t>último</t> ano.\n...
1636,últimos,nós perdemos nos <t>últimos</t> momentos do jo...
1637,único,"dois anos mais tarde, ele venceu por um <t>úni..."


In [7]:
len( set(lexeme_df.word_clean) - set(sentences_df.token) )

353

In [8]:
### add in lexemes with no sentences just as it is

sentences_df = pd.concat([sentences_df,
                          pd.DataFrame({"token": list(set(lexeme_df.word_clean) - set(sentences_df.token))}).assign(
                            embed_input = lambda x: "<t>" + x.token + "</t>")
                          ])

sentences_df.nunique()

token          1992
embed_input    1992
dtype: int64

In [9]:
len(sentences_df['token']) == len(set(sentences_df.token))

True

In [10]:
# sentences_df.to_parquet("../data/tmp/sentences_df.parquet")

#### Generate Embeddings

In [11]:
## some how tensors can't run in my notebook so i have to put it in a separate script and run from cli

# python src/models/LexicalEmbed.py

In [12]:
sem_embeddings = pd.read_parquet("../data/tmp/embeddings.parquet")
sem_embeddings.index = sentences_df['token']
sem_embeddings.sort_index(inplace=True)
sem_embeddings.head(10)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,0.159608,-0.789991,-1.280254,-0.977334,0.201887,1.208807,0.968916,1.482001,0.835401,-0.096745,...,0.626819,0.031995,-1.596566,0.052414,-0.401835,-0.444202,0.052495,-0.656819,0.276474,0.807692
abacaxi,0.11286,-0.235563,-0.715782,-0.147082,-0.118477,1.047712,1.569708,1.425568,0.915193,-0.49661,...,1.051359,0.696129,-0.768505,-0.528718,-2.356428,-0.940612,0.08549,-1.276624,-0.594671,0.491628
abacaxis,0.104341,-0.850763,0.733975,0.536416,-0.168587,0.349298,1.075039,1.73135,-0.362329,0.992692,...,-0.446859,0.062616,-0.588157,0.616257,-1.667086,1.415089,-0.884691,-0.613696,-0.322179,0.046296
abelha,-0.026599,-0.907398,-1.250067,-1.786331,-0.295168,1.354755,0.893484,1.579531,0.631786,-0.815755,...,1.308146,0.419317,-1.364628,0.393218,-0.94896,0.73676,0.708868,-1.746192,-0.158883,0.874223
aberto,-0.249868,-1.083624,-0.60775,-0.709888,0.272246,0.935445,-0.26485,1.032278,0.645995,-1.262559,...,0.441675,-1.242004,-0.728554,0.676841,0.136742,0.905638,1.376844,0.309301,2.045292,0.682605
abre,-1.196108,-0.452264,-1.141945,-1.126796,0.123103,1.630528,0.358479,0.949989,0.166656,-0.469382,...,0.913266,-1.331908,-1.026353,-0.047394,0.474761,0.989215,0.839568,0.237909,0.977563,1.516168
abril,0.60718,-0.385094,-2.13738,-0.101805,0.4183,-0.987933,0.606662,1.320576,1.221782,0.231729,...,0.189767,0.067621,0.528885,-0.788262,0.063432,-0.264308,0.539284,-0.391675,-0.717743,0.732163
abrir,-1.464728,-0.388119,-0.922581,-1.405381,-0.467532,1.567218,0.568368,0.653508,0.144286,-0.811723,...,0.364821,-1.542726,-0.662923,-0.142513,0.24018,1.016949,0.573348,0.411292,1.42106,0.972885
absolutamente,0.159063,-0.590736,-0.779604,-0.515869,-0.216373,1.032541,1.422664,-0.953403,1.104002,-0.62243,...,0.73508,0.263916,-1.34884,0.063932,-0.998149,2.04638,0.42209,-0.165625,-1.011416,1.905915
acabar,0.58741,0.318691,-0.419808,-0.828312,-0.381332,1.388464,-0.653525,2.259298,0.718048,0.240456,...,-0.349814,-1.352566,-0.999245,0.813829,-2.317532,-0.22435,1.425098,-0.667375,0.273414,-0.03183


#### Add Morphological Tags

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer

# Initialize the binarizer
mlb = MultiLabelBinarizer()

# Transform the 'morph_tags' column into a binary matrix
# This creates a NumPy array where 1 means the tag is present, 0 otherwise
morph_encoded = mlb.fit_transform(lexeme_df['morph_tags'])

# Create a DataFrame from the encoded tags to see the labels
morph_df = pd.DataFrame(morph_encoded, columns=mlb.classes_, index=lexeme_df.lexeme_id).merge(lexeme_df[['lexeme_id', 'word_clean']], left_index=True, right_on='lexeme_id')

# Preview the result
print(f"Total unique tags found: {len(mlb.classes_)}")
morph_df.head()

Total unique tags found: 61


Unnamed: 0,<*gndr>,<*numb>,<*pers>,<*sf>,<@cond_perfect>,<@future_perfect>,<@future_phrasal>,<@past_perfect>,<@present_perfect>,<@subjunctive_pluperfect>,...,<rel>,<sg>,<sp>,<tn>,<vbhaver>,<vblex>,<vbmod>,<vbser>,lexeme_id,word_clean
64,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,57408f89412af98111a2f87c0ab41b22,tu
65,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,8414835cb39e4315146a59fefdd6d1c6,tem
66,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,ecc3feb8e53ce936cef181dd54e7aaca,temos
67,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,8d28ba0fa188f1847571467189846dda,tua
68,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,4b3613233b3fede2e3e92ac2ef752bf6,leão


In [14]:
lexeme_feat_df = morph_df.merge(sem_embeddings.rename(columns=lambda x: "f"+str(x)), how='left', left_on='word_clean', right_index=True,
                                ).drop(columns=['word_clean']
                                ).set_index("lexeme_id")
lexeme_feat_df.head()

Unnamed: 0_level_0,<*gndr>,<*numb>,<*pers>,<*sf>,<@cond_perfect>,<@future_perfect>,<@future_phrasal>,<@past_perfect>,<@present_perfect>,<@subjunctive_pluperfect>,...,f1014,f1015,f1016,f1017,f1018,f1019,f1020,f1021,f1022,f1023
lexeme_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57408f89412af98111a2f87c0ab41b22,0,0,0,0,0,0,0,0,0,0,...,0.151807,-0.992755,-1.395375,-0.450079,-0.641103,0.593024,1.284555,-0.836021,0.509478,1.917565
8414835cb39e4315146a59fefdd6d1c6,0,0,0,0,0,0,0,0,0,0,...,0.705593,-0.755042,-1.906103,0.615944,-1.583506,1.072103,0.980725,-0.062932,-0.980165,-0.344127
ecc3feb8e53ce936cef181dd54e7aaca,0,0,0,0,0,0,0,0,0,0,...,0.711553,-1.145459,-1.590241,0.747374,-1.808892,1.593481,1.002151,0.299135,-1.026802,-0.642223
8d28ba0fa188f1847571467189846dda,0,0,0,0,0,0,0,0,0,0,...,0.175844,0.845173,-1.38555,-0.825235,0.208458,1.289223,0.629938,-1.519976,-0.514366,-0.091442
4b3613233b3fede2e3e92ac2ef752bf6,0,0,0,0,0,0,0,0,0,0,...,1.142239,-0.429955,-0.531885,-0.165027,-0.797985,-0.302957,-0.043292,-1.206904,-0.11463,0.583535


In [23]:
lexeme_feat_df.nunique()

<*gndr>               2
<*numb>               2
<*pers>               2
<*sf>                 2
<@cond_perfect>       2
                   ... 
f1020              1991
f1021              1992
f1022              1992
f1023              1992
cluster             223
Length: 1086, dtype: int64

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
lexeme_feat_df_scaled = scaler.fit_transform(lexeme_feat_df)
lexeme_feat_df_scaled

array([[-0.40341377, -1.26461904, -0.47263214, ..., -0.35747443,
         0.58911243,  1.84960161],
       [-0.40341377, -1.26461904, -0.47263214, ...,  0.67352118,
        -1.11654319, -0.95096521],
       [-0.40341377, -1.26461904, -0.47263214, ...,  1.15637612,
        -1.16994337, -1.32008707],
       ...,
       [-0.40341377, -1.26461904, -0.47263214, ...,  2.66162436,
         0.32369183,  0.27212726],
       [-0.40341377, -1.26461904, -0.47263214, ...,  0.42043112,
         0.99759677,  1.461434  ],
       [-0.40341377, -1.26461904, -0.47263214, ..., -0.38164205,
         0.3426035 ,  1.23227425]], shape=(2815, 1085))

### Cluster Lexemes

In [89]:
import hdbscan
import umap

# 1. Reduce dimensions while preserving local structure
# This makes density-based clustering much more effective
reducer = umap.UMAP(
    n_neighbors=15, 
    n_components=60, 
    metric='cosine', # Use cosine since we're dealing with embeddings
    random_state=12345
)
u_embeddings = reducer.fit_transform(lexeme_feat_df_scaled)

# 2. Initialize and fit HDBSCAN
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=15,       # Smallest group you'd consider a 'cluster'
    min_samples=1,            # Lower values = more clusters, fewer 'noise' points
    metric='euclidean',       # Use euclidean on the UMAP output
    cluster_selection_method='eom' # 'Excess of Mass' finds stable clusters
)

cluster_labels = clusterer.fit_predict(u_embeddings)

# 3. Add results back to your dataframe
lexeme_feat_df['cluster'] = cluster_labels

  warn(


In [90]:
lexeme_feat_df.reset_index().groupby('cluster')['lexeme_id'].nunique().describe()

count     82.000000
mean      34.329268
std       44.835517
min       15.000000
25%       18.250000
50%       25.000000
75%       35.500000
max      402.000000
Name: lexeme_id, dtype: float64

In [91]:
lexeme_feat_df.to_parquet("../data/lexeme_embed_cluster_results.parquet")

In [92]:
lexeme_cluster_results = lexeme_feat_df[['cluster']].merge(lexeme_df.set_index("lexeme_id")['word'], 'left', left_index=True, right_index=True)

lexeme_cluster_results.to_csv("../data/lexeme_cluster_results.csv")

In [95]:
lexeme_cluster_results.loc[lexeme_cluster_results.cluster==35]

Unnamed: 0_level_0,cluster,word
lexeme_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3b90f64dfa33918b221e0aabd249dc75,35,quando/quando
826a902bc42b3ac85918087164174ab0,35,desde/desde
afcb0785116e603e09dc46ceed0be5ce,35,enquanto/enquanto
e82fa7904e0027596d88bcfc652415d7,35,logo/logo
f3ac9fa9e991238c93688ca47a12edd2,35,então/então
2ec34df87a3756d67e9fb65f3d51c164,35,sempre/sempre
4d2509dacbcc90cddac88cb840fc1dd3,35,agora/agora
80dab43ce8e8a5c1296cc45209c74283,35,já/já
9e0893bcfc6d73adb82afc03a51c1451,35,durante/durante
c198fdbbf74e825b205af212b2347253,35,às/a


In [94]:
lexeme_cluster_results.loc[lexeme_cluster_results.cluster<0]

Unnamed: 0_level_0,cluster,word
lexeme_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6e39fa977508d7c2e5990cfddc80b2f7,-1,<*sf>/fantasia
3a898602c268f23d8acbeadacc9138be,-1,<*sf>/roupa
a650b5f58f811a5360c1c94772525a83,-1,gosta/gostar
0dd6934ef0f42a5151850fa8ca018c2c,-1,és/ser
b2e1f3421eb5216ac144a5714dbf7d6b,-1,lê/ler
...,...,...
c2aacad8db68efca9433ded0e0169687,-1,<*sf>/pista
8c8327dafa63dfd709efbd75117a3312,-1,<*sf>/moda
61c3e481fa09977511996a7fc728c00b,-1,<*sf>/linha
289ec4ae458f4741f9401eb234dd29a3,-1,<*sf>/acontecer


### Match back to user_fingerprint df

In [97]:
user_fingerprint_B = pd.read_csv("../data/user_fingerprint_B.csv")
user_fingerprint_B.shape

(2709, 5634)

In [98]:
user_fingerprint_B.head()

Unnamed: 0,user_id,max_history_seen,vocab_size,learning_speed,lexeme_0_seen,lexeme_1_seen,lexeme_2_seen,lexeme_3_seen,lexeme_4_seen,lexeme_5_seen,...,lexeme_2805,lexeme_2806,lexeme_2807,lexeme_2808,lexeme_2809,lexeme_2810,lexeme_2811,lexeme_2812,lexeme_2813,lexeme_2814
0,u:0X2,15,9,13.563969,15.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,u:0b,12,12,3.629236,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,u:0xw,20,108,2.633296,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,u:1EH,14,52,2.580331,12.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,u:1gx,15,71,7.584663,15.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [107]:
df_pt = dataset_a_raw[dataset_a_raw['learning_language'] == 'pt'].copy()
df_pt['lexeme_code'], uniques = pd.factorize(df_pt['lexeme_id'])

display(df_pt[['lexeme_id', 'lexeme_code']].head())

lexeme_codes = df_pt.drop_duplicates(subset=['lexeme_id', 'lexeme_code']).set_index("lexeme_id")['lexeme_code'].to_dict()
len(lexeme_codes)

lexeme_cluster_results['lexeme_code'] = lexeme_cluster_results.index.map(lexeme_codes)
lexeme_cluster_results.head()

Unnamed: 0,lexeme_id,lexeme_code
64,57408f89412af98111a2f87c0ab41b22,0
65,8414835cb39e4315146a59fefdd6d1c6,1
66,ecc3feb8e53ce936cef181dd54e7aaca,2
67,8d28ba0fa188f1847571467189846dda,3
68,4b3613233b3fede2e3e92ac2ef752bf6,4


Unnamed: 0_level_0,cluster,word,lexeme_code
lexeme_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
57408f89412af98111a2f87c0ab41b22,50,tu/tu,0
8414835cb39e4315146a59fefdd6d1c6,19,tem/ter,1
ecc3feb8e53ce936cef181dd54e7aaca,19,temos/ter,2
8d28ba0fa188f1847571467189846dda,51,tua/teu,3
4b3613233b3fede2e3e92ac2ef752bf6,69,leão/leão,4


In [135]:
user_fingerprint_B_lex_clustered = user_fingerprint_B.melt(id_vars=["user_id", "max_history_seen", "vocab_size", "learning_speed"], var_name="lexeme_metric", value_name="weighted_value").copy()
user_fingerprint_B_lex_clustered = user_fingerprint_B_lex_clustered.assign(
    lexeme_code = lambda x: x.lexeme_metric.str.extract("lexeme_([0-9]+)").astype(int),
    metric_type = lambda x: x.lexeme_metric.str.replace("lexeme_[0-9]+_?", "", regex=True).replace("", "ability")
).merge(lexeme_cluster_results[['lexeme_code', 'cluster']], how='left', on='lexeme_code')

user_fingerprint_B_lex_clustered['cluster'] = user_fingerprint_B_lex_clustered['cluster'].astype(str)
user_fingerprint_B_lex_clustered.loc[user_fingerprint_B_lex_clustered.cluster!="-1", 'cluster'] = "lex_cluster_" + user_fingerprint_B_lex_clustered.loc[user_fingerprint_B_lex_clustered.cluster!="-1", 'cluster']
user_fingerprint_B_lex_clustered.loc[user_fingerprint_B_lex_clustered.cluster=="-1", 'cluster'] = "lexeme_" + user_fingerprint_B_lex_clustered.loc[user_fingerprint_B_lex_clustered.cluster=="-1", 'lexeme_code'].astype(str)

user_fingerprint_B_lex_clustered

Unnamed: 0,user_id,max_history_seen,vocab_size,learning_speed,lexeme_metric,weighted_value,lexeme_code,metric_type,cluster
0,u:0X2,15,9,13.563969,lexeme_0_seen,15.0,0,seen,lex_cluster_50
1,u:0b,12,12,3.629236,lexeme_0_seen,0.0,0,seen,lex_cluster_50
2,u:0xw,20,108,2.633296,lexeme_0_seen,0.0,0,seen,lex_cluster_50
3,u:1EH,14,52,2.580331,lexeme_0_seen,12.0,0,seen,lex_cluster_50
4,u:1gx,15,71,7.584663,lexeme_0_seen,15.0,0,seen,lex_cluster_50
...,...,...,...,...,...,...,...,...,...
15251665,u:yT9,20,13,13.931693,lexeme_2814,0.0,2814,ability,lex_cluster_16
15251666,u:yyO,21,14,2.044361,lexeme_2814,0.0,2814,ability,lex_cluster_16
15251667,u:z4x,85,83,2.375734,lexeme_2814,0.0,2814,ability,lex_cluster_16
15251668,u:zmi,12,42,8.380809,lexeme_2814,0.0,2814,ability,lex_cluster_16


In [136]:
user_fingerprint_B_lex_clustered.metric_type.value_counts()

metric_type
seen       7625835
ability    7625835
Name: count, dtype: int64

In [138]:
user_fingerprint_B_lex_clustered = user_fingerprint_B_lex_clustered.groupby(["user_id", "max_history_seen", "vocab_size", "learning_speed", "metric_type", "cluster"]).agg(
    {"lexeme_code": set,
     "weighted_value": "mean"}
).reset_index().assign(
    lexeme_metric = lambda x: x.cluster + "_" + x.metric_type
)

user_fingerprint_B_lex_clustered.head()

Unnamed: 0,user_id,max_history_seen,vocab_size,learning_speed,metric_type,cluster,lexeme_code,weighted_value,lexeme_metric
0,u:0X2,15,9,13.563969,ability,lex_cluster_0,"{1155, 2567, 1160, 2188, 2191, 402, 1556, 1814...",0.0,lex_cluster_0_ability
1,u:0X2,15,9,13.563969,ability,lex_cluster_1,"{258, 2055, 777, 1289, 140, 1935, 1947, 2721, ...",0.01285,lex_cluster_1_ability
2,u:0X2,15,9,13.563969,ability,lex_cluster_10,"{1031, 1799, 1801, 1035, 1420, 1550, 1678, 245...",0.0,lex_cluster_10_ability
3,u:0X2,15,9,13.563969,ability,lex_cluster_11,"{1570, 1891, 1668, 197, 1990, 2087, 2584, 2762...",0.0,lex_cluster_11_ability
4,u:0X2,15,9,13.563969,ability,lex_cluster_12,"{898, 1411, 5, 1669, 524, 2473, 43, 1326, 820,...",0.0,lex_cluster_12_ability


In [140]:
user_fingerprint_B_lex_cluster_only = user_fingerprint_B_lex_clustered.loc[user_fingerprint_B_lex_clustered.lexeme_metric.str.contains("lex_cluster")
                                                                           ].pivot(columns="lexeme_metric", values="weighted_value", index=["user_id", "max_history_seen", "vocab_size", "learning_speed"])
user_fingerprint_B_lex_cluster_only.reset_index(inplace=True)
user_fingerprint_B_lex_cluster_only

lexeme_metric,user_id,max_history_seen,vocab_size,learning_speed,lex_cluster_0_ability,lex_cluster_0_seen,lex_cluster_10_ability,lex_cluster_10_seen,lex_cluster_11_ability,lex_cluster_11_seen,...,lex_cluster_79_ability,lex_cluster_79_seen,lex_cluster_7_ability,lex_cluster_7_seen,lex_cluster_80_ability,lex_cluster_80_seen,lex_cluster_8_ability,lex_cluster_8_seen,lex_cluster_9_ability,lex_cluster_9_seen
0,u:0X2,15,9,13.563969,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,u:0b,12,12,3.629236,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.026539,1.340426
2,u:0xw,20,108,2.633296,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,...,0.007076,0.073171,0.000000,0.000000,0.000000,0.000000,0.010506,0.151515,0.011766,0.063830
3,u:1EH,14,52,2.580331,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,...,0.000000,0.000000,0.012514,0.041667,0.008262,0.152174,0.000000,0.000000,0.000000,0.000000
4,u:1gx,15,71,7.584663,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.013606,0.130435,0.016060,0.121212,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2704,u:yT9,20,13,13.931693,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005221,0.121212,0.000000,0.000000
2705,u:yyO,21,14,2.044361,0.000000,0.000000,0.000000,0.00,0.015141,0.055556,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.007238,0.303030,0.000000,0.000000
2706,u:z4x,85,83,2.375734,0.023098,0.090909,0.012643,0.15,0.008833,0.166667,...,0.000000,0.000000,0.000000,0.000000,0.007621,0.217391,0.016847,0.787879,0.002748,0.127660
2707,u:zmi,12,42,8.380809,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.008015,0.043478,0.000000,0.000000,0.005306,0.170213


In [142]:
user_fingerprint_B_lex_cluster_only.to_csv("../data/user_fingerprint_B_lex_clusters.csv")

#### Normalised version

In [143]:
from sklearn.preprocessing import StandardScaler

# Select the columns to be formalized. 
cols_to_scale = user_fingerprint_B_lex_cluster_only.columns.drop("user_id")


scaler = StandardScaler()
df_scaled = user_fingerprint_B_lex_cluster_only.copy()
df_scaled[cols_to_scale] = scaler.fit_transform(user_fingerprint_B_lex_cluster_only[cols_to_scale])

df_scaled.to_csv("../data/user_fingerprint_B_lex_clusters_scaled.csv")