In [47]:
import pandas as pd
import numpy as np

## Load CSV data (Dataset A)

In [48]:
csv_path = r'C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\input\Spaced Repetition Data.csv'
# Machine-specific absolute path: change it to the location of your copy of Dataset A.

# Load the whole CSV into a DataFrame.
df = pd.read_csv(csv_path)

In [49]:
df.head()

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,1.0,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,6,4,2,2
1,0.5,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,4,4,2,1
2,1.0,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,5,4,1,1
3,0.5,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,6,5,2,1
4,1.0,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,4,4,1,1


## Extract the pt data

In [50]:
# Keep only the rows whose learning_language is 'pt'. The .copy() makes df_pt an
# independent frame, so the columns added to it in later cells do not write into df.
df_pt = df[df['learning_language'] == 'pt'].copy()
df_pt

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
64,1.0,1362082493,1469,u:g3WM,pt,en,57408f89412af98111a2f87c0ab41b22,tu/tu<prn><tn><p2><mf><sg>,48,48,1,1
65,0.5,1362082493,1469,u:g3WM,pt,en,8414835cb39e4315146a59fefdd6d1c6,tem/ter<vblex><pri><p3><sg>,2,2,2,1
66,1.0,1362082493,1469,u:g3WM,pt,en,ecc3feb8e53ce936cef181dd54e7aaca,temos/ter<vblex><pri><p1><pl>,1,1,1,1
67,1.0,1362082493,2184,u:g3WM,pt,en,8d28ba0fa188f1847571467189846dda,tua/teu<det><pos><f><sg>,4,3,1,1
68,1.0,1362082493,1469,u:g3WM,pt,en,4b3613233b3fede2e3e92ac2ef752bf6,leão/leão<n><m><sg>,9,8,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
12852940,1.0,1363104812,604744,u:h3b5,pt,en,09cd6338773c2b99587b336a6141b185,gatas/gato<n><f><pl>,13,12,1,1
12852941,1.0,1363104812,285948,u:h3b5,pt,en,7ab5ef9586b79b2079fcce99fadd811f,cão/cão<n><m><sg>,8,7,1,1
12852942,1.0,1363104812,841,u:h3b5,pt,en,ba634cde4013c8ff4eee5892e92b3e5b,é/ser<vbser><pri><p3><sg>,54,49,1,1
12852943,1.0,1363104812,841,u:h3b5,pt,en,630b709686520fc4f0d0e6077df220c3,bebe/beber<vblex><pri><p3><sg>,27,27,2,2


## Calculate history accuracy and assign lexeme codes

In [51]:
# Row-level historical accuracy. The 0.0001 added to the denominator avoids a division
# by zero; as a side effect a perfect record scores slightly below 1.
df_pt['history_acc_rate'] = df_pt['history_correct'] / (df_pt['history_seen'] + 0.0001)
# Map every lexeme_id to a dense integer code, numbered in order of first appearance.
# `uniques` holds the original ids, positioned by code.
df_pt['lexeme_code'], uniques = pd.factorize(df_pt['lexeme_id'])

## Get the specific word for each lexeme

In [52]:
def parse_word_from_lexeme_string(s: str) -> str:
    """
    Parse `word` from `lexeme_string`.

    Rules you specified:
    - Normal case (no '<*sf>' marker): take everything before the first '<'
        'sua/seu<det><pos><f><sg>' -> 'sua/seu'
    - If '<*sf>' occurs: take everything before the second '<'
        '<*sf>/ter<vblex><pri><*pers><*numb>' -> '<*sf>/ter'
      (i.e., include the '<*sf>' prefix and the lemma after '/')
    """
    if pd.isna(s):
        return None
    s = str(s).strip()

    if "<*sf>" not in s:
        return s.split("<", 1)[0].strip()

    # With '<*sf>': return substring before the second '<'
    first = s.find("<")
    if first == -1:
        return s  # no tags at all
    second = s.find("<", first + 1)
    if second == -1:
        return s  # only one '<' found
    return s[:second].strip()

# Step 1: attach the parsed `word` to a copy of the Portuguese rows.
df_word = df_pt.assign(
    word=df_pt["lexeme_string"].apply(parse_word_from_lexeme_string)
)

# Step 2: collapse to one row per lexeme_id. The code and word are taken from the
# first row of each group (a lexeme_id is expected to map to a single
# lexeme_string); the seen / correct counters are summed over all rows.
agg = df_word.groupby("lexeme_id", as_index=False).agg(
    lexeme_code=("lexeme_code", "first"),
    word=("word", "first"),
    seen_sum=("history_seen", "sum"),
    correct_sum=("history_correct", "sum"),
)

# Step 3: global_correctness = sum(correct) / sum(seen); the small constant in the
# denominator avoids a division by zero.
agg["global_correctness"] = agg["correct_sum"] / (agg["seen_sum"] + 0.0001)

# Step 4: final table, ordered by ascending lexeme_code with a fresh index.
result = (
    agg.loc[:, ["lexeme_id", "lexeme_code", "word", "global_correctness"]]
    .sort_values("lexeme_code")
    .reset_index(drop=True)
)

result.head()

Unnamed: 0,lexeme_id,lexeme_code,word,global_correctness
0,57408f89412af98111a2f87c0ab41b22,0,tu/tu,0.969137
1,8414835cb39e4315146a59fefdd6d1c6,1,tem/ter,0.967542
2,ecc3feb8e53ce936cef181dd54e7aaca,2,temos/ter,0.982818
3,8d28ba0fa188f1847571467189846dda,3,tua/teu,0.920078
4,4b3613233b3fede2e3e92ac2ef752bf6,4,leão/leão,0.926518


## Get the word frequency in dataset B

In [53]:
import re

TAG_RE = re.compile(r"^<[^>]+>$")  # like <*sf>

def query_from_word(word: str) -> str | None:
    if pd.isna(word):
        return None
    w = str(word).strip()
    if not w:
        return None

    if "/" in w:
        left, right = w.split("/", 1)
        left, right = left.strip(), right.strip()

        # If left is a tag like <*sf>, use right
        if TAG_RE.match(left):
            q = right
        else:
            # In tem/ter, right is lemma; still use right by default
            q = right
    else:
        q = w

    # drop if query itself is a tag
    if TAG_RE.match(q):
        return None

    return q.lower()

In [54]:
def load_datasetB_txt(path: str) -> pd.DataFrame:
    """Parse a Dataset B text file into one row per weighted translation.

    The parser recognises:
    - a header line that starts with ``prompt_`` and contains '|': the text before
      the first '|' becomes the current prompt id;
    - a blank line: ends the current prompt block;
    - any other line containing '|' while a prompt is open: split on the LAST '|'
      into ``translation`` and a float ``p``.
    Lines outside a prompt block, lines without '|', and lines whose trailing
    field is not a float are skipped.

    Parameters
    ----------
    path : location of the text file. It is read as UTF-8; a leading byte-order
        mark is tolerated so that the first header line is still recognised.

    Returns
    -------
    DataFrame with columns ``prompt_id``, ``translation``, ``p``. The columns are
    present even when no row was parsed, so callers can index them safely.
    """
    columns = ["prompt_id", "translation", "p"]
    rows = []
    current_prompt = None

    # "utf-8-sig" decodes plain UTF-8 too; it only drops a BOM at the file start.
    with open(path, "r", encoding="utf-8-sig") as f:
        for raw in f:
            line = raw.strip()

            # blank line: end of a prompt block
            if not line:
                current_prompt = None
                continue

            # prompt header line: keep only the prompt_xxx id
            if line.startswith("prompt_") and "|" in line:
                current_prompt = line.split("|", 1)[0].strip()
                continue

            # stray line outside a block, or a line with no probability field
            if current_prompt is None or "|" not in line:
                continue

            # rsplit, so a '|' inside the translation text stays in the text
            trans, prob = line.rsplit("|", 1)
            try:
                p = float(prob)
            except ValueError:
                continue

            rows.append({"prompt_id": current_prompt, "translation": trans, "p": p})

    return pd.DataFrame(rows, columns=columns)

In [55]:
# Tokenizer for Portuguese-like text: runs of (accented) Latin letters, optionally
# joined by single hyphens or apostrophes.
TOKEN_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+(?:[-'][A-Za-zÀ-ÖØ-öø-ÿ]+)*", re.UNICODE)

def tokens_from_translation(text: str) -> set[str]:
    """Return the distinct lowercase tokens in `text`; an empty set if it is missing."""
    if pd.isna(text):
        return set()
    lowered = str(text).lower()
    return {found.group(0) for found in TOKEN_RE.finditer(lowered)}

def build_token_prompt_freq(dfB: pd.DataFrame) -> pd.DataFrame:
    """
    Sum translation probabilities per (token, prompt).

    Each translation is reduced to its set of distinct tokens, so a token counts
    at most once per translation. For every (token, prompt_id) pair the `p` values
    of the translations containing that token are added up.

    Returns a DataFrame with columns: token, prompt_id, prompt_freq.
    """
    with_tokens = dfB.assign(
        token_set=dfB["translation"].apply(tokens_from_translation)
    )

    # One row per (translation, token); translations without tokens explode to a
    # missing value and are dropped.
    long_form = (
        with_tokens[["prompt_id", "p", "token_set"]]
        .explode("token_set")
        .rename(columns={"token_set": "token"})
        .dropna(subset=["token"])
    )

    summed = long_form.groupby(["token", "prompt_id"], as_index=False)["p"].sum()
    return summed.rename(columns={"p": "prompt_freq"})

In [56]:
def add_frequency_from_datasetB(result: pd.DataFrame, token_prompt: pd.DataFrame, total_prompts: int) -> pd.DataFrame:
    """
    Attach Dataset B statistics to the per-lexeme table.

    Adds four columns to a copy of `result`:
    - query_token: lookup key derived from `word` via `query_from_word`
    - prompt_count: number of distinct prompts in which the token appears
    - frequency: mean `prompt_freq` of the token over those prompts
    - prompt_coverage: prompt_count / total_prompts

    The join is a left join, so words whose token is absent from `token_prompt`
    keep their row with missing values in the three statistic columns.
    """
    per_token = token_prompt.groupby("token", as_index=False).agg(
        prompt_count=("prompt_id", "nunique"),
        frequency=("prompt_freq", "mean"),
    )
    per_token = per_token.rename(columns={"token": "query_token"})

    enriched = result.assign(query_token=result["word"].apply(query_from_word))
    enriched = enriched.merge(per_token, on="query_token", how="left")
    enriched["prompt_coverage"] = enriched["prompt_count"] / total_prompts
    return enriched

In [57]:
# Concatenate the dev, test and train gold files into a single Dataset B text file.
files = [r"C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\input\dev.en_pt.2020-02-20.gold.txt",
         r"C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\input\test.en_pt.2020-02-20.gold.txt",
         r"C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\input\train.en_pt.2020-01-13.gold.txt"]
out_file = r"C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\input\merged_en_pt.txt"

# The encoding is explicit because the merged file is later read back as UTF-8.
# With the platform default (e.g. cp1252 on Windows), accented text could fail to
# decode or be re-encoded differently. "utf-8-sig" on the input side also drops a
# byte-order mark, which would otherwise end up in the middle of the merged file.
with open(out_file, "w", encoding="utf-8") as out:
    for f in files:
        with open(f, "r", encoding="utf-8-sig") as inp:
            text = inp.read()
        out.write(text)
        # Stop the last line of one file fusing with the first line of the next.
        if text and not text.endswith("\n"):
            out.write("\n")

In [58]:
# Read the merged file from the location the merge cell writes it to
# (...\Adding_B_feature\input\merged_en_pt.txt), so a fresh run does not fail or
# pick up a stale copy from another folder.
path_B = r"C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\input\merged_en_pt.txt"
dfB = load_datasetB_txt(path_B)

# Number of distinct prompts; used later as the denominator of prompt_coverage.
total_prompts = dfB["prompt_id"].nunique()
assert total_prompts > 0, "no prompts were parsed from the merged Dataset B file"

token_prompt = build_token_prompt_freq(dfB)

token_prompt.head()

Unnamed: 0,token,prompt_id,prompt_freq
0,a,prompt_001dbd157d83706b3cf32f34313ff3ab,0.19783
1,a,prompt_0036b71bbdf9cb0046e200a7741924fb,1.0
2,a,prompt_005a9290311daddd1c9170c5916f9998,0.03208
3,a,prompt_007cf192fbfca3c0f839fa90facbb28f,0.2032
4,a,prompt_007e9fdefb3c7193d4a377b602620014,0.118014


## Adding frequency to the word df

In [59]:
# Enrich the per-lexeme table with the Dataset B statistics, then restore the
# lexeme_code order with a fresh index.
result_with_freq = (
    add_frequency_from_datasetB(result, token_prompt, total_prompts)
    .sort_values("lexeme_code")
    .reset_index(drop=True)
)

result_with_freq.head()

Unnamed: 0,lexeme_id,lexeme_code,word,global_correctness,query_token,prompt_count,frequency,prompt_coverage
0,57408f89412af98111a2f87c0ab41b22,0,tu/tu,0.969137,tu,632.0,0.142486,0.1264
1,8414835cb39e4315146a59fefdd6d1c6,1,tem/ter,0.967542,ter,63.0,0.269861,0.0126
2,ecc3feb8e53ce936cef181dd54e7aaca,2,temos/ter,0.982818,ter,63.0,0.269861,0.0126
3,8d28ba0fa188f1847571467189846dda,3,tua/teu,0.920078,teu,106.0,0.195032,0.0212
4,4b3613233b3fede2e3e92ac2ef752bf6,4,leão/leão,0.926518,leão,,,


In [60]:
result_with_freq['prompt_count'].isna().sum()

np.int64(377)

Of the 2815 words, 377 never appear in dataset B.

## Calculate weight for each word
$$
\mathrm{weight\_raw}
=
\left(4p(1-p)\right)^{\beta}
\cdot
\left(\sqrt{f}\right)^{\delta}
\cdot
\left(\sqrt{\mathrm{cov}}\right)^{\gamma}
$$

In [61]:
def build_word_weight_table(
    word_with_freq: pd.DataFrame,
    beta: float = 1.0,
    delta: float = 0.5,
    gamma: float = 0.5,
    freq_default: float = 0.01,  # substituted for a missing frequency (instead of 0)
    cov_default: float = 0.01,   # substituted for a missing coverage (instead of 0)
    normalize: bool = True
) -> pd.DataFrame:
    """
    Compute a weight per word from correctness, frequency and prompt coverage.

        weight_raw = (4 p (1 - p)) ** beta * sqrt(f) ** delta * sqrt(cov) ** gamma

    - p   = `global_correctness`, clipped to [0, 1]; missing -> 0
    - f   = `frequency`, missing -> `freq_default`, then clipped to >= 0
    - cov = `prompt_coverage`, missing -> `cov_default`, then clipped to [0, 1]

    With `normalize=True` the weight is weight_raw divided by its maximum; if that
    maximum is zero or not finite, every weight is set to 0.0. Otherwise the weight
    equals weight_raw.

    Returns a new frame (the input is not modified) restricted to whichever of
    lexeme_id, lexeme_code, word, global_correctness, frequency, prompt_coverage,
    weight are present, sorted by (lexeme_code, lexeme_id) with a fresh index.
    """
    table = word_with_freq.copy()

    # Discrimination term: largest at p = 0.5 and zero at p = 0 or p = 1.
    correctness = table["global_correctness"].astype(float).clip(0, 1).fillna(0.0)
    discrimination = 4.0 * correctness * (1.0 - correctness)

    raw_freq = table["frequency"].astype(float)
    freq = raw_freq.mask(raw_freq.isna(), freq_default).clip(lower=0)

    raw_cov = table["prompt_coverage"].astype(float)
    coverage = raw_cov.mask(raw_cov.isna(), cov_default).clip(0, 1)

    table["weight_raw"] = (
        (discrimination ** beta)
        * (np.sqrt(freq) ** delta)
        * (np.sqrt(coverage) ** gamma)
    )

    if not normalize:
        table["weight"] = table["weight_raw"]
    else:
        peak = table["weight_raw"].max()
        if peak and np.isfinite(peak):
            table["weight"] = table["weight_raw"] / peak
        else:
            table["weight"] = 0.0

    wanted = (
        "lexeme_id", "lexeme_code", "word",
        "global_correctness", "frequency", "prompt_coverage",
        "weight",
    )
    present = [name for name in wanted if name in table.columns]
    ordered = table[present].sort_values(["lexeme_code", "lexeme_id"])
    return ordered.reset_index(drop=True)


# Build the weight table; the placeholder frequency / coverage for words missing from
# dataset B are passed explicitly (0.01 each, the same as the function defaults).
word_weight_table = build_word_weight_table(result_with_freq, freq_default=0.01, cov_default=0.01)

In [62]:
word_weight_table.head()

Unnamed: 0,lexeme_id,lexeme_code,word,global_correctness,frequency,prompt_coverage,weight
0,57408f89412af98111a2f87c0ab41b22,0,tu/tu,0.969137,0.142486,0.1264,0.067243
1,8414835cb39e4315146a59fefdd6d1c6,1,tem/ter,0.967542,0.269861,0.0126,0.046538
2,ecc3feb8e53ce936cef181dd54e7aaca,2,temos/ter,0.982818,0.269861,0.0126,0.025025
3,8d28ba0fa188f1847571467189846dda,3,tua/teu,0.920078,0.195032,0.0212,0.114432
4,4b3613233b3fede2e3e92ac2ef752bf6,4,leão/leão,0.926518,,,0.041781


## Use user word correctness and word weight to calculate user feature scores
$$
\mathrm{feature\_score}(u,l)
=
r(u,l)\cdot \sqrt{w(l)}
$$

In [63]:
def build_user_lexeme_features_A(
    user_lexeme_df: pd.DataFrame,
    lexeme_weight_df: pd.DataFrame,
    recall_col: str = "history_acc_rate",
    normalize_weight: bool = False,   # leave False when `weight` is already on a 0-1 scale
) -> pd.DataFrame:
    """
    Score every row of the user/lexeme table as recall * sqrt(weight).

    Parameters
    ----------
    user_lexeme_df : rows with at least `user_id`, `lexeme_code` and `recall_col`.
    lexeme_weight_df : must contain `lexeme_code`, `weight`, `global_correctness`;
        only the first row per lexeme_code is used.
    recall_col : column holding the user's recall; missing -> 0, clipped to [0, 1].
    normalize_weight : divide `weight` by its maximum first (all weights become 0.0
        if that maximum is zero or not finite).

    Returns
    -------
    A new frame with columns user_id, lexeme_code, feature_score, one row per input
    row (sparse form). Lexemes without a weight score 0. Neither input is modified.
    """
    weights = lexeme_weight_df.copy()

    if normalize_weight:
        top = weights["weight"].max()
        if top and np.isfinite(top):
            weights["weight"] = weights["weight"] / top
        else:
            weights["weight"] = 0.0

    weights = weights[["lexeme_code", "weight", "global_correctness"]].drop_duplicates("lexeme_code")

    # Left join: exactly the user/lexeme rows of the user table are produced.
    joined = user_lexeme_df.merge(weights, on="lexeme_code", how="left")

    # Missing values become 0 and negatives are clipped; everything except the
    # weight is also capped at 1. global_correctness is cleaned here but is not used
    # by the current score (it belonged to an alternative, centred formula).
    for column, upper in (("weight", None), ("global_correctness", 1.0), (recall_col, 1.0)):
        joined[column] = joined[column].astype(float).fillna(0.0).clip(lower=0.0, upper=upper)

    joined["feature_score"] = joined[recall_col] * np.sqrt(joined["weight"])
    return joined[["user_id", "lexeme_code", "feature_score"]].copy()

# Build the sparse feature table from the Portuguese rows and the word weights
# (recall column and normalisation are left at the function defaults).
user_lexeme_feature_sparse = build_user_lexeme_features_A(
    user_lexeme_df=df_pt,
    lexeme_weight_df=word_weight_table,  # must contain lexeme_code, weight and global_correctness
)
user_lexeme_feature_sparse.head()

Unnamed: 0,user_id,lexeme_code,feature_score
0,u:g3WM,0,0.259312
1,u:g3WM,1,0.215716
2,u:g3WM,2,0.158178
3,u:g3WM,3,0.253702
4,u:g3WM,4,0.181691


In [64]:
# Wide user x lexeme matrix. pivot_table averages the feature_score when a user has
# several rows for the same lexeme (its default aggregation is the mean), and
# user/lexeme pairs that never occur are filled with 0.0.
user_feature_matrix = (
    user_lexeme_feature_sparse
      .pivot_table(index="user_id", columns="lexeme_code", values="feature_score", fill_value=0.0)
)
lexeme_codes = user_feature_matrix.columns.tolist()

# Name each column after its lexeme code, not after its position in the pivot.
# The two agree while the codes are contiguous from 0, but a gap in the codes would
# otherwise shift every later column onto the wrong name without any error.
col_map = {code: f"lexeme_{int(code)}" for code in lexeme_codes}

user_feature_matrix_renamed = user_feature_matrix.rename(columns=col_map)

In [65]:
# Save the matrix. The index is written too, so user_id becomes the first CSV column.
user_feature_matrix_renamed.to_csv(r'C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\output\user_feature_matrix.csv')

In [66]:
user_feature_matrix_renamed.head()

lexeme_code,lexeme_0,lexeme_1,lexeme_2,lexeme_3,lexeme_4,lexeme_5,lexeme_6,lexeme_7,lexeme_8,lexeme_9,...,lexeme_2805,lexeme_2806,lexeme_2807,lexeme_2808,lexeme_2809,lexeme_2810,lexeme_2811,lexeme_2812,lexeme_2813,lexeme_2814
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u:0X2,0.259311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u:0b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u:0xw,0.0,0.0,0.0,0.0,0.0,0.0,0.487101,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u:1EH,0.259311,0.0,0.0,0.0,0.204402,0.0,0.365346,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u:1gx,0.259311,0.0,0.0,0.0,0.0,0.0,0.324745,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Adding other features

In [67]:
# Existing per-user feature table (users_fingerprint.csv) that the new lexeme
# scores are merged with further down. Machine-specific absolute path.
feature_path = r"C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\users_fingerprint.csv"
user_feature = pd.read_csv(feature_path)

In [68]:
# Read back the user x lexeme matrix saved earlier in this notebook; after the CSV
# round trip user_id is an ordinary column rather than the index.
feature_path_B = r"C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\output\user_feature_matrix.csv"
user_feature_B = pd.read_csv(feature_path_B)

In [69]:
user_feature_B

Unnamed: 0,user_id,lexeme_0,lexeme_1,lexeme_2,lexeme_3,lexeme_4,lexeme_5,lexeme_6,lexeme_7,lexeme_8,...,lexeme_2805,lexeme_2806,lexeme_2807,lexeme_2808,lexeme_2809,lexeme_2810,lexeme_2811,lexeme_2812,lexeme_2813,lexeme_2814
0,u:0X2,0.259311,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,u:0b,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,u:0xw,0.000000,0.0,0.0,0.0,0.000000,0.0,0.487101,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,u:1EH,0.259311,0.0,0.0,0.0,0.204402,0.0,0.365346,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,u:1gx,0.259311,0.0,0.0,0.0,0.000000,0.0,0.324745,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2704,u:yT9,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2705,u:yyO,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2706,u:z4x,0.248016,0.0,0.0,0.0,0.175201,0.0,0.487085,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2707,u:zmi,0.000000,0.0,0.0,0.0,0.000000,0.0,0.324745,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
user_feature

Unnamed: 0.1,Unnamed: 0,user_id,max_history_seen,vocab_size,learning_speed,lexeme_0,lexeme_1,lexeme_2,lexeme_3,lexeme_4,...,lexeme_2805_seen,lexeme_2806_seen,lexeme_2807_seen,lexeme_2808_seen,lexeme_2809_seen,lexeme_2810_seen,lexeme_2811_seen,lexeme_2812_seen,lexeme_2813_seen,lexeme_2814_seen
0,0,u:0X2,15,9,13.563969,0.999994,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,u:0b,12,12,3.629236,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,u:0xw,20,108,2.633296,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,u:1EH,14,52,2.580331,0.999992,0.0,0.0,0.0,0.999987,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,u:1gx,15,71,7.584663,0.999994,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2704,2704,u:yT9,20,13,13.931693,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2705,2705,u:yyO,21,14,2.044361,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2706,2706,u:z4x,85,83,2.375734,0.958847,0.0,0.0,0.0,0.868058,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2707,2707,u:zmi,12,42,8.380809,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
# Column names for the 2815 lexeme codes (0 ... 2814). The count is hard-coded and
# must match the number of lexeme columns in both input tables.
seen_cols = [f"lexeme_{i}_seen" for i in range(2815)]  # lexeme_0_seen ... lexeme_2814_seen
lexeme_cols = [f"lexeme_{i}" for i in range(2815)]    # lexeme_0 ... lexeme_2814

# From the existing fingerprint keep the id, the three summary features and the
# *_seen columns; its own lexeme_<i> columns are not selected.
user_feature_keep = (
    ["user_id", "max_history_seen", "vocab_size", "learning_speed"]
    + seen_cols
)
# From the new matrix keep the id and the weighted lexeme_<i> scores.
user_feature_B_keep = ["user_id"] + lexeme_cols

df1 = user_feature.loc[:, user_feature_keep]
df2 = user_feature_B.loc[:, user_feature_B_keep]

# Inner join: only users present in both tables are kept.
merged = df1.merge(df2, on="user_id", how="inner")

In [74]:
# Save the merged, unscaled table; index=False leaves the row index out of the file.
merged.to_csv(r"C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\output\user_fingerprint_B.csv", index=False)

## Data normalization

In [76]:
from sklearn.preprocessing import StandardScaler

# Select the columns to be standardized: every column except the user_id key.
cols_to_scale = merged.columns.drop("user_id")


# StandardScaler rescales each selected column to zero mean and unit variance.
# The scaled values go into a copy, so `merged` keeps the original values.
scaler = StandardScaler()
df_scaled = merged.copy()
df_scaled[cols_to_scale] = scaler.fit_transform(merged[cols_to_scale])

In [78]:
# Save the standardized table; index=False leaves the row index out of the file.
df_scaled.to_csv(r"C:\Users\lxa0530\OneDrive\Desktop\Datathon\Adding_B_feature\output\user_fingerprint_B_scaled.csv", index=False)