In [1]:
# Install sparse_dot_topn from a wheel file attached as a Kaggle input dataset.
# NOTE(review): `%pip install ...` is usually preferred over `!pip` in notebooks
# because it targets the running kernel's environment — confirm and switch if so.
!pip install /kaggle/input/sparse-dot-topn-033/sparse_dot_topn-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/sparse-dot-topn-033/sparse_dot_topn-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: sparse-dot-topn
Successfully installed sparse-dot-topn-0.3.3
[0m

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn
from tqdm import tqdm

# Token length used when building the datasets below; VecModel.forward also
# reads it to bound the pooled token window (positions 1 .. MAX_LEN//2 - 1).
MAX_LEN = 64


# Load content.csv and topics.csv from the competition input directory.
content_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/content.csv")
topic_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/topics.csv")

In [3]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects.
# Presumably this .npy is a trusted, precomputed artifact — confirm provenance.
train_contents = np.load("/kaggle/input/lecr-precomputed/train_content_ids.npy", allow_pickle=True)
# Flag content rows whose id appears in the precomputed id list.
content_df["is_train"] = content_df["id"].isin(set(train_contents))
# Fraction of content rows flagged (shown as the cell output).
content_df["is_train"].mean()

1.0

In [4]:
# A description that merely repeats the title is blanked so that the fallback
# below can replace it.
content_df.loc[content_df["title"] == content_df["description"], "description"] = None
# Fall back to the first 256 characters of the body text when no description is
# left. The result is assigned back rather than using fillna(..., inplace=True)
# on the column selection: with pandas Copy-on-Write the in-place form updates
# a temporary object and leaves content_df unchanged.
content_df["description"] = content_df["description"].fillna(
    content_df["text"].fillna("").apply(lambda x: x[:256])
)

In [5]:
import sys, os
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModel


class VecModel(nn.Module):
    """Text encoder producing one L2-normalised vector per input sequence.

    A transformer backbone is followed by masked pooling over a fixed token
    window and, optionally, a BatchNorm + Linear head.

    Args:
        model_name: Path or name handed to ``AutoConfig.from_pretrained``; the
            backbone is built from that config via ``AutoModel.from_config``.
        size: Feature width used for the optional BatchNorm/Linear head.
        has_top: When True, add the ``bn`` and ``top`` layers after pooling.
    """

    def __init__(self, model_name, size, has_top=True):
        super().__init__()
        self.backbone = AutoModel.from_config(AutoConfig.from_pretrained(model_name))

        self.has_top = has_top
        if has_top:
            # Attribute names are part of the state-dict keys used by load().
            self.bn = nn.BatchNorm1d(size)
            self.top = nn.Linear(size, size)

    def forward(self, ids, mask):
        """Encode a batch of token ids with its attention mask into unit vectors."""
        # Pool over positions 1 .. MAX_LEN//2 - 1 only (position 0 is skipped).
        window = slice(1, MAX_LEN // 2)

        hidden = self.backbone(ids, mask)[0]
        # Masked positions are zeroed, then the mean is taken over the whole
        # window length (not over the number of unmasked tokens).
        pooled = (hidden[:, window, :] * mask[:, window, None]).mean(axis=1)

        if self.has_top:
            pooled = self.top(self.bn(pooled))

        return F.normalize(pooled)

    def save(self, path):
        """Write this module's state dict to ``path``."""
        state = self.state_dict()
        torch.save(state, path)

    def load(self, path):
        """Load a state dict from ``path`` (deserialised onto the CPU first)."""
        state = torch.load(path, map_location='cpu')
        self.load_state_dict(state)

In [6]:
sub_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv")

# Missing titles become empty strings. Both columns are assigned back: calling
# fillna(..., inplace=True) on a column selection does not update the parent
# frame under pandas Copy-on-Write.
topic_df["title"] = topic_df["title"].fillna("")
content_df["title"] = content_df["title"].fillna("")


# Lookups are built from ALL topics, before the has_content filter below, so
# ancestors that have no content themselves can still be resolved.
title_map = topic_df.set_index("id")["title"].to_dict()
parent_map = topic_df.set_index("id")["parent"].to_dict()

topic_df = topic_df[topic_df["has_content"]].reset_index(drop=True)
print(topic_df.shape)

# Titles of the parent, grandparent and great-grandparent; a missing ancestor
# yields an empty title.
topic_df["parent_title"] = topic_df["parent"].apply(lambda x: title_map.get(x, ""))
print(topic_df.shape)

topic_df["grandpa"] = topic_df["parent"].apply(lambda x: parent_map.get(x))
topic_df["grandpa_title"] = topic_df["grandpa"].apply(lambda x: title_map.get(x, ""))
print(topic_df.shape)

topic_df["ggrandpa"] = topic_df["grandpa"].apply(lambda x: parent_map.get(x))
topic_df["ggrandpa_title"] = topic_df["ggrandpa"].apply(lambda x: title_map.get(x, ""))
print(topic_df.shape)

# Topics without a parent use their own id as the parent key.
topic_df["parent"] = topic_df["parent"].fillna(topic_df["id"])

# Marks the topics that appear in the sample submission (the ones to predict).
topic_df["sub"] = topic_df["id"].isin(sub_df["topic_id"])
topic_df.shape

(61517, 9)
(61517, 10)
(61517, 12)
(61517, 14)


(61517, 15)

In [7]:
def extract_number(x, is_subtopic=False):
    """Parse a leading ``<number>:`` prefix from a title.

    Args:
        x: Title string, expected to look like ``"<chapter>:<rest>"``.
        is_subtopic: When True, only the part of the prefix before the first
            ``"."`` is kept (``"3.2"`` -> ``"3"``) and the title is rebuilt as
            ``"<chapter>:<rest>"``.

    Returns:
        ``(chapter, title)``: the chapter as an int and the (possibly rewritten)
        title.

    Raises:
        ValueError: If the title contains no ``":"`` or the prefix is not an
            integer.
    """
    chapter, rest = x.split(":", 1)

    if is_subtopic:
        chapter = chapter.split(".", 1)[0]
        x = f"{chapter}:{rest}"
    chapter = int(chapter)

    return chapter, x


def extract_chapters(df, is_subtopic):
    """Add a ``chapter`` column parsed from ``df["title"]`` (best effort).

    Titles that do not start with a parseable ``<number>:`` prefix are kept as
    they are (missing titles become ``""``) and get a missing chapter. A parsed
    chapter of 0 is also treated as missing.

    Args:
        df: DataFrame with a ``title`` column. It is modified in place.
        is_subtopic: Forwarded to ``extract_number``.

    Returns:
        The same ``df`` object, with ``title`` rewritten and a float ``chapter``
        column holding NaN where no chapter was found.
    """
    titles = np.array(df["title"].fillna(""), dtype=object)

    chapters = np.zeros(len(titles))
    new_titles = titles.copy()

    for i, title in enumerate(titles):
        try:
            chapters[i], new_titles[i] = extract_number(title, is_subtopic)
        except (ValueError, AttributeError, OverflowError):
            # ValueError: no ":" / non-integer prefix; AttributeError: title is
            # not a string; OverflowError: prefix too large for a float slot.
            # Anything else (including KeyboardInterrupt) now propagates
            # instead of being silently swallowed by a bare ``except``.
            continue

    df["title"] = new_titles
    # 0 means "no chapter": store NaN directly rather than casting to int and
    # then assigning None into an integer column (deprecated dtype upcasting).
    df["chapter"] = np.where(chapters == 0, np.nan, chapters)
    return df
    
# Text fed to the models for each content item: "title | kind | description".
# It is built before extract_chapters() rewrites the content titles below, so it
# uses the title as it stands at this point in the cell.
content_df["t"] = content_df["title"].fillna("") + " | " + content_df["kind"].fillna("") + " | " + content_df["description"].fillna("")

# Split a leading "<number>:" prefix of each title into a "chapter" column; for
# content, dotted prefixes are cut at the first "." (is_subtopic=True).
topic_df = extract_chapters(topic_df, False)
content_df = extract_chapters(content_df, True)

In [8]:
# Text fed to the models for each topic: its title followed by the parent,
# grandparent and great-grandparent titles (joined with " @ "), then " | " and
# the description (empty string when missing).
topic_df["t"] = topic_df["title"] + " @ " + topic_df["parent_title"] + " @ " + topic_df["grandpa_title"] + " @ " + topic_df["ggrandpa_title"] + " | " + topic_df["description"].fillna("")

In [9]:
# Load the topic -> content correlations and reshape them to one row per
# (topic_id, content_id) pair.
corr_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv")

# Each cell holds whitespace-separated content ids; turn it into a list.
corr_df["content_ids"] = [ids.split() for ids in corr_df["content_ids"]]

corr_df = (
    corr_df
    .explode('content_ids')
    .rename(columns={"content_ids": "content_id"})
    .reset_index(drop=True)
)

corr_df.head()

Unnamed: 0,topic_id,content_id
0,t_00004da3a1b2,c_1108dd0c7a5d
1,t_00004da3a1b2,c_376c5a8eb028
2,t_00004da3a1b2,c_5bc0e1e2cba0
3,t_00004da3a1b2,c_76231f9d0b5e
4,t_00068291e9a4,c_639ea2ef9c95


In [10]:
# Tokenizer used for the two XLM-RoBERTa models (models[1] and models[2] in to_vec).
MODEL_DIR = "/kaggle/input/xlm-roberta-base-config"

roberta_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# MODEL_DIR is re-bound here; after this cell it points at the BERT directory.
MODEL_DIR = "/kaggle/input/bert-base-multilingual-uncased-config"

# Tokenizer used for the multilingual BERT model (models[0] in to_vec).
bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

In [11]:
# Per-model weights: to_vec multiplies each model's output vector by its weight
# before concatenating the three vectors.
model_w = [0.3, 0.3, 0.4]


# Order matters: to_vec feeds models[0] the BERT tokens and models[1], models[2]
# the XLM-RoBERTa tokens. Only the large model has the BatchNorm/Linear head.
models = [VecModel("/kaggle/input/bert-base-multilingual-uncased-config", 768, has_top=False), 
          VecModel("/kaggle/input/xlm-roberta-base-config", 768, has_top=False), 
          VecModel("/kaggle/input/xlm-roberta-large-config", 1024)]
for model in models:
    model.cuda()
    model.eval()
    
# Load the trained weights into each model (VecModel.load reads the checkpoint
# with map_location='cpu' and copies it into the module).
models[0].load("/kaggle/input/lecr-models-758/vec_model_v36bert_full.pth")
models[1].load("/kaggle/input/lecr-models-754/vec_model_v36_full.pth")
models[2].load("/kaggle/input/lecr-models-754/vec_model_v35large_full.pth")

In [12]:
import gc


# Force a garbage-collection pass; the value shown as the cell output is
# gc.collect()'s return value.
gc.collect()

78

In [13]:
# DataLoader settings read by to_vec: worker processes and batch size.
NW = 8
BS = 64

class LECRDataset(Dataset):
    """Dataset that tokenises the ``t`` text column of a frame with both tokenizers.

    Each item is a dict of tensors: the keys produced by ``roberta_tokenizer``
    as-is, plus the keys produced by ``bert_tokenizer`` prefixed with
    ``"bert_"``. Both tokenizers are module-level globals.

    Args:
        df: DataFrame with a ``t`` column; its index is reset on construction.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, df, max_len=512):
        self.max_len = max_len
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = str(self.df.iloc[index].t)
        # Same padding/truncation settings for both tokenizers.
        tok_kwargs = dict(padding='max_length', truncation=True, max_length=self.max_len)

        item = {
            key: torch.as_tensor(val)
            for key, val in roberta_tokenizer(text, **tok_kwargs).items()
        }
        for key, val in bert_tokenizer(text, **tok_kwargs).items():
            item[f"bert_{key}"] = torch.as_tensor(val)

        return item

In [14]:
from sklearn.neighbors import NearestNeighbors


def to_vec(models, ds):
    """Embed every item of ``ds`` with the three models and concatenate the results.

    ``models[0]`` receives the ``bert_*`` tensors of each batch, ``models[1]``
    and ``models[2]`` receive the un-prefixed tensors. Each model output is
    scaled by the matching entry of the global ``model_w`` before the three
    vectors are concatenated along the feature axis.

    Args:
        models: Sequence of (at least) three callables taking ``(ids, mask)``.
        ds: Dataset yielding dicts with ``input_ids``, ``attention_mask``,
            ``bert_input_ids`` and ``bert_attention_mask``.

    Returns:
        A NumPy array with one row per dataset item, in dataset order.
    """
    loader = DataLoader(ds, batch_size=BS, shuffle=False, num_workers=NW,
                        pin_memory=False, drop_last=False)

    chunks = []

    with torch.no_grad():
        for batch in tqdm(loader, file=sys.stdout):
            ids = batch["input_ids"].cuda()
            mask = batch["attention_mask"].cuda()
            bert_ids = batch["bert_input_ids"].cuda()
            bert_mask = batch["bert_attention_mask"].cuda()

            parts = [model_w[0] * models[0](bert_ids, bert_mask)]
            parts += [model_w[i] * models[i](ids, mask) for i in range(1, 3)]

            chunks.append(torch.cat(parts, axis=1).detach().cpu().numpy())

    return np.concatenate(chunks)



def get_matches(V_topic, V_content, topic_ids, content_ids, n_neighbors=5):
    """Return the nearest content vectors (cosine distance) for every topic vector.

    Note: a later cell defines another function that is also called
    ``get_matches`` (TF-IDF based); from that cell on, this definition is
    replaced in the kernel namespace.

    Args:
        V_topic: 2-D array of topic vectors, one row per entry of ``topic_ids``.
        V_content: 2-D array of content vectors, one row per entry of
            ``content_ids``.
        topic_ids: Array of topic ids aligned with ``V_topic``.
        content_ids: Array of content ids aligned with ``V_content``.
        n_neighbors: Maximum number of neighbours per topic. It is clamped to
            the number of content rows, because ``kneighbors`` raises when more
            neighbours are requested than samples were fitted.

    Returns:
        DataFrame with columns ``topic_id``, ``content_id`` and ``vec_dist``;
        empty when there are no topics or no content to match.
    """
    n_content = V_content.shape[0]
    if n_content == 0 or V_topic.shape[0] == 0:
        # Nothing to match (e.g. a language without any content rows).
        return pd.DataFrame({"topic_id": np.array([], dtype=object),
                             'content_id': np.array([], dtype=object),
                             'vec_dist': np.array([], dtype=float)})

    k = min(n_neighbors, n_content)

    neighbors_model = NearestNeighbors(n_neighbors=k, metric='cosine', n_jobs=-1)
    neighbors_model.fit(V_content)
    dists, indices = neighbors_model.kneighbors(V_topic)

    # One row per (topic, neighbour): topic ids are repeated k times so they
    # line up with the row-major flattened neighbour arrays.
    res_df = pd.DataFrame({"topic_id": np.repeat(topic_ids, k),
                           'content_id': content_ids[indices.ravel()],
                           'vec_dist': dists.ravel()
                          })
    return res_df


# Topics to predict (those flagged from the sample submission).
topic_sub_df = topic_df[topic_df["sub"]].reset_index(drop=True)

# Both datasets tokenise the "t" column to MAX_LEN tokens.
topic_ds = LECRDataset(topic_sub_df, MAX_LEN)
content_ds = LECRDataset(content_df, MAX_LEN)


# Row i of each matrix corresponds to row i of topic_sub_df / content_df.
V_topic = to_vec(models, topic_ds)
V_content = to_vec(models, content_ds)

  cpuset_checked))


100%|██████████| 1/1 [00:02<00:00,  2.51s/it]
100%|██████████| 2407/2407 [24:09<00:00,  1.66it/s]


In [15]:
# Embedding-based candidates: for each language present in the topics to
# predict, look up the 20 nearest content items among that language's content.
res_dfs = []

for lang in topic_sub_df["language"].unique():
    # Positional row indices, valid for both the frames and the V_* matrices.
    topic_ix = np.where(topic_sub_df["language"] == lang)[0]
    content_ix = np.where(content_df["language"] == lang)[0]
    
    res_dfs.append(get_matches(V_topic[topic_ix], V_content[content_ix], 
                               topic_sub_df["id"].values[topic_ix], content_df["id"].values[content_ix],
                    n_neighbors=20))
res_df = pd.concat(res_dfs)
res_df.shape

(100, 3)

In [16]:
# Topics NOT in the submission set; get_max_train below reads this frame (and
# V_topic_train) as globals.
train_df = topic_df[~topic_df["sub"]].reset_index(drop=True)


# topic_ds is re-bound here: it now wraps the train topics.
topic_ds = LECRDataset(train_df, MAX_LEN)

# Row i corresponds to row i of train_df.
V_topic_train = to_vec(models, topic_ds)


def get_max_train(df, V):
    """Find, for every row of ``df``, the closest train topic of the same language.

    Reads the module-level ``train_df`` and ``V_topic_train`` (rows aligned).

    Args:
        df: DataFrame with a ``language`` column.
        V: Vectors aligned row-by-row with ``df``.

    Returns:
        ``(max_train_scores, matches)``: two float arrays of length ``len(df)``.
        The first holds the cosine distance to the nearest train topic (despite
        the name, lower means closer); the second holds that topic's positional
        row index in ``train_df``.

    NOTE(review): a language that is present in ``df`` but absent from
    ``train_df`` would fit the neighbour model on zero rows — presumably an
    error; confirm whether that can occur.
    """
    n_rows = df.shape[0]
    max_train_scores = np.zeros(n_rows)
    matches = np.zeros(n_rows)

    for lang in df["language"].unique():
        train_rows = np.where(train_df["language"] == lang)[0]
        query_rows = np.where(df["language"] == lang)[0]

        nn_index = NearestNeighbors(n_neighbors=1, metric='cosine', n_jobs=-1)
        nn_index.fit(V_topic_train[train_rows])
        dists, nearest = nn_index.kneighbors(V[query_rows])

        max_train_scores[query_rows] = dists.ravel()
        # Map positions within the language subset back to train_df positions.
        matches[query_rows] = train_rows[nearest.ravel()]

    return max_train_scores, matches


# Cosine distance from each content item to its nearest same-language train
# topic; the matched indices are not needed for content.
content_max_train_scores, _ = get_max_train(content_df, V_content)
    
content_df["max_train_score"] = content_max_train_scores

  cpuset_checked))


100%|██████████| 962/962 [09:40<00:00,  1.66it/s]


In [17]:
# Same lookup for the topics to predict, this time keeping the matched rows.
topic_max_train_scores, topic_matches = get_max_train(topic_sub_df, V_topic)
    
topic_sub_df["max_train_score"] = topic_max_train_scores
# topic_matches holds float positions into train_df; cast before indexing.
topic_sub_df["matched_topic"] = train_df["id"].values[topic_matches.astype(int)]

In [18]:
# Drop the embedding matrices and the models: nothing below this cell uses
# them, and get_max_train can no longer be called once V_topic_train is gone.
del V_content, V_topic, V_topic_train
del models

gc.collect()

474

In [19]:
# "Second degree" candidates: the content correlated with the train topic that
# each topic to predict was matched to (its matched_topic).
second_degree_match_df = (
    topic_sub_df
    .merge(corr_df, left_on="matched_topic", right_on="topic_id")
    [["id", "content_id"]]
    .rename(columns={"id": "topic_id"})
    .assign(second_degree=True)
)
second_degree_match_df.shape

(23, 3)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn


def get_matches(topic_df, content_df):
    """Return TF-IDF (character 4-gram) cosine matches between topics and content.

    Note: this re-uses the name of the embedding-based ``get_matches`` defined
    in an earlier cell and replaces it in the kernel namespace.

    The vocabulary/IDF is fit on the ``t`` text of every topic and content row
    passed in; vectors are then computed from ``title + " " + t`` for the topics
    flagged ``sub`` and for all content rows.

    Args:
        topic_df: Topics with ``id``, ``title``, ``t`` and boolean ``sub`` columns.
        content_df: Content with ``id``, ``title`` and ``t`` columns; its index
            must be positional (0..n-1).

    Returns:
        DataFrame with columns ``topic_id``, ``content_id`` and ``match_score``
        holding, per ``sub`` topic, up to 20 content rows whose similarity
        exceeds the 0.01 lower bound.
    """
    tfidf = TfidfVectorizer(analyzer="char", ngram_range=(4, 4), min_df=2)

    tfidf.fit(pd.concat([content_df["t"], topic_df["t"]]))
    print(len(tfidf.vocabulary_), topic_df.shape[0], content_df.shape[0])

    topic_df = topic_df[topic_df["sub"]].reset_index(drop=True)

    V_topic = tfidf.transform(topic_df["title"] + " " + topic_df["t"])
    V_content = tfidf.transform(content_df["title"] + " " + content_df["t"])

    # Positional args: ntop=20 matches per topic, lower_bound=0.01.
    d = awesome_cossim_topn(V_topic, V_content.T, 20, 0.01, use_threads=True, n_jobs=4)

    # Read rows, columns and scores from the same COO view so they stay aligned.
    # The former element-wise loop paired d.nonzero() (which skips explicitly
    # stored zeros) with d.data[index] (which does not), so a single stored
    # zero would have shifted every following score.
    pairs = d.tocoo()
    keep = pairs.data != 0

    content_ids = content_df["id"].values
    topic_ids = topic_df["id"].values

    res_df = pd.DataFrame({"topic_id": topic_ids[pairs.row[keep]],
                           'content_id': content_ids[pairs.col[keep]],
                           'match_score': pairs.data[keep]
                          })
    return res_df


# TF-IDF candidates, computed per language of the topics to predict.
# res_dfs is re-bound here; the embedding results already live in res_df.
res_dfs = []

for lang in topic_sub_df["language"].unique():
    print(lang)
    content_df_lang = content_df[(content_df["language"] == lang)].reset_index(drop=True)
    # All topics of the language are passed in (get_matches fits on all of them
    # and filters to the "sub" topics itself).
    topic_df_lang = topic_df[(topic_df["language"] == lang)].reset_index(drop=True)
    
    # Skip languages that have no content at all.
    if content_df_lang.shape[0] > 0:
        res_dfs.append(get_matches(topic_df_lang, content_df_lang))
        
    print("----")
    
res_df2 = pd.concat(res_dfs)
res_df2.shape

bg
34619 2420 6050
----
pt
37485 3425 10435
----
en
136622 28053 65939
----


(100, 3)

In [21]:
# Prefix every column so topic and content columns stay distinguishable after
# the merges below. NOTE: this is not idempotent — re-running the cell prefixes
# the columns a second time.
topic_df.columns = [f"topic_{col}" for col in topic_df.columns]
content_df.columns = [f"content_{col}" for col in content_df.columns]

corr_df["target"] = 1

# Key = "title | description": train topics with exactly the same text as a
# topic to predict.
topic_df["key"] = topic_df["topic_title"].fillna("") + " | " + topic_df["topic_description"].fillna("")

# NOTE: this re-binds train_df to a frame with prefixed column names, while
# get_max_train reads train_df["language"] as a global — calling it after this
# cell would no longer find that column.
train_df = topic_df[~topic_df["topic_sub"]]

# For every (key, content) pair: number of train topics with that key that are
# correlated with the content.
lookup = train_df.merge(corr_df, on="topic_id").groupby(["key", "content_id"])["topic_channel"].count().reset_index()
lookup.rename(columns={"topic_channel": "dup_count"}, inplace=True)

    
# Attach those counts to the topics to predict that share the key.
dup_df = topic_df[topic_df["topic_sub"]][["topic_id", "key"]].merge(lookup, on=["key"])[["topic_id", 'content_id', "dup_count"]]
dup_df

Unnamed: 0,topic_id,content_id,dup_count
0,t_00069b63a70a,c_01a0e6e59063,1
1,t_00069b63a70a,c_037b8be79422,1
2,t_00069b63a70a,c_03b5ed305bcc,1
3,t_00069b63a70a,c_04a65a739d47,1
4,t_00069b63a70a,c_05ff8bd1fd30,1
...,...,...,...
149,t_00069b63a70a,c_fbb631d460b8,2
150,t_00069b63a70a,c_fbf85d018b8a,1
151,t_00069b63a70a,c_fda21411f22d,1
152,t_00069b63a70a,c_fe0ef07ada86,1


In [22]:
# Union of all candidate sources, one row per (topic_id, content_id): outer
# merges keep pairs found by only one source, leaving the other sources'
# columns missing for that row.
res_df = res_df.merge(res_df2, how="outer", on=["topic_id", "content_id"])
res_df = res_df.merge(dup_df, how="outer", on=["topic_id", "content_id"])
res_df = res_df.merge(second_degree_match_df, how="outer", on=["topic_id", "content_id"])

res_df.shape

(305, 6)

In [23]:
# Same lookup as the dup_count cell, but keyed on the title only
# (the "key" column is overwritten).
topic_df["key"] = topic_df["topic_title"].fillna("")

train_df = topic_df[~topic_df["topic_sub"]]


# Number of train topics with the same title correlated with each content.
lookup = train_df.merge(corr_df, on="topic_id").groupby(["key", "content_id"])["topic_channel"].count().reset_index()
lookup.rename(columns={"topic_channel": "tdup_count"}, inplace=True)

dup_df = topic_df[topic_df["topic_sub"]].merge(lookup, on=["key"])[["topic_id", 'content_id', "tdup_count"]]

# Adds the tdup_count column and any new (topic, content) pairs.
res_df = res_df.merge(dup_df, how="outer", on=["topic_id", "content_id"])
res_df.shape

(305, 7)

In [24]:
# Same lookup again, keyed on the parent topic id (falling back to the topic's
# own id when the parent is missing): content correlated with train siblings.
topic_df["key"] = topic_df["topic_parent"].fillna(topic_df["topic_id"])

train_df = topic_df[~topic_df["topic_sub"]]


# Number of train topics under the same parent correlated with each content.
lookup = train_df.merge(corr_df, on="topic_id").groupby(["key", "content_id"])["topic_channel"].count().reset_index()
lookup.rename(columns={"topic_channel": "pdup_count"}, inplace=True)

dup_df = topic_df[topic_df["topic_sub"]].merge(lookup, on=["key"])[["topic_id", 'content_id', "pdup_count"]]

# Adds the pdup_count column and any new (topic, content) pairs.
res_df = res_df.merge(dup_df, how="outer", on=["topic_id", "content_id"])
res_df.shape

(510, 8)

In [25]:
# Attach the (prefixed) topic and content columns to every candidate pair.
res_df = res_df.merge(topic_df[topic_df["topic_sub"]], on="topic_id", how="left")
res_df = res_df.merge(content_df, on="content_id", how="left")

res_df.shape

(510, 36)

In [26]:
# Columns passed to the models as pandas categoricals.
res_df["topic_language"] = res_df["topic_language"].astype("category")
res_df["topic_category"] = res_df["topic_category"].astype("category")
res_df["content_kind"] = res_df["content_kind"].astype("category")
res_df["topic_channel"] = res_df["topic_channel"].astype("category")

# Character lengths of the text fields (missing text counts as length 0).
res_df["len_topic_title"] = res_df["topic_title"].fillna("").apply(len)
res_df["len_topic_description"] = res_df["topic_description"].fillna("").apply(len)
res_df["len_content_title"] = res_df["content_title"].fillna("").apply(len)
res_df["len_content_description"] = res_df["content_description"].fillna("").apply(len)
res_df["len_content_text"] = res_df["content_text"].fillna("").apply(len)

In [27]:
# Per-topic range of the embedding distances among that topic's candidates.
res_df["vec_dist_max"] = res_df.groupby("topic_id")["vec_dist"].transform("max")
res_df["vec_dist_min"] = res_df.groupby("topic_id")["vec_dist"].transform("min")

# Pairs not found by a duplicate lookup get a count of 0; the *_mean columns
# average the counts over all candidates of a topic.
res_df["dup_count"] = res_df["dup_count"].fillna(0)
res_df["total_count"] = res_df.groupby("topic_id")["content_id"].transform("count")
res_df["dup_count_mean"] = res_df.groupby("topic_id")["dup_count"].transform("mean")

res_df["tdup_count"] = res_df["tdup_count"].fillna(0)
res_df["tdup_count_mean"] = res_df.groupby("topic_id")["tdup_count"].transform("mean")

res_df["pdup_count"] = res_df["pdup_count"].fillna(0)
res_df["pdup_count_mean"] = res_df.groupby("topic_id")["pdup_count"].transform("mean")

# Missing chapters are NaN on both sides, so they never compare equal.
res_df["same_chapter"] = res_df["topic_chapter"] == res_df["content_chapter"]
# True when both titles share their first space-delimited token.
res_df["starts_same"] = res_df["topic_title"].apply(lambda x: x.split(" ", 1)[0]) == res_df["content_title"].apply(lambda x: x.split(" ", 1)[0])

# The nearest-train-topic distance is only kept for content flagged is_train.
res_df.loc[~res_df["content_is_train"], "content_max_train_score"] = None
# Pairs that did not come from the second-degree lookup are False. Assigned back
# (fillna(..., inplace=True) on a column selection does not update res_df under
# pandas Copy-on-Write) and cast explicitly so the column is a real bool column
# instead of relying on implicit object -> bool downcasting.
res_df["second_degree"] = res_df["second_degree"].fillna(False).astype(bool)
res_df["topic_max_train_score"] = res_df["topic_id"].map(topic_sub_df.set_index("id")["max_train_score"].to_dict())

In [28]:
import lightgbm as lgb

# Number of fold models per LightGBM ensemble.
N_FOLDS = 4

# Accumulator for the blended prediction.
res_df["pred"] = 0    
    
# Feature columns passed to the first ensemble, in this order.
features = ["match_score",
            "vec_dist", "vec_dist_max", "vec_dist_min", "len_content_text",
            "len_topic_title", "len_topic_description", "len_content_title", "len_content_description",
             "dup_count", "total_count", "dup_count_mean",
            "tdup_count", "tdup_count_mean", "pdup_count", "pdup_count_mean",
            "topic_language", "topic_category", "content_kind", "topic_level",
            "same_chapter", "starts_same", "content_is_train", "topic_channel", 
            "content_max_train_score", "topic_max_train_score", "second_degree"
           ]

# Average the fold models; the extra "/ 2" gives this ensemble half of the final
# score (the following cell adds a second ensemble with the same weighting).
for f in range(N_FOLDS):
    lgb_model = lgb.Booster(model_file=f"/kaggle/input/lecr-models-758/lgb_{f}.txt")
    res_df["pred"] += lgb_model.predict(res_df[features]) / N_FOLDS / 2

In [29]:
# Feature columns for the second ensemble: the same list as the first one
# without vec_dist_max / vec_dist_min (the commented-out names are not used).
features = ["match_score", #"match_score_max", "match_score_min",
            "vec_dist", #"vec_dist_max", "vec_dist_min", 
            "len_content_text",
            "len_topic_title", "len_topic_description", "len_content_title", "len_content_description",
             "dup_count", "total_count", "dup_count_mean",
            "tdup_count", "tdup_count_mean", "pdup_count", "pdup_count_mean",
            "topic_language", "topic_category", 
            "content_kind", "topic_level",
            "same_chapter", "starts_same", "content_is_train", "topic_channel",
            "content_max_train_score", "topic_max_train_score", "second_degree"
           ]

# Adds the second half of the blended score on top of the existing "pred".
# NOTE: re-running this cell alone adds the same contribution again.
for f in range(N_FOLDS):
    lgb_model = lgb.Booster(model_file=f"/kaggle/input/lecr-models-752-exp-diverse/lgb_{f}.txt")
    res_df["pred"] += lgb_model.predict(res_df[features]) / N_FOLDS / 2

In [30]:
# Rank of each candidate within its topic by descending score (1 = best; ties
# broken by row order).
res_df["rank"] = res_df.groupby("topic_id")["pred"].rank(method="first", ascending=False)

# How far a candidate's score is below the best score of its topic (0 for the best).
res_df["gap"] = res_df.groupby("topic_id")["pred"].transform("max") - res_df["pred"]

# gap plus how far the score is below the best score this content gets for any
# topic: small only when the pair is near the top for both topic and content.
res_df["content_dist"] = res_df["gap"] + res_df.groupby("content_id")["pred"].transform("max") - res_df["pred"]

# Rank of each topic within a content item by ascending content_dist (1 = best).
res_df["content_rank"] = res_df.groupby("content_id")["content_dist"].rank(method="first")

# Keep a (topic, content) pair when ANY of these holds:
#   1. it is the topic's top-ranked candidate;
#   2. the content is flagged is_train, gap < 0.25 and pred > 0.05;
#   3. the content is not flagged is_train, this topic is the content's best
#      (content_rank == 1) and content_dist < 0.55;
#   4. the content is not flagged is_train and gap < 0.05.
pred_df = res_df[(res_df["rank"] == 1) | 
                 ((res_df["gap"] < 0.25) & res_df["content_is_train"] & (res_df["pred"] > 0.05)) | 
                 (~res_df["content_is_train"] & (res_df["content_rank"] == 1) & (res_df["content_dist"] < 0.55)) | 
                 (~res_df["content_is_train"] & (res_df["gap"] < 0.05))]

# One row per topic with its selected content ids joined by spaces.
pred_df = pred_df.groupby("topic_id")["content_id"].apply(lambda x: " ".join(list(x)))
pred_df = pred_df.reset_index().rename(columns={"content_id": 'content_ids'})

In [31]:
# One row per topic to predict; the left merge keeps topics that have no
# selected content. res_df is re-bound here to the submission frame.
res_df = topic_df[topic_df["topic_sub"]][["topic_id"]].merge(pred_df, on="topic_id", how="left")
# Topics without predictions get an empty string. Assigned back because
# fillna(..., inplace=True) on a column selection does not update res_df under
# pandas Copy-on-Write, which would leave NaN in the submission.
res_df["content_ids"] = res_df["content_ids"].fillna("")
res_df

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_5bc0e1e2cba0 c_76231f9d0b5e c_1108dd0c7a5d c...
1,t_00068291e9a4,c_ebb7fdf10a7e c_639ea2ef9c95 c_89ce9367be10 c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_d7a0d7eaf799 c_b972646631cb c...
4,t_4054df11a74e,c_3695c5dc1df6


In [32]:
# Write the submission file (topic_id, content_ids) without the index column.
res_df.to_csv("submission.csv", index=False)