In [1]:
# Install sparse_dot_topn 0.3.3 from a wheel file shipped under /kaggle/input.
!pip install /kaggle/input/sparse-dot-topn-033/sparse_dot_topn-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/sparse-dot-topn-033/sparse_dot_topn-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: sparse-dot-topn
Successfully installed sparse-dot-topn-0.3.3
[0m

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn
from tqdm import tqdm

# NOTE(review): MAX_LEN is not referenced anywhere else in the visible notebook.
MAX_LEN = 16


# Raw competition tables; both frames are mutated in place by later cells.
content_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/content.csv")
topic_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/topics.csv")

In [3]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects
# from this file; only keep it if the .npy file comes from a trusted source.
train_contents = np.load("/kaggle/input/lecr-precomputed/train_content_ids.npy", allow_pickle=True)
# Flag every content row whose id appears in the precomputed id list.
content_df["is_train"] = content_df["id"].isin(set(train_contents))
# Share of content rows carrying the flag (shown as the cell output).
content_df["is_train"].mean()

1.0

In [4]:
# Blank out fields that merely repeat another field of the same row.
# Order matters: "text" is compared against "description" before the second
# statement can null that description.
content_df.loc[content_df["text"] == content_df["description"], "text"] = None
content_df.loc[content_df["title"] == content_df["description"], "description"] = None

In [5]:
import lightgbm as lgb

# Number of per-fold LightGBM model files to load (lgb_0.txt ... lgb_{N_FOLDS-1}.txt).
N_FOLDS = 4

# One booster per fold, kept in fold order so that model_list[i] is fold i.
model_list = [
    lgb.Booster(model_file=f"/kaggle/input/lecr-eff-525/lgb_{fold}.txt")
    for fold in range(N_FOLDS)
]


In [6]:
import sys, os
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Topic ids that need a prediction are the ones listed in the sample submission.
sub_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv")

# Assign the filled columns back. Calling fillna(inplace=True) on a column
# selection operates on an intermediate object and does not reliably update the
# frame (pandas Copy-on-Write), so both frames use plain assignment.
topic_df["title"] = topic_df["title"].fillna("")
content_df["title"] = content_df["title"].fillna("")


# Lookups are built before the has_content filter below, so ancestors that were
# filtered out can still be resolved by id.
title_map = topic_df.set_index("id")["title"].to_dict()
parent_map = topic_df.set_index("id")["parent"].to_dict()

# NOTE(review): topics with has_content == False are dropped here, which would
# also drop them from the final submission if any were listed in sub_df — confirm
# that cannot happen.
topic_df = topic_df[topic_df["has_content"]].reset_index(drop=True)
print(topic_df.shape)

# Titles of up to three ancestors; "" whenever the ancestor id is unknown.
topic_df["parent_title"] = topic_df["parent"].apply(lambda x: title_map.get(x, ""))
print(topic_df.shape)

topic_df["grandpa"] = topic_df["parent"].apply(lambda x: parent_map.get(x))
topic_df["grandpa_title"] = topic_df["grandpa"].apply(lambda x: title_map.get(x, ""))
print(topic_df.shape)

topic_df["ggrandpa"] = topic_df["grandpa"].apply(lambda x: parent_map.get(x))
topic_df["ggrandpa_title"] = topic_df["ggrandpa"].apply(lambda x: title_map.get(x, ""))
print(topic_df.shape)

# Topics without a parent use their own id as the parent key.
topic_df["parent"] = topic_df["parent"].fillna(topic_df["id"])

# True for topics that must be predicted, False for the remaining (reference) topics.
topic_df["sub"] = topic_df["id"].isin(sub_df["topic_id"])
topic_df.shape

(61517, 9)
(61517, 10)
(61517, 12)
(61517, 14)


(61517, 15)

In [7]:
def extract_number(x, is_subtopic=False):
    """Split a leading ``"<chapter>:"`` prefix off a title.

    Args:
        x: Title expected to look like ``"<chapter>:<rest>"``.
        is_subtopic: When True, only the part of the prefix before its first
            ``"."`` is used as the chapter (``"3.2: ..."`` -> chapter 3) and the
            returned title is rebuilt with that shortened prefix.

    Returns:
        ``(chapter, title)`` where ``chapter`` is an int and ``title`` is ``x``
        itself, or the rebuilt title when ``is_subtopic`` is True.

    Raises:
        ValueError: If ``x`` contains no ``":"`` or the prefix is not an integer.
    """
    chapter, rest = x.split(":", 1)

    if is_subtopic:
        chapter = chapter.split(".", 1)[0]
        x = f"{chapter}:{rest}"

    return int(chapter), x


def extract_chapters(df, is_subtopic):
    """Add a ``chapter`` column parsed from ``df["title"]`` and normalise titles.

    ``df`` is modified in place and also returned. Missing titles become ``""``.
    Rows whose title has no parseable chapter prefix, or whose chapter is 0,
    get ``NaN`` in ``chapter``.

    Args:
        df: DataFrame with a ``title`` column.
        is_subtopic: Forwarded to ``extract_number``.

    Returns:
        The same DataFrame object, with ``title`` rewritten and a float
        ``chapter`` column added.
    """
    titles = df["title"].fillna("").values

    chapters = np.zeros(len(titles))
    new_titles = np.array(titles)

    for i, title in enumerate(titles):
        try:
            chapters[i], new_titles[i] = extract_number(title, is_subtopic)
        except (ValueError, AttributeError, OverflowError):
            # Not a "<chapter>:<rest>" title (or not a string at all): keep the
            # title as is and leave the chapter at the 0 sentinel. Only the
            # errors a malformed title can cause are ignored, so interrupts and
            # genuine bugs still surface.
            continue

    df["title"] = new_titles
    # 0 means "no chapter". Writing NaN directly yields a float column without
    # first casting to int and then assigning None into an integer column.
    df["chapter"] = np.where(chapters == 0, np.nan, chapters)
    return df
    
# Combined content text: title | kind | first 256 chars of the description |
# first 128 chars of the text. It is built before extract_chapters runs, so it
# uses the titles as they are at this point.
content_parts = [
    content_df["title"].fillna(""),
    content_df["kind"].fillna(""),
    content_df["description"].fillna("").map(lambda s: s[:256]),
    content_df["text"].fillna("").map(lambda s: s[:128]),
]
content_text = content_parts[0]
for part in content_parts[1:]:
    content_text = content_text + " | " + part
content_df["t"] = content_text

# Topics keep the whole prefix as chapter; contents use only the part before the first ".".
topic_df = extract_chapters(topic_df, is_subtopic=False)
content_df = extract_chapters(content_df, is_subtopic=True)

In [8]:
# Combined topic text: title @ parent @ grandparent @ great-grandparent | description.
topic_text = topic_df["title"]
for ancestor_col in ("parent_title", "grandpa_title", "ggrandpa_title"):
    topic_text = topic_text + " @ " + topic_df[ancestor_col]
topic_df["t"] = topic_text + " | " + topic_df["description"].fillna("")

In [9]:
# Known topic -> content links, reshaped from one space-separated id string per
# topic into one row per (topic_id, content_id) pair.
corr_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv")
corr_df = (
    corr_df
    .assign(content_ids=corr_df["content_ids"].map(lambda ids: ids.split()))
    .explode("content_ids")
    .reset_index(drop=True)
    .rename(columns={"content_ids": "content_id"})
)

corr_df.head()

Unnamed: 0,topic_id,content_id
0,t_00004da3a1b2,c_1108dd0c7a5d
1,t_00004da3a1b2,c_376c5a8eb028
2,t_00004da3a1b2,c_5bc0e1e2cba0
3,t_00004da3a1b2,c_76231f9d0b5e
4,t_00068291e9a4,c_639ea2ef9c95


In [10]:
# Topics that need a prediction (sub == True); get_dfs reads their languages.
topic_sub_df = topic_df[topic_df["sub"]].reset_index(drop=True)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn


def _pairs_frame(sim, row_ids, col_ids, row_name, col_name, score_name):
    """Turn a sparse similarity matrix into a long (row id, column id, score) table."""
    coo = sim.tocoo()
    # Rows, columns and scores are all taken from the same COO view, so they
    # stay aligned even if the matrix stores explicit zeros. Those zeros are
    # dropped, matching what ``nonzero()`` reports.
    keep = coo.data != 0
    row_ids = np.asarray(row_ids, dtype=object)
    col_ids = np.asarray(col_ids, dtype=object)
    return pd.DataFrame({
        row_name: row_ids[coo.row[keep]],
        col_name: col_ids[coo.col[keep]],
        score_name: coo.data[keep].astype(float),
    })


def get_matches(topic_df, content_df, title_only, n_candidates=20, min_score=0.01):
    """Retrieve TF-IDF candidates for the topics flagged ``sub``.

    A vectorizer is fitted on content plus topic text: word unigrams over
    ``title`` when ``title_only`` is True, otherwise character 4-grams over
    ``t``. In the latter case the vectors are computed from
    ``title + " " + t``. Two lookups are then run for every ``sub`` topic:

    * the ``n_candidates`` most similar contents scoring above ``min_score``;
    * the single most similar non-``sub`` topic.

    Args:
        topic_df: Topics with ``id``, ``title``, ``t`` and boolean ``sub`` columns.
        content_df: Contents with ``id``, ``title`` and ``t`` columns.
        title_only: Selects the vectorizer and the score column name
            (``match_score2`` when True, ``match_score`` otherwise).
        n_candidates: Maximum number of content candidates kept per topic.
        min_score: Similarity threshold passed to the content lookup.

    Returns:
        ``(res_df, topic_ref_df)``: ``res_df`` has ``topic_id``, ``content_id``
        and the score column; ``topic_ref_df`` has ``id``, ``train_id`` and the
        score column.
    """
    if title_only:
        tfidf = TfidfVectorizer(ngram_range=(1, 1), min_df=2)
        tfidf.fit(pd.concat([content_df["title"], topic_df["title"]]))
    else:
        tfidf = TfidfVectorizer(analyzer="char", ngram_range=(4, 4), min_df=2)
        tfidf.fit(pd.concat([content_df["t"], topic_df["t"]]))
    print(len(tfidf.vocabulary_), topic_df.shape[0], content_df.shape[0])

    train_df = topic_df[~topic_df["sub"]].reset_index(drop=True)
    topic_df = topic_df[topic_df["sub"]].reset_index(drop=True)

    def vectorize(df):
        # Same text layout for topics, contents and reference topics.
        if title_only:
            return tfidf.transform(df["title"])
        return tfidf.transform(df["title"] + " " + df["t"])

    score_col = "match_score2" if title_only else "match_score"

    topic_ids = topic_df["id"].values
    V_topic = vectorize(topic_df)

    sim = awesome_cossim_topn(V_topic, vectorize(content_df).T, n_candidates, min_score,
                              use_threads=True, n_jobs=4)
    res_df = _pairs_frame(sim, topic_ids, content_df["id"].values,
                          "topic_id", "content_id", score_col)

    if train_df.shape[0] > 0:
        sim = awesome_cossim_topn(V_topic, vectorize(train_df).T, 1, 0.0,
                                  use_threads=True, n_jobs=4)
        topic_ref_df = _pairs_frame(sim, topic_ids, train_df["id"].values,
                                    "id", "train_id", score_col)
    else:
        # No reference topics for this slice: return an empty table with the
        # usual columns instead of vectorizing and matching an empty frame.
        topic_ref_df = pd.DataFrame({"id": np.empty(0, dtype=object),
                                     "train_id": np.empty(0, dtype=object),
                                     score_col: np.zeros(0)})

    return res_df, topic_ref_df

def get_dfs(title_only):
    """Run ``get_matches`` per language and stack the results.

    Reads the module-level ``topic_sub_df``, ``topic_df`` and ``content_df``.
    Only languages present in ``topic_sub_df`` are processed, and a language is
    skipped when it has no content rows.

    Args:
        title_only: Forwarded to ``get_matches``.

    Returns:
        ``(res_df, topic_ref_df)`` concatenated over all processed languages,
        each with a fresh 0..n-1 index. If no language was processed, two empty
        frames with the usual columns are returned.
    """
    res_dfs, topic_ref_dfs = [], []

    for lang in topic_sub_df["language"].unique():
        print(lang)
        content_df_lang = content_df[content_df["language"] == lang].reset_index(drop=True)
        topic_df_lang = topic_df[topic_df["language"] == lang].reset_index(drop=True)

        if content_df_lang.shape[0] > 0:
            res_df, topic_ref_df = get_matches(topic_df_lang, content_df_lang, title_only)
            res_dfs.append(res_df)
            topic_ref_dfs.append(topic_ref_df)
        print("----")

    if not res_dfs:
        # pd.concat raises on an empty list; hand back empty tables instead so
        # downstream merges still find the expected columns.
        score_col = "match_score2" if title_only else "match_score"
        return (pd.DataFrame(columns=["topic_id", "content_id", score_col]),
                pd.DataFrame(columns=["id", "train_id", score_col]))

    # ignore_index avoids repeating each per-language 0..n-1 index in the result.
    res_df = pd.concat(res_dfs, ignore_index=True)
    topic_ref_df = pd.concat(topic_ref_dfs, ignore_index=True)

    return res_df, topic_ref_df

# Candidates from the character 4-gram vectorizer over the combined text (score column "match_score").
res_df, topic_ref_df = get_dfs(False)
# Candidates from the title-only word vectorizer (score column "match_score2").
res_df2, topic_ref_df2 = get_dfs(True)

bg
38991 2420 6050
----
pt
39484 3425 10435
----
en
121492 28053 65939
----
bg
2940 2420 6050
----
pt
4172 3425 10435
----
en
12763 28053 65939
----


In [12]:
# Contents linked (in corr_df) to the reference topic most similar to each
# predicted topic under the combined-text vectorizer.
second_degree_match_df = (
    topic_ref_df
    .merge(corr_df, left_on="train_id", right_on="topic_id")
    .loc[:, ["id", "content_id"]]
    .assign(second_degree=True)
    .rename(columns={"id": "topic_id"})
)
second_degree_match_df.shape

(25, 3)

In [13]:
# Same lookup as for second_degree, but through the title-only nearest reference topic.
second_degree_match_df2 = (
    topic_ref_df2
    .merge(corr_df, left_on="train_id", right_on="topic_id")
    .loc[:, ["id", "content_id"]]
    .assign(second_degree2=True)
    .rename(columns={"id": "topic_id"})
)
second_degree_match_df2.shape

(26, 3)

In [14]:
# Prefix every column so topic and content attributes stay distinguishable
# after they are merged into one frame.
# NOTE: re-running this cell prefixes the columns a second time.
topic_df = topic_df.add_prefix("topic_")
content_df = content_df.add_prefix("content_")

corr_df["target"] = 1

# Exact-duplicate key: title plus description.
topic_df["key"] = topic_df["topic_title"].fillna("") + " | " + topic_df["topic_description"].fillna("")

train_df = topic_df[~topic_df["topic_sub"]]

# For every (key, content) pair, count the reference topics linked to that content.
lookup = (
    train_df.merge(corr_df, on="topic_id")
    .groupby(["key", "content_id"])["topic_channel"]
    .count()
    .reset_index()
    .rename(columns={"topic_channel": "dup_count"})
)

# Hand those counts to the predicted topics that share a key.
dup_df = (
    topic_df.loc[topic_df["topic_sub"], ["topic_id", "key"]]
    .merge(lookup, on=["key"])
    .loc[:, ["topic_id", "content_id", "dup_count"]]
)
dup_df

Unnamed: 0,topic_id,content_id,dup_count
0,t_00069b63a70a,c_01a0e6e59063,1
1,t_00069b63a70a,c_037b8be79422,1
2,t_00069b63a70a,c_03b5ed305bcc,1
3,t_00069b63a70a,c_04a65a739d47,1
4,t_00069b63a70a,c_05ff8bd1fd30,1
...,...,...,...
149,t_00069b63a70a,c_fbb631d460b8,2
150,t_00069b63a70a,c_fbf85d018b8a,1
151,t_00069b63a70a,c_fda21411f22d,1
152,t_00069b63a70a,c_fe0ef07ada86,1


In [15]:
# Union of all candidate sources: a (topic, content) pair found by any source
# keeps a row, with NaN in the columns of the sources that did not find it.
pair_keys = ["topic_id", "content_id"]
for candidate_df in (res_df2, dup_df, second_degree_match_df, second_degree_match_df2):
    res_df = res_df.merge(candidate_df, how="outer", on=pair_keys)

res_df.shape

(329, 7)

In [16]:
# Title-only duplicate key.
topic_df["key"] = topic_df["topic_title"].fillna("")

train_df = topic_df[~topic_df["topic_sub"]]

# For every (title, content) pair, count the reference topics linked to that content.
lookup = (
    train_df.merge(corr_df, on="topic_id")
    .groupby(["key", "content_id"])["topic_channel"]
    .count()
    .reset_index()
    .rename(columns={"topic_channel": "tdup_count"})
)

dup_df = (
    topic_df[topic_df["topic_sub"]]
    .merge(lookup, on=["key"])
    .loc[:, ["topic_id", "content_id", "tdup_count"]]
)

res_df = res_df.merge(dup_df, how="outer", on=["topic_id", "content_id"])
res_df.shape

(329, 8)

In [17]:
# Sibling key: topics sharing a parent (a topic without a parent uses its own id).
topic_df["key"] = topic_df["topic_parent"].fillna(topic_df["topic_id"])

train_df = topic_df[~topic_df["topic_sub"]]

# For every (parent, content) pair, count the reference topics linked to that content.
lookup = (
    train_df.merge(corr_df, on="topic_id")
    .groupby(["key", "content_id"])["topic_channel"]
    .count()
    .reset_index()
    .rename(columns={"topic_channel": "pdup_count"})
)

dup_df = (
    topic_df[topic_df["topic_sub"]]
    .merge(lookup, on=["key"])
    .loc[:, ["topic_id", "content_id", "pdup_count"]]
)

res_df = res_df.merge(dup_df, how="outer", on=["topic_id", "content_id"])
res_df.shape

(531, 9)

In [18]:
# Attach topic attributes, then content attributes, to every candidate pair.
res_df = (
    res_df
    .merge(topic_df[topic_df["topic_sub"]], on="topic_id", how="left")
    .merge(content_df, on="content_id", how="left")
)

res_df.shape

(531, 36)

In [19]:
# Columns converted to pandas "category" dtype.
for categorical_col in ("topic_language", "topic_category", "content_kind", "topic_channel"):
    res_df[categorical_col] = res_df[categorical_col].astype("category")

# Character lengths of the text fields; a missing value counts as length 0.
for text_col in ("topic_title", "topic_description",
                 "content_title", "content_description", "content_text"):
    res_df[f"len_{text_col}"] = res_df[text_col].fillna("").map(len)

In [20]:
def _topic_stat(column, how):
    """Aggregate `column` per topic_id and broadcast the result to every row of res_df."""
    return res_df.groupby("topic_id")[column].transform(how)


def _first_word(title):
    """Return the part of `title` before its first space."""
    return title.split(" ", 1)[0]


# Best and worst similarity score among each topic's candidates.
for score_col in ("match_score", "match_score2"):
    res_df[f"{score_col}_max"] = _topic_stat(score_col, "max")
    res_df[f"{score_col}_min"] = _topic_stat(score_col, "min")

# Pairs not found through a duplicate lookup count as 0.
res_df["dup_count"] = res_df["dup_count"].fillna(0)
res_df["total_count"] = _topic_stat("content_id", "count")
res_df["dup_count_mean"] = _topic_stat("dup_count", "mean")

for count_col in ("tdup_count", "pdup_count"):
    res_df[count_col] = res_df[count_col].fillna(0)
    res_df[f"{count_col}_mean"] = _topic_stat(count_col, "mean")

res_df["same_chapter"] = res_df["topic_chapter"] == res_df["content_chapter"]
res_df["starts_same"] = res_df["topic_title"].map(_first_word) == res_df["content_title"].map(_first_word)

In [21]:
# Pairs that did not come from a second-degree lookup have NaN after the outer
# merges. The filled column is assigned back (an in-place fillna on a column
# selection does not reliably update the frame) and cast to bool explicitly,
# rather than relying on fillna to downcast the object column.
res_df["second_degree"] = res_df["second_degree"].fillna(False).astype(bool)
res_df["second_degree2"] = res_df["second_degree2"].fillna(False).astype(bool)

# Similarity of each topic to its nearest reference topic (combined-text vectorizer).
res_df["topic_max_train_score"] = res_df["topic_id"].map(topic_ref_df.set_index("id")["match_score"].to_dict())

In [22]:
# Model inputs, in the order the columns are passed to the boosters.
features = [
    "match_score", "match_score_max", "match_score_min",
    "match_score2", "match_score2_max", "match_score2_min",
    "len_content_text",
    "len_topic_title", "len_topic_description", "len_content_title", "len_content_description",
    "dup_count", "total_count", "dup_count_mean",
    "tdup_count", "tdup_count_mean", "pdup_count", "pdup_count_mean",
    "topic_language", "topic_category", "content_kind", "topic_level",
    "same_chapter", "starts_same", "content_is_train", "topic_channel",
    # "content_max_train_score" is deliberately left out (disabled in the original list).
    "topic_max_train_score", "second_degree", "second_degree2",
]

# Average of the fold models: each contributes prediction / N_FOLDS, summed in fold order.
feature_frame = res_df[features]
res_df["pred"] = sum(
    model_list[fold].predict(feature_frame) / N_FOLDS
    for fold in range(N_FOLDS)
)

In [23]:
# Position of each candidate within its topic (1 = highest pred; ties broken by row order).
res_df["rank"] = res_df.groupby("topic_id")["pred"].rank(method="first", ascending=False)

# Distance to the best candidate of the same topic.
topic_best = res_df.groupby("topic_id")["pred"].transform("max")
res_df["gap"] = topic_best - res_df["pred"]

# gap plus the distance to the best score the same content reaches for any topic.
content_best = res_df.groupby("content_id")["pred"].transform("max")
res_df["content_dist"] = res_df["gap"] + content_best - res_df["pred"]

res_df["content_rank"] = res_df.groupby("content_id")["content_dist"].rank(method="first")

# A candidate is kept when any one of the four rules below holds.
is_train_content = res_df["content_is_train"]
is_topic_top = res_df["rank"] == 1
is_close_train = (res_df["gap"] < 0.25) & is_train_content & (res_df["pred"] > 0.05)
is_best_home_for_new = ~is_train_content & (res_df["content_rank"] == 1) & (res_df["content_dist"] < 0.65)
is_near_tie_new = ~is_train_content & (res_df["gap"] < 0.05)

pred_df = res_df[is_topic_top | is_close_train | is_best_home_for_new | is_near_tie_new]

# One row per topic, with the kept content ids joined by single spaces.
pred_df = (
    pred_df.groupby("topic_id")["content_id"]
    .apply(lambda ids: " ".join(ids))
    .reset_index()
    .rename(columns={"content_id": "content_ids"})
)

In [24]:
# One row per topic to predict; topics without any kept candidate get "".
res_df = topic_df[topic_df["topic_sub"]][["topic_id"]].merge(pred_df, on="topic_id", how="left")
# Assigned back rather than filled in place: an in-place fillna on a column
# selection does not reliably update the frame.
res_df["content_ids"] = res_df["content_ids"].fillna("")
res_df

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_2b6301d9ada5 c_e1e8557d7c61 c_0b4a3ea959ba c...
1,t_00068291e9a4,c_e88be716634d c_ebb7fdf10a7e c_639ea2ef9c95 c...
2,t_00069b63a70a,c_428ea1835953
3,t_0006d41a73a8,c_9c8d565139fb c_1c57a1316568 c_0efcce41c54e c...
4,t_4054df11a74e,c_11a1dc0bfb99 c_841ceaeb5125 c_ba9ecebede11 c...


In [25]:
# Write the topic_id / content_ids table to the working directory, without the index column.
res_df.to_csv("submission.csv", index=False)