# The Quadrilingual Land of Lonpestia

In [1]:
import pandas as pd
import numpy as np
from itertools import permutations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv("test_data.csv")
df

Unnamed: 0,datapointID,textA,textB
0,120,o utse airs eaien eii llin cri.,
1,1168,e str er's ga ii hie ied al wi e aad en swih e...,
2,796,"den diin eipri skis, eie ren ers ie taend ie c...",
3,114,he e kekcleazologuclea nem e hejen eleajecleaz...,
4,989,licrhcrevbe coheleaenlicr gleaoup crhiileaed i...,
...,...,...,...
1582,782,e eldi taams i event r uns ae diii ers (i ee i...,
1583,416,elegucleagucleauk eclea felegucleagucleauk cle...,
1584,760,"hev er, iis eded ieet urn, wi e t20 iraif ee, ...",
1585,656,hbele ciicrlicrlecr eleae ucred licro hold ii ...,


## Subtask 1

In [3]:
df1 = df[df['textB'].notnull()]

In [4]:
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))

In [5]:
subtask1_rows = []

for idx, row in df1.iterrows():
    vectors = vectorizer.fit_transform([row['textA'], row['textB']])
    sim = cosine_similarity(vectors[0], vectors[1])[0, 0]
    
    same_lang = sim > 0.4
    subtask1_rows.append([1, row['datapointID'], str(same_lang)])

## Subtask 2

In [6]:
df2 = df[df["textB"].isnull()]

In [7]:
langs = ["Englcrevbeh", "Hungeleabeen", "En Gli", "Hure"]

In [8]:
texts = df2["textA"].tolist()
vectors = vectorizer.fit_transform(texts)

In [9]:
pca = PCA(3)
vectors = pca.fit_transform(vectors)

In [10]:
kmeans = KMeans(n_clusters=len(langs), random_state=42)
clusters = kmeans.fit_predict(vectors)

In [11]:
lang_vectors = vectorizer.transform(langs)
lang_vectors = pca.transform(lang_vectors)

In [12]:
best_perm = None
best_score = -np.inf

for perm in permutations(range(4)):
    l_perm = lang_vectors[list(perm)]
    sim_matrix = cosine_similarity(kmeans.cluster_centers_, l_perm)
    score = np.trace(sim_matrix)  # sum of diagonal elements (pairwise sim)
    
    if score > best_score:
        best_score = score
        best_perm = perm
best_perm, best_score

((1, 0, 2, 3), np.float64(1.4518432190427306))

In [13]:
subtask2_rows = []

for did, cidx in zip(df2['datapointID'], clusters):
    lang = langs[best_perm[cidx]]
    subtask2_rows.append([2, did, lang])

## Save answers

In [14]:
submission_rows = subtask1_rows + subtask2_rows
df_submission = pd.DataFrame(submission_rows, columns=["subtaskID", "datapointID", "answer"])
df_submission.to_csv("submission.csv", index=False)

## Submission results

Subtask 1:
- F1 Score: 0.85475
- Score: 30/30

Subtask 2:
- Accuracy: 0.978172
- Score: 70/70