In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split

def split_data(df, test_size=0.2, val_size=0.2, random_state=42, output_dir="."):
    """Create stratified train/validation/test splits and save each one as CSV.

    The ``sentence`` column is used as the feature and ``l1`` as the label;
    both splits are stratified on ``l1``. With the default arguments the
    result is a 60/20/20 split written to ``nli_train.csv``, ``nli_val.csv``
    and ``nli_test.csv`` in the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sentence`` and ``l1``.
    test_size : float, default 0.2
        Fraction of *all* rows assigned to the test set.
    val_size : float, default 0.2
        Fraction of *all* rows assigned to the validation set.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` calls.
    output_dir : str or os.PathLike, default "."
        Directory that receives the three CSV files; created if missing.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data, test_data)``, each with the columns
        ``sentence`` and ``l1``.

    Raises
    ------
    KeyError
        If ``sentence`` or ``l1`` is missing from ``df``.
    ValueError
        If the fractions are not in (0, 1) or leave no rows for training.
        NOTE(review): ``train_test_split`` itself may also raise when a class
        is too small to stratify -- see the scikit-learn documentation.
    """
    # Function-scope import so this cell does not depend on another cell.
    from pathlib import Path

    missing = [col for col in ('sentence', 'l1') if col not in df.columns]
    if missing:
        raise KeyError(f"split_data requires column(s) {missing} in df")

    if not (0 < test_size < 1 and 0 < val_size < 1):
        raise ValueError("test_size and val_size must be fractions in (0, 1)")
    if test_size + val_size >= 1:
        raise ValueError("test_size + val_size must be < 1 to leave training data")

    # First split: hold out the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        df['sentence'],
        df['l1'],
        test_size=test_size,
        random_state=random_state,
        stratify=df['l1']
    )

    # Second split: carve the validation set out of the remainder. The
    # fraction is rescaled so that val_size is relative to the full dataset
    # (defaults: 0.2 / 0.8 = 0.25 of the remaining rows).
    val_fraction = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,
        y_train,
        test_size=val_fraction,
        random_state=random_state,
        stratify=y_train
    )

    # Rebuild two-column frames; dropna() removes rows with a missing
    # sentence or label.
    train_data = pd.DataFrame({'sentence': X_train, 'l1': y_train}).dropna()
    val_data = pd.DataFrame({'sentence': X_val, 'l1': y_val}).dropna()
    test_data = pd.DataFrame({'sentence': X_test, 'l1': y_test}).dropna()

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    train_data.to_csv(out_dir / "nli_train.csv", index=False)
    val_data.to_csv(out_dir / "nli_val.csv", index=False)
    test_data.to_csv(out_dir / "nli_test.csv", index=False)

    return train_data, val_data, test_data

In [5]:
# Load the raw corpus and discard every row that contains a missing value.
path = "dataset_raw.csv"
df = pd.read_csv(path).dropna()

# Keep only the `l1` labels that occur at least `min_samples` times.
# NOTE(review): presumably this protects the stratified splits from classes
# that are too small -- confirm the threshold is sufficient.
min_samples = 3
class_counts = df['l1'].value_counts()
valid_classes = class_counts.loc[class_counts.ge(min_samples)].index
df = df.loc[df['l1'].isin(valid_classes)]

# Preview the filtered frame.
df.head()

Unnamed: 0,sentence,l1,l2,source,word_count
0,If you're so concerned with human rights viola...,Kazakhstan,English,https://github.com/masalha-alaa/native-languag...,19
1,"Not really, I mean 75 years have passed",Kazakhstan,English,https://github.com/masalha-alaa/native-languag...,8
2,"On the contrary, I've heard a lot of positive ...",Kazakhstan,English,https://github.com/masalha-alaa/native-languag...,40
3,Isn't the company Voss owned by the Chinese now,Kazakhstan,English,https://github.com/masalha-alaa/native-languag...,9
4,"Lol, you just opened a historical loophole in ...",Kazakhstan,English,https://github.com/masalha-alaa/native-languag...,11


In [6]:
# Split the filtered frame; this also writes nli_train.csv, nli_val.csv and
# nli_test.csv, and the returned (train, val, test) tuple is the cell output.
split_data(df)

(                                                 sentence  \
 266284  In the UK, tree cover was down to about 10% by...   
 604748  S-bahn trains travel many kilometers outside t...   
 784223                    mlajša  hčerka  je  moja mama .   
 531837  That battle and oppresion of Chinese caused Tu...   
 654847             Taky černu taškua polibek od manželky.   
 ...                                                   ...   
 574970  Some of their aims was to replan housing to fo...   
 96474           "Predictions only at the end of the game"   
 762226    Nisam  imel  takvo  situacijo  v restavraciji .   
 193683  We call it “fön” which is short for ”hårfön” (...   
 743410                 Prihaje  iz  mesto [XKrajX],  iz .   
 
                             l1  
 266284                 Croatia  
 604748                   Italy  
 784223                  Poland  
 531837                  Turkey  
 654847                  Russia  
 ...                        ...  
 574970         