In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [2]:
# Read the pre-split corpus: an (already upsampled, per the file name) training
# set and a separate test set.
train_data = pd.read_csv("nli_train_upsampled.csv")
test_data = pd.read_csv("nli_test.csv")

# Inputs are the raw text in "sentence"; the target is the "l1" column
# (presumably the writer's origin label, judging by the report rows -- confirm).
X_train, y_train = train_data["sentence"], train_data["l1"]
X_test, y_test = test_data["sentence"], test_data["l1"]

In [3]:
# Feature extractor: TF-IDF over character 3- to 5-grams taken inside word
# boundaries ("char_wb"), capped at 5000 features to keep the model small.
char_tfidf = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    max_features=5000,
)

# Linear SVM classifier; the seed is fixed so repeated runs give the same model.
linear_svm = LinearSVC(random_state=42)

# Chain vectoriser and classifier so that fit/predict accept raw sentences.
pipeline = Pipeline(steps=[("tfidf", char_tfidf), ("clf", linear_svm)])
pipeline.fit(X_train, y_train)

In [4]:
# Score the fitted pipeline on the held-out test sentences.
y_pred = pipeline.predict(X_test)

# zero_division=0 makes the report show 0.0 for labels whose precision/recall
# is undefined (e.g. a label that is never predicted) instead of emitting an
# UndefinedMetricWarning for each metric. The printed numbers are the same as
# with the default "warn" setting; only the warning noise goes away.
print(classification_report(y_test, y_pred, zero_division=0))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                        precision    recall  f1-score   support

               Albania       0.27      0.07      0.11       251
               Andorra       0.01      0.07      0.01        15
             Argentina       0.23      0.02      0.04       296
               Armenia       0.04      0.09      0.05        97
             Astrakhan       0.02      0.17      0.04        52
             Australia       0.34      0.09      0.15       602
               Austria       0.36      0.12      0.18      3672
            Azerbaijan       0.01      0.02      0.01        44
               Bavaria       0.00      0.00      0.00       214
               Belarus       0.14      0.09      0.11       213
               Belgium       0.33      0.14      0.20      3219
Bosnia and Herzegovina       0.30      0.14      0.19      3613
                Brazil       0.07      0.07      0.07       144
              Bulgaria       0.41      0.11      0.17      1449
              Cambodia       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Majority baseline: always predicts the most frequent label in y_train.
# The features are ignored by DummyClassifier, so the raw sentences can be
# passed in directly.
majority_clf = DummyClassifier(strategy="most_frequent", random_state=42)
majority_clf.fit(X_train, y_train)
y_pred_majority = majority_clf.predict(X_test)

# Only one label is ever predicted, so precision is undefined for every other
# class. zero_division=0 reports those as 0.0 without raising
# UndefinedMetricWarning; the printed values match the default behaviour.
print(classification_report(y_test, y_pred_majority, zero_division=0))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                        precision    recall  f1-score   support

               Albania       0.00      0.00      0.00       251
               Andorra       0.00      0.00      0.00        15
             Argentina       0.00      0.00      0.00       296
               Armenia       0.00      0.00      0.00        97
             Astrakhan       0.00      0.00      0.00        52
             Australia       0.00      0.00      0.00       602
               Austria       0.00      0.00      0.00      3672
            Azerbaijan       0.00      0.00      0.00        44
               Bavaria       0.00      0.00      0.00       214
               Belarus       0.00      0.00      0.00       213
               Belgium       0.00      0.00      0.00      3219
Bosnia and Herzegovina       0.00      0.00      0.00      3613
                Brazil       0.00      0.00      0.00       144
              Bulgaria       0.00      0.00      0.00      1449
              Cambodia       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Random baseline: draws each prediction at random following the label
# distribution observed in y_train (strategy="stratified"). The seed is fixed
# so the reported numbers are reproducible.
random_clf = DummyClassifier(strategy="stratified", random_state=42)
random_clf.fit(X_train, y_train)
y_pred_random = random_clf.predict(X_test)

# zero_division=0 keeps the report quiet if a rare label happens never to be
# drawn (undefined precision); such entries are shown as 0.0, which is also
# what the default setting prints, just without the warning.
print(classification_report(y_test, y_pred_random, zero_division=0))

                        precision    recall  f1-score   support

               Albania       0.01      0.01      0.01       251
               Andorra       0.00      0.00      0.00        15
             Argentina       0.00      0.00      0.00       296
               Armenia       0.00      0.01      0.01        97
             Astrakhan       0.00      0.00      0.00        52
             Australia       0.01      0.00      0.01       602
               Austria       0.02      0.02      0.02      3672
            Azerbaijan       0.00      0.00      0.00        44
               Bavaria       0.00      0.00      0.00       214
               Belarus       0.00      0.00      0.00       213
               Belgium       0.02      0.02      0.02      3219
Bosnia and Herzegovina       0.02      0.02      0.02      3613
                Brazil       0.00      0.00      0.00       144
              Bulgaria       0.02      0.02      0.02      1449
              Cambodia       0.00      