In [1]:
import json
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, classification_report

In [3]:
def _read_json(path):
    """Open `path` as UTF-8 text and return the object decoded by json.load."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Each split is read and immediately wrapped in a DataFrame; the decoded
# objects stay bound to their own names next to the frames built from them.
train_data = _read_json('data/train.json')
train_df = pd.DataFrame(train_data)

valid_data = _read_json('data/valid.json')
valid_df = pd.DataFrame(valid_data)

valid_new_data = _read_json('data/valid_new.json')
valid_new_df = pd.DataFrame(valid_new_data)

In [4]:
# Count the distinct values in the training frame's 'langid' column
# (missing values are excluded, which is also pandas' default for nunique).
unique_lang_ids = train_df['langid'].nunique(dropna=True)
print("Number of unique lang IDs:", unique_lang_ids)

Number of unique lang IDs: 13


In [5]:
def compute_micro_f1_score(preds, golds):
    """Return the micro-averaged F1 over all items, ignoring correct 'en' hits.

    Counting rules (unchanged from the original implementation):
      * a correct prediction whose gold label is not 'en' is a true positive;
      * a correct 'en' prediction is not counted at all;
      * every incorrect prediction counts as one false positive AND one
        false negative, whatever the labels involved.

    Args:
        preds: sequence of predicted labels.
        golds: sequence of gold labels, same length as `preds`.

    Returns:
        The F1 score as a float. Returns 0.0 when there are no true positives
        (including empty input), where the original raised ZeroDivisionError.

    Raises:
        AssertionError: if `preds` and `golds` differ in length.
    """
    assert len(preds) == len(golds)

    TP, FP, FN = 0, 0, 0
    for pp, gg in zip(preds, golds):
        if pp != gg:
            # Any mismatch is charged to both error counters.
            FP += 1
            FN += 1
        elif gg != 'en':
            TP += 1

    # With TP == 0 precision and/or F1 would be 0/0; define the score as 0.
    if TP == 0:
        return 0.0

    prec = TP / (TP + FP)
    rec = TP / (TP + FN)
    return (2 * prec * rec) / (prec + rec)


def compute_macro_f1_score(preds, golds):
    """Return the unweighted mean of per-language F1 scores, excluding 'en'.

    The languages scored are the distinct labels found in `golds`, minus 'en'.
    For each such language: a true positive is a correct prediction of it, a
    false negative is a gold item of that language predicted as something
    else, and a false positive is a wrong prediction of that language.
    Predicted labels that never occur in `golds` are not scored.

    Args:
        preds: sequence of predicted labels.
        golds: sequence of gold labels, paired positionally with `preds`.

    Returns:
        The macro F1 as a float. A language with no true positives contributes
        0.0, and 0.0 is returned when no non-'en' language is present; the
        original raised ZeroDivisionError in both situations and ValueError
        when 'en' was missing from `golds`.
    """
    langs = set(golds)
    langs.discard('en')  # discard() tolerates gold data without any 'en'
    if not langs:
        return 0.0

    tp = dict.fromkeys(langs, 0)
    fp = dict.fromkeys(langs, 0)
    fn = dict.fromkeys(langs, 0)

    # One pass over the pairs instead of one pass per language.
    for pp, gg in zip(preds, golds):
        if pp == gg:
            if gg in tp:
                tp[gg] += 1
        else:
            if gg in fn:
                fn[gg] += 1
            if pp in fp:
                fp[pp] += 1

    total = 0.0
    for ln in langs:
        if tp[ln] == 0:
            # Precision/recall/F1 would be 0/0 here; score the language as 0.
            continue
        prec = tp[ln] / (tp[ln] + fp[ln])
        rec = tp[ln] / (tp[ln] + fn[ln])
        total += (2 * prec * rec) / (prec + rec)

    return total / len(langs)

In [6]:
# Separate the text (features) and langid (labels) columns of each split.
X_train = train_df['text']
y_train = train_df['langid']

X_valid = valid_df['text']
y_valid = valid_df['langid']

X_valid_new = valid_new_df['text']
y_valid_new = valid_new_df['langid']

# Append the validation split to the training data.
# NOTE(review): X_valid / y_valid are still used for evaluation further down,
# so scores reported on them are in-sample; valid_new is the only held-out set.
X_train = pd.concat([X_train, X_valid], ignore_index=True)
y_train = pd.concat([y_train, y_valid], ignore_index=True)


In [7]:
# Alternative vectorizers (disabled):
# vectorizer = CountVectorizer()
# vectorizer = TfidfVectorizer()

# TF-IDF over word unigrams and bigrams. The custom token pattern also keeps
# single-character tokens, and sublinear_tf swaps raw term frequency for
# 1 + log(tf).
vectorizer = TfidfVectorizer(
    token_pattern=r'(?u)\b\w+\b',
    sublinear_tf=True,
    ngram_range=(1, 2),
)

# Fit the vocabulary / IDF weights on X_train, then map both evaluation sets
# into the same feature space.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_valid)
X_test_new_vec = vectorizer.transform(X_valid_new)

In [8]:
# Number of columns produced by the fitted vectorizer (one per feature name).
num_features = vectorizer.get_feature_names_out().shape[0]
print("Number of features:", num_features)

Number of features: 4505047


In [9]:
# # Create and train the Naive Bayes classifier
# class_prior_equal = [1 / 13] * 13 

# macro_f1_scores = [0.93, 0.86, 0.88, 0.81, 0.88, 0.94, 0.83, 0.84, 0.70, 0.91, 0.84, 0.90, 0.84]
# inverse_macro_f1_scores = [1 / score for score in macro_f1_scores]
# class_priors_weighted = [weight / sum(inverse_macro_f1_scores) for weight in inverse_macro_f1_scores]

# Multinomial Naive Bayes with a small additive smoothing term (alpha),
# fitted on the vectorized training data.
naive_bayes = MultinomialNB(alpha=1e-3)
naive_bayes.fit(X=X_train_vec, y=y_train)

In [10]:
# Predict labels for the original validation split.
# NOTE(review): this split was concatenated into the training data before
# fitting, so the numbers below are in-sample rather than held-out.
predictions = naive_bayes.predict(X_test_vec)

# Overall accuracy against the gold labels.
accuracy = accuracy_score(y_valid, predictions)
print("Accuracy:", accuracy)

# Per-language precision / recall / F1 table.
print(
    "\nClassification Report:",
    classification_report(y_valid, predictions),
    sep="\n",
)

Accuracy: 0.99758

Classification Report:
              precision    recall  f1-score   support

          bn       1.00      1.00      1.00       451
          de       0.99      0.99      0.99     10120
          en       1.00      1.00      1.00     39475
          es       1.00      1.00      1.00      9814
          fr       1.00      1.00      1.00     10297
          hi       1.00      1.00      1.00       446
          it       1.00      1.00      1.00      9316
          kn       1.00      1.00      1.00       473
          ml       1.00      1.00      1.00       445
          mr       1.00      1.00      1.00       425
          pt       1.00      1.00      1.00      9194
          sv       1.00      1.00      1.00      9099
          ta       1.00      1.00      1.00       445

    accuracy                           1.00    100000
   macro avg       1.00      1.00      1.00    100000
weighted avg       1.00      1.00      1.00    100000



In [11]:
# Predict labels for the valid_new split.
predictions_new = naive_bayes.predict(X_test_new_vec)

# Overall accuracy against the gold labels.
accuracy = accuracy_score(y_valid_new, predictions_new)
print("Accuracy:", accuracy)

# Per-language precision / recall / F1 table.
print(
    "\nClassification Report:",
    classification_report(y_valid_new, predictions_new),
    sep="\n",
)

Accuracy: 0.901471790678659

Classification Report:
              precision    recall  f1-score   support

          bn       1.00      0.86      0.93        74
          de       0.94      0.84      0.89       208
          en       0.88      0.95      0.91       812
          es       0.86      0.89      0.87       202
          fr       0.91      0.94      0.93       212
          hi       0.93      0.94      0.94        71
          it       0.88      0.88      0.88       192
          kn       0.91      0.82      0.86        60
          ml       0.93      0.62      0.74        68
          mr       0.95      0.88      0.91       102
          pt       0.89      0.94      0.91       189
          sv       0.96      0.90      0.93       188
          ta       0.93      0.79      0.86        68

    accuracy                           0.90      2446
   macro avg       0.92      0.87      0.89      2446
weighted avg       0.90      0.90      0.90      2446



In [12]:
# Both helpers are declared as (preds, golds); pass by keyword so the model
# output and the gold labels cannot end up in each other's role.
print("macro f1 score is ", compute_macro_f1_score(preds=predictions_new, golds=y_valid_new))
print("micro f1 score is ", compute_micro_f1_score(preds=predictions_new, golds=y_valid_new))

macro f1 score is  0.8868891872625873
micro f1 score is  0.8562052505966588


In [13]:
# Both helpers are declared as (preds, golds); pass by keyword so the model
# output and the gold labels cannot end up in each other's role.
# NOTE(review): the valid split was part of the training data, so these are
# in-sample scores.
print("macro f1 score is ", compute_macro_f1_score(preds=predictions, golds=y_valid))
print("micro f1 score is ", compute_micro_f1_score(preds=predictions, golds=y_valid))

macro f1 score is  0.9986579905427471
micro f1 score is  0.996009498054218


In [14]:
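# NOTE(review): presumably scores from an earlier run; the configuration that
# produced them is not recorded here.
# macro f1 score is  0.856344669531483
# micro f1 score is  0.8056537102473497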

In [15]:
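# NOTE(review): presumably scores from an earlier run; the configuration that
# produced them is not recorded here.
# macro f1 score is  0.8753133788012702
# micro f1 score is  0.8431723315444246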