
## Imports

In [1]:
import csv

import fasttext
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

## Loading Data

In [3]:
# Strings that stand in for missing content; read_csv turns them into NaN
MISSING_MARKERS = ['[deleted]', '[removed]']

df = pd.read_csv("tokenized_non-eng.csv", na_values=MISSING_MARKERS)
# Drop every row that has a missing value in any column
df = df.dropna()

### Converting to FastText Format

In [7]:
# Show the loaded frame (pandas elides the middle rows) to check its columns before export
df

Unnamed: 0.1,Unnamed: 0,auhtor_ID,post,nationality,Poles,post_tokens
0,0,t2_10uons,"isn't, show. legally binding. depends also ran...",Finland,Western,"['is', ""n't"", ',', 'show', '.', 'legally', 'bi..."
1,1,t2_10uons,game game played. game played calculated compa...,Finland,Western,"['game', 'game', 'played', '.', 'game', 'playe..."
2,2,t2_10uons,logic usa shitty place live (with ~1 shooting ...,Finland,Western,"['logic', 'usa', 'shitty', 'place', 'live', '(..."
3,3,t2_10uons,"referring add lot ea, activision etc.. re-rele...",Finland,Western,"['referring', 'add', 'lot', 'ea', ',', 'activi..."
4,4,t2_10uons,get another role support lel people thinking 2...,Finland,Western,"['get', 'another', 'role', 'support', 'lel', '..."
...,...,...,...,...,...,...
76400,82611,t2_57ogt82f,sergeant york. wwi lend it'self films. fought ...,United Kingdom,Western,"['sergeant', 'york', '.', 'wwi', 'lend', ""it's..."
76401,82612,t2_57ogt82f,position person get short term fame committing...,United Kingdom,Western,"['position', 'person', 'get', 'short', 'term',..."
76402,82613,t2_57ogt82f,"read label. high red meat, yet others low. qua...",United Kingdom,Western,"['read', 'label', '.', 'high', 'red', 'meat', ..."
76403,82614,t2_57ogt82f,traitors. suggesting treason. treason death pe...,United Kingdom,Western,"['traitors', '.', 'suggesting', 'treason', '.'..."


In [8]:
# Work on a copy so the loaded frame `df` is left as it was
data = df.copy()

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
TEST_FRACTION = 0.2
SPLIT_SEED = 42
train_data, test_data = train_test_split(
    data,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Function to flatten the list of tokens into a single string
def flatten_tokens(tokens):
    """Join a sequence of tokens into one space-separated string.

    Token columns read back from a CSV file hold the *text form* of a list
    (e.g. "['game', 'played', '.']"), and joining a string directly would
    space-separate its characters. Such strings are therefore parsed back
    into a list first.

    Parameters
    ----------
    tokens : list, tuple or str
        A sequence of tokens, or the string representation of one.

    Returns
    -------
    str
        The tokens separated by single spaces. A string that is not a
        list/tuple literal is treated as already-flat text and returned
        unchanged.
    """
    import ast  # local import keeps the helper usable on its own

    if isinstance(tokens, str):
        try:
            parsed = ast.literal_eval(tokens)
        except (ValueError, SyntaxError):
            # Not a Python literal at all: plain text
            return tokens
        if not isinstance(parsed, (list, tuple)):
            return tokens
        tokens = parsed

    # str() lets non-string tokens (e.g. numbers) be joined as well
    return ' '.join(str(token) for token in tokens)

# Add the __label__ prefix to each label.
# .assign returns new frames, so nothing is written into a slice of `data`
# (avoids pandas' SettingWithCopyWarning).
train_data = train_data.assign(category='__label__' + train_data['Poles'].astype(str))
test_data = test_data.assign(category='__label__' + test_data['Poles'].astype(str))


def write_fasttext_file(frame, path, label_col='category', text_col='post'):
    """Write `frame` in fastText's supervised format: one `<label> <text>` line per row.

    Whitespace inside the text (including any embedded newlines or tabs) is
    collapsed to single spaces, so every example is guaranteed to occupy
    exactly one line and to start with its label.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to export.
    path : str
        Destination text file; overwritten if it exists.
    label_col : str
        Column holding the already-prefixed label.
    text_col : str
        Column holding the text of the example.
    """
    with open(path, 'w', encoding='utf-8') as out:
        for label, text in zip(frame[label_col], frame[text_col]):
            one_line_text = ' '.join(str(text).split())
            out.write(f'{label} {one_line_text}\n')


# Saving the data as text files to train/test the classifier
write_fasttext_file(train_data, 'train.txt')
write_fasttext_file(test_data, 'test.txt')

### Train and Test

In [19]:
# Files in fastText supervised format
TRAIN_FILE = 'train.txt'
TEST_FILE = 'test.txt'

# Fit the supervised fastText classifier, using word n-grams up to length 2
model = fasttext.train_supervised(input=TRAIN_FILE, wordNgrams=2)

# Score the whole held-out file, counting only the top-1 prediction per example
model.test(TEST_FILE, k=1)

(15281, 0.9464040311497939, 0.9464040311497939)

* 15281 is the number of data points in the test set
* 0.946404 is both the precision and the recall (at k=1)

#### Meaning of k parameter

(Taken from Chat)
Now, when you set k=1, it means you are only considering the top-1 prediction for each example. In your case, the precision and recall at 1 are both approximately 0.9464.

When you set k=2, it means you are considering the top-2 predictions for each example. This changes the evaluation metric because now the model is allowed to provide two predictions per example, and correctness is assessed based on whether the correct category is among the top-2 predictions.

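So it does not make sense to set k to anything above 1 for binary predictions: with only two categories, the top-2 predictions always contain both labels, so the correct category is always among them and the score no longer tells us anything about the model.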

In [22]:
# Load the test data; each line is "<label> <text>"
with open('test.txt', 'r', encoding='utf-8') as f:
    test_lines = f.readlines()

# Split every line once; blank lines carry no example and would otherwise
# raise IndexError when the label is taken
test_rows = [line.split() for line in test_lines if line.strip()]
true_label_strings = [row[0] for row in test_rows]
test_texts = [' '.join(row[1:]) for row in test_rows]

# Get predictions from the model in a single batched call: given a list of
# texts, predict returns one list of labels per text (top-1 by default)
predicted_label_lists, _ = model.predict(test_texts)

# Create label mapping to map strings as integers (0 or 1).
# fastText returns labels with the '__label__' prefix, so the prefixed mapping
# serves both the true and the predicted labels; it is derived from the bare
# mapping so the two can never disagree.
label_mapping_pred = {'Western': 0, 'Eastern': 1}
label_mapping = {f'__label__{name}': code for name, code in label_mapping_pred.items()}

# Convert labels to integers using the mapping
test_labels = [label_mapping[label] for label in true_label_strings]
predictions = [label_mapping[labels[0]] for labels in predicted_label_lists]

# Calculate and print evaluation metrics.
# 'weighted' averages the per-class scores by class support.
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions, average='weighted')
recall = recall_score(test_labels, predictions, average='weighted')
f1 = f1_score(test_labels, predictions, average='weighted')
classification_rep = classification_report(test_labels, predictions)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print('Classification Report:\n', classification_rep)

Accuracy: 0.9464040311497939
Precision: 0.9469357542616063
Recall: 0.9464040311497939
F1 Score: 0.9441547757598423
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     12174
           1       0.96      0.77      0.85      3107

    accuracy                           0.95     15281
   macro avg       0.95      0.88      0.91     15281
weighted avg       0.95      0.95      0.94     15281

