
## Imports

In [1]:
import numpy as np
import pandas as pd
import csv
import fasttext
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, matthews_corrcoef

## Loading Data

In [2]:
df_balanced = pd.read_csv("Data/Undersampled_balanced_data.csv")

### Converting to FastText Format

In [3]:
# Creating a copy as data
data = df_balanced.copy()

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Add the __label__ prefix to each label
train_data['category'] = train_data['Poles'].apply(lambda x: f'__label__{x}')
test_data['category'] = test_data['Poles'].apply(lambda x: f'__label__{x}')

# Saving the CSV file as a text file to train/test the classifier
train_data[['category', 'post']].to_csv('train.txt',
                                          index = False,
                                          sep = ' ',
                                          header = None,
                                          quoting = csv.QUOTE_NONE,
                                          quotechar = "",
                                          escapechar = " ")

test_data[['category', 'post']].to_csv('test.txt',
                                     index = False,
                                     sep = ' ',
                                     header = None,
                                     quoting = csv.QUOTE_NONE,
                                     quotechar = "",
                                     escapechar = " ")

In [4]:
# Creating a copy as data
data = df_balanced.copy()

# Add the __label__ prefix to each label
data['category'] = data['Poles'].apply(lambda x: f'__label__{x}')

# Saving the CSV file as a text file to train/test the classifier
data[['category', 'post']].to_csv('data.txt',
                                          index = False,
                                          sep = ' ',
                                          header = None,
                                          quoting = csv.QUOTE_NONE,
                                          quotechar = "",
                                          escapechar = " ")

### Train and Test FastText model

In [12]:
# Training the fastText classifier
model = fasttext.train_supervised('train.txt', wordNgrams = 2)

# Evaluating performance on the entire test file
model.test('test.txt', k=1)

(9283, 0.8700851017989873, 0.8700851017989873)

* 9283 is the number of data points in the test set
* 0.87 is both the precision and the recall

In [13]:
# Evaluate on training data
train_result = model.test("train.txt")
train_result

(21659, 0.8854517752435477, 0.8854517752435477)

## Evaluation Metrics

### 5-fold Cross Validation (check for overfitting)

In [14]:
# Replace with your actual data and parameters
data_path = 'data.txt'
k_folds = 5  # Choose the number of folds

# Load your data into a list
with open(data_path, 'r', encoding='utf-8') as file:
    data_lines = file.readlines()

# Extract labels and texts
labels = [line.split()[0] for line in data_lines]
texts = [' '.join(line.split()[1:]) for line in data_lines]

# Initialize StratifiedKFold
kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Lists to store precision scores for each fold
precision_scores = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(texts, labels):
    # Split the data into training and testing sets
    train_data = [texts[i] for i in train_index]
    train_labels = [labels[i] for i in train_index]
    test_data = [texts[i] for i in test_index]
    test_labels = [labels[i] for i in test_index]

    # Create FastText training and testing files
    with open('train_fold.txt', 'w', encoding='utf-8') as train_file:
        for label, text in zip(train_labels, train_data):
            train_file.write(f'__label__{label} {text}\n')

    with open('test_fold.txt', 'w', encoding='utf-8') as test_file:
        for label, text in zip(test_labels, test_data):
            test_file.write(f'__label__{label} {text}\n')

    # Train the FastText model
    model = fasttext.train_supervised(input='train_fold.txt', wordNgrams=2, epoch=10, lr=0.1)

    # Evaluate on the test set
    result = model.test('test_fold.txt')[1]
    precision_scores.append(result)

# Print precision scores for each fold
for i, precision in enumerate(precision_scores, start=1):
    print(f'Fold {i}: Precision = {precision:.4f}')

# Calculate and print the average precision across all folds
average_precision = np.mean(precision_scores)
print(f'Average Precision: {average_precision:.4f}')

Fold 1: Precision = 0.9249
Fold 2: Precision = 0.9224
Fold 3: Precision = 0.9190
Fold 4: Precision = 0.9192
Fold 5: Precision = 0.9207
Average Precision: 0.9212


### Accuracy, Percision, Recall, F1 score, Matthews Correlation Coefficient

In [16]:
# Load the test data
with open('test.txt', 'r', encoding='utf-8') as f:
    test_lines = f.readlines()

# Extract labels and texts from the test data
test_labels = [line.split()[0] for line in test_lines]
test_texts = [' '.join(line.split()[1:]) for line in test_lines]

# Get predictions from the model
predictions = [model.predict(text)[0][0].replace('__label__', '') for text in test_texts]

# Create label mapping to map strings as integers (0 or 1)
label_mapping = {'__label__Western': 0, '__label__Eastern': 1}
label_mapping_pred = {'Western': 0, 'Eastern': 1}

# Convert labels to integers using the mapping
test_labels = [label_mapping[label] for label in test_labels]
predictions = [label_mapping_pred[pred] for pred in predictions]

# Calculate and print evaluation metrics
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions, average='weighted')
recall = recall_score(test_labels, predictions, average='weighted')
f1 = f1_score(test_labels, predictions, average='weighted')
classification_rep = classification_report(test_labels, predictions)
mcc = matthews_corrcoef(test_labels, predictions)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Matthews Correlation Coefficient: {mcc}')
print('Classification Report:\n', classification_rep)

Accuracy: 0.941075083485942
Precision: 0.9410788998588502
Recall: 0.941075083485942
F1 Score: 0.9410756702185508
Matthews Correlation Coefficient: 0.8821444891819901
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94      4686
           1       0.94      0.94      0.94      4597

    accuracy                           0.94      9283
   macro avg       0.94      0.94      0.94      9283
weighted avg       0.94      0.94      0.94      9283

