In [None]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import os
import joblib
import numpy as np
from data_extraction import get_raw_dataset

In [None]:
### Training and Evaluation on Dev Set

# Load the raw train/dev splits. get_raw_dataset is a project helper; for these
# modes it is unpacked as a (texts, labels) pair.
X_train, y_train = get_raw_dataset(mode='train')
X_dev, y_dev = get_raw_dataset(mode='dev')

# Feature extraction: convert texts to TF-IDF vectors.
# The vocabulary/IDF weights are fitted on the training texts only and then
# reused for the dev texts, so no dev information leaks into the features.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_dev_tfidf = vectorizer.transform(X_dev)

# Persist the vectorized train and dev sets as .npy files.
# exist_ok=True makes the directory creation idempotent and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs('data', exist_ok=True)
# NOTE(review): .toarray() converts the TF-IDF matrices to dense arrays before
# saving; with up to 10000 features per document this may be memory-hungry on
# large corpora. Kept as-is so the saved file format does not change.
# Labels are presumably pandas objects (they expose .to_numpy()) - TODO confirm.
np.save('data/X_train_vectorized.npy', X_train_tfidf.toarray())
np.save('data/y_train.npy', y_train.to_numpy())
np.save('data/X_dev_vectorized.npy', X_dev_tfidf.toarray())
np.save('data/y_dev.npy', y_dev.to_numpy())

# Train a Logistic Regression classifier on the TF-IDF features.
# max_iter is raised to 1000 to give the solver more iterations to converge.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

In [None]:
# Score the fitted classifier on the held-out dev split.
y_dev_pred = clf.predict(X_dev_tfidf)

# Accuracy plus macro- and micro-averaged F1 over the dev predictions.
acc = accuracy_score(y_dev, y_dev_pred)
macro_f1, micro_f1 = (
    f1_score(y_dev, y_dev_pred, average=averaging)
    for averaging in ('macro', 'micro')
)

# Emit the report in a single call, one metric per line, 4 decimal places.
print(
    "Evaluation on Dev Set:",
    f"Accuracy: {acc:.4f}",
    f"Macro F1: {macro_f1:.4f}",
    f"Micro F1: {micro_f1:.4f}",
    sep="\n",
)

Evaluation on Dev Set:
Accuracy: 0.5754
Macro F1: 0.5316
Micro F1: 0.5754


In [None]:
### Prediction on Test Data

# Load test data (this file does not include labels); for mode='test' the
# helper's result is unpacked as a (texts, ids) pair.
X_test, ids_test = get_raw_dataset(mode='test')

# Transform test texts using the same TF-IDF vectorizer (already fitted on the
# training texts) and predict a label for each test example.
X_test_tfidf = vectorizer.transform(X_test)
y_test_pred = clf.predict(X_test_tfidf)

# Save the vectorized test set and its ids as .npy files.
# Create the directory here too so this cell does not depend on an earlier
# cell having created it.
os.makedirs('data', exist_ok=True)
# NOTE(review): .toarray() converts the matrix to a dense array before saving.
np.save('data/X_test_vectorized.npy', X_test_tfidf.toarray())
np.save('data/ids_test.npy', ids_test.to_numpy())

# Define a relative path for the output file
relative_output_file = os.path.join(os.curdir, 'content', 'Result_baseline.jsonl')

# The 'content' directory is not created anywhere else; without this,
# open(..., 'w') raises FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(relative_output_file), exist_ok=True)

# Write predictions to the output file in JSONL format: one JSON object per
# line with the example id and its predicted label.
with open(relative_output_file, 'w', encoding='utf-8') as f:
    for id_val, label_val in zip(ids_test, y_test_pred):
        # int(...) converts the predicted label to a plain Python int so that
        # json.dumps can serialize it.
        # NOTE(review): id_val is written as-is; assumes the ids are
        # JSON-serializable (e.g. str or Python int) - TODO confirm.
        result = {"id": id_val, "label": int(label_val)}
        f.write(json.dumps(result) + "\n")

print(f"\nPrediction file '{relative_output_file}' has been generated.")

NameError: name 'test_file' is not defined

In [None]:
# Saving the model
# exist_ok=True makes the directory creation idempotent and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs('models', exist_ok=True)
# Serialize the fitted classifier with joblib. Left as the cell's last
# expression so the notebook displays the list of written file paths.
# NOTE(review): only the classifier is saved here; the fitted TF-IDF
# vectorizer is not persisted by this cell, so reloading the model alone is
# not enough to score raw text - confirm whether that is intended.
joblib.dump(clf, 'models/model_baseline_LogisticRegression.pkl')

['models/model_baseline_LogisticRegression.pkl']