In [10]:
# pip install datasets

In [80]:
import re
import string
import random
import pandas as pd
from datasets import load_dataset

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score, balanced_accuracy_score

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [81]:
def remove_stop_words(text):
  """Drop English stop words from a space-separated string.

  Args:
    text: String whose tokens are separated by single spaces.

  Returns:
    The input with every token found in NLTK's English stop-word list
    removed; the remaining tokens are re-joined with single spaces.
  """
  # A set gives O(1) membership checks; the NLTK list would be scanned
  # linearly for every token.
  stop_words = set(stopwords.words("english"))
  # Split on a literal space (not arbitrary whitespace) so empty tokens and
  # therefore the original spacing are preserved exactly as before.
  return " ".join(word for word in text.split(" ") if word not in stop_words)

def stem_text(text):
  """Stem each space-separated token of ``text`` with the English Snowball stemmer.

  Tokens are obtained by splitting on a literal space and are re-joined
  with single spaces after stemming.
  """
  snowball = nltk.SnowballStemmer("english")
  stemmed_tokens = map(snowball.stem, text.split(" "))
  return " ".join(stemmed_tokens)


def clean_text(text):
  """Normalise raw text into a lower-cased, stop-word-free, stemmed string.

  Steps, in order: lower-case; delete ``[...]`` spans; blank out URLs,
  ``<...>`` tags and punctuation; blank out any token containing a digit;
  collapse whitespace; remove stop words; stem.

  Args:
    text: Raw input string.

  Returns:
    The cleaned string with tokens separated by single spaces and no
    leading or trailing whitespace.
  """
  text = text.lower()
  text = re.sub(r"\[.*?\]", "", text)
  text = re.sub(r"https?://\S+|www\.\S+", " ", text)
  text = re.sub(r"<.*?>+", " ", text)
  text = re.sub(r"[%s]" % re.escape(string.punctuation), " ", text)
  text = re.sub(r"\w*\d\w*", " ", text)
  # Collapse *every* whitespace run (newlines, tabs, carriage returns, ...)
  # to one space and trim the ends. The helpers below split on a literal
  # space, so any other separator would glue tokens together and let stop
  # words through unfiltered.
  text = re.sub(r"\s+", " ", text).strip()
  text = remove_stop_words(text)
  text = stem_text(text)
  return text

def sample_data(dataset, num_samples_per_class, labels, seed=None):
    """Draw a class-balanced random subset of ``dataset``.

    Args:
        dataset: Iterable of mapping-like examples, each exposing an integer
            class id under the ``"label"`` key.
        num_samples_per_class: Number of examples to draw, without
            replacement, for every class.
        labels: Sequence of class names. Only its length is used: the class
            ids are taken to be ``0 .. len(labels) - 1``.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives sampling and shuffling, so the result is reproducible.
            When ``None`` (the default) the global ``random`` state is used.

    Returns:
        A shuffled list holding ``num_samples_per_class`` examples per class.

    Raises:
        ValueError: If a class has fewer than ``num_samples_per_class``
            examples (or the requested count is negative).
    """
    rng = random if seed is None else random.Random(seed)
    num_classes = len(labels)

    # Bucket the examples in a single pass instead of re-scanning the whole
    # dataset once per class.
    by_class = {class_id: [] for class_id in range(num_classes)}
    for example in dataset:
        bucket = by_class.get(example["label"])
        if bucket is not None:
            bucket.append(example)

    sampled_data = []
    for class_id in range(num_classes):
        class_data = by_class[class_id]
        if num_samples_per_class > len(class_data):
            raise ValueError(
                "class %r has only %d examples, cannot sample %d"
                % (class_id, len(class_data), num_samples_per_class)
            )
        sampled_data.extend(rng.sample(class_data, num_samples_per_class))

    # Shuffle once, after all classes have been collected.
    rng.shuffle(sampled_data)
    return sampled_data

def format_instruction(example):
    """Turn a raw example into an instruction/text/output record.

    ``example`` must provide ``"text"`` and an integer ``"label"``; the label
    is translated to its class name through the module-level ``labels`` list.
    The instruction prompt is the same fixed string for every example.
    """
    prompt = """
    Instruction: Determine the category of the given text (provided below).
    Choose exactly one category from the following options: World, Sports, Business, or Sci/Tech.
    Your output should be a single word representing the selected category."""
    return dict(
        instruction=prompt,
        text=example["text"],
        output=labels[example["label"]],
    )

## Data Loading

In [82]:
# Class names, positioned so that an example's integer "label" indexes its
# name (format_instruction looks names up as labels[example["label"]]).
labels = ["World", "Sports", "Business", "Sci/Tech"]

# Load the "ag_news" dataset by name.
dataset = load_dataset("ag_news")

# Show the available splits with their features and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [83]:
# sample_data draws from the global `random` state; seed it so the sampled
# subsets, and every metric computed from them, are reproducible on re-run.
SEED = 42
random.seed(SEED)

# Balanced subsets: 400 training and 100 test examples per class.
train_samples = sample_data(dataset["train"], 400, labels)
test_samples = sample_data(dataset["test"], 100, labels)

# Convert each sampled example into an instruction/text/output record.
formatted_train = [format_instruction(sample) for sample in train_samples]
formatted_test = [format_instruction(sample) for sample in test_samples]

# save_to_file(formatted_train, "small_train_data.txt")
# save_to_file(formatted_test, "small_test_data.txt")

In [84]:
# --- training split: raw text, class name, cleaned text ---
train_texts = [record["text"] for record in formatted_train]
train_outputs = [record["output"] for record in formatted_train]
df_train = pd.DataFrame({"text": train_texts, "label": train_outputs})
df_train["text"] = df_train["text"].astype(str)
df_train["cleaned_text"] = df_train["text"].apply(clean_text)

# --- test split: same columns, built the same way ---
test_texts = [record["text"] for record in formatted_test]
test_outputs = [record["output"] for record in formatted_test]
df_test = pd.DataFrame({"text": test_texts, "label": test_outputs})
df_test["text"] = df_test["text"].astype(str)
df_test["cleaned_text"] = df_test["text"].apply(clean_text)

In [87]:
len(vectorizer.vocabulary_)

In [105]:
vectorizer = TfidfVectorizer(max_features=3500)
x_train_tfidf = vectorizer.fit_transform(df_train['cleaned_text'])
x_test_tfidf = vectorizer.transform(df_test['cleaned_text'])

model = svm.SVC(kernel='linear', probability=True)
model.fit(x_train_tfidf, df_train['label'])

In [108]:
# Predict class names for the held-out test features.
test_predictions = model.predict(x_test_tfidf)
# Plain accuracy over all test examples.
overall_accuracy = accuracy_score(df_test['label'], test_predictions)
# NOTE: this is sklearn's balanced_accuracy_score, reported below under the
# label "Weighted Accuracy". The test set is sampled with the same number of
# examples per class, and the recorded run shows it matching the plain
# accuracy up to float rounding.
weighted_accuracy = balanced_accuracy_score(df_test['label'], test_predictions)
print("Total Accuracy:", overall_accuracy)
print("Weighted Accuracy", weighted_accuracy)

Total Accuracy: 0.8475
Weighted Accuracy 0.8474999999999999
