## Data Wrangling

### Imports

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset, load_from_disk
from collections import Counter
import re
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TweetTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate

### Loading Data

In [None]:
# Load "amazon_polarity" dataset (https://huggingface.co/datasets/amazon_polarity)
data = load_dataset('amazon_polarity')

In [None]:
print(data)

In [None]:
train = data['train']

In [None]:
train.features

In [None]:
train[:5]

In [None]:
# Check for missing values
cols = ['label', 'title', 'content']
for col in cols:
    print(any(value is None for value in train[col]))

## Exploratory Data Analysis

In [None]:
# Check for class imbalance
label_counts = Counter(train["label"])
print(label_counts)

In [None]:
# Combine title and content
def combine(data):
    data['text'] = data['title'] + ' ' + data['content']
    return data

train = train.map(combine)

In [None]:
train.features

In [None]:
train[0]

In [None]:
test = data['test']
test = test.map(combine)
print(test.features)

In [None]:
train = train.remove_columns(['title', 'content'])
test = test.remove_columns(['title', 'content'])
print(train.features)
print(test.features)

In [None]:
train.save_to_disk('E:/datasets/train')
test.save_to_disk('E:/datasets/test')

In [None]:
train = load_from_disk('train')
test = load_from_disk('test')

In [None]:
empty = [i for i, text in enumerate(train['text']) if not text.strip()]
print(f'Empty texts: {len(empty)}')

In [None]:
empty = [i for i, text in enumerate(test['text']) if not text.strip()]
print(f'Empty texts: {len(empty)}')

In [None]:
# Most common words
stop_words = set(stopwords.words('english'))

def tokenize(text):
    return [word for word in re.findall(r'\b\w+\b', text.lower()) if word not in stop_words]

sample = train.select(range(5000))['text']
all_words = [word for text in sample for word in tokenize(text)]
common_words = Counter(all_words).most_common(20)

print(common_words)


In [None]:
# Check that labels match text
for i in range(20):
    print(f'Label: {train[i]['label']}, Text: {train[i]['text'][:200]}')

In [None]:
# Check for duplicate reviews
dup_count = sum(count > 1 for count in Counter(train['text']).values())
print(f'Duplicate entries: {dup_count}')

In [None]:
# For completion, check duplicates in test set as well
dup_count = sum(count > 1 for count in Counter(test['text']).values())
print(f'Duplicate entries: {dup_count}')

In [None]:
df = train.to_pandas()

In [None]:
# Check text length distribution in words
df['text_length'] = df['text'].apply(lambda x: len(x.split()))
df['text_length'].hist(bins=50)

In [None]:
import re
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

# Download stopwords if not already present
nltk.download('stopwords')

# Initialize the TweetTokenizer
tweet_tokenizer = TweetTokenizer(preserve_case=False)

# Custom tokenizer using TweetTokenizer
def custom_tokenizer(text):
    tokens = tweet_tokenizer.tokenize(text)
    # Keep only alphabetic tokens or contractions (e.g., "don't")
    tokens = [t for t in tokens if re.match(r"[a-z]+('[a-z]+)?$", t)]
    return tokens

# Base stopwords plus custom extras (keep contraction suffixes like "n't" etc.)
base_stopwords = set(stopwords.words('english'))
extra_stopwords = {'one', 'book'}
custom_stopwords = list(base_stopwords.union(extra_stopwords))

# Sample the DataFrame to balance labels
sampled_df = df.groupby('label', group_keys=False).apply(
    lambda x: x.sample(min(len(x), 2000), random_state=42)
)

# Get unique labels
labels = sampled_df['label'].unique()
num_labels = len(labels)

# Set up subplots
fig, axes = plt.subplots(1, num_labels, figsize=(6 * num_labels, 6))
if num_labels == 1:
    axes = [axes]

# Generate TF-IDF word cloud for each class
for i, label in enumerate(labels):
    class_texts = sampled_df[sampled_df['label'] == label]['text'].dropna().astype(str)

    tfidf = TfidfVectorizer(
        tokenizer=custom_tokenizer,
        stop_words=custom_stopwords,
        token_pattern=None,  # Required when using a custom tokenizer
        max_features=1000
    )
    tfidf_matrix = tfidf.fit_transform(class_texts)

    # Get average TF-IDF scores
    scores = tfidf_matrix.mean(axis=0).A1
    words = tfidf.get_feature_names_out()
    tfidf_scores = dict(zip(words, scores))

    # Generate word cloud
    wc = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(tfidf_scores)

    axes[i].imshow(wc, interpolation='bilinear')
    axes[i].set_title(f'Label: {label}', fontsize=16)
    axes[i].axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Compute unique word counts by class
sampled_df['word_count'] = sampled_df['text'].apply(lambda x: len(tokenize(str(x))))

# Plot histograms by class
labels = sampled_df['label'].unique()
plt.figure(figsize=(10, 6))

for label in labels:
    subset = sampled_df[sampled_df['label'] == label]
    plt.hist(subset['word_count'], bins=30, alpha=0.6, label=f'Label {label}', edgecolor='black')

plt.title('Histogram of Word Counts per Document by Class')
plt.xlabel('Number of Words (Stopwords Removed)')
plt.ylabel('Number of Documents')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Bigram frequency
vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    ngram_range=(2, 2),  # Bigrams
    stop_words='english',
    token_pattern=None  # Important: disables default pattern so custom tokenizer is used
)
X = vectorizer.fit_transform(sampled_df['text'])
sum_words = X.sum(axis=0)
bigrams_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
bigrams_freq = sorted(bigrams_freq, key=lambda x: x[1], reverse=True)

In [None]:
bigrams_freq

In [None]:
# Trigram frequency
vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    ngram_range=(3, 3),
    stop_words='english',
    token_pattern=None
)
X = vectorizer.fit_transform(sampled_df['text'])
sum_words = X.sum(axis=0)
trigrams_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
trigrams_freq = sorted(trigrams_freq, key=lambda x: x[1], reverse=True)

In [None]:
trigrams_freq

In [None]:
# Check corpus-wide vocabulary size
all_tokens = [token for text in sample for token in tokenize(text)]
vocab = set(all_tokens)
print("Vocabulary size:", len(vocab))

In [None]:
# Check for text overlap between training and test sets
train_df = train.to_pandas()
test_df = test.to_pandas()

train_texts = set(train_df['text'].str.strip().str.lower())
test_texts = set(test_df['text'].str.strip().str.lower())

# Intersection
overlap_texts = train_texts.intersection(test_texts)

print(f"Exact overlapping texts: {len(overlap_texts)}")

While exact overlap can be problematic between the training and test sets, 82 is a small fraction given the size of each set.

In [None]:
print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")
print(f"Percent of test overlapped: {100 * len(overlap_texts) / len(test_df):.2f}%")

## Preprocessing

In [None]:
def preprocess(data, rm_stop_words=True):
    text = data['text'].lower()
    text = re.sub(r"[^\w\s']", '', text)
    if rm_stop_words:
        tokens = text.split()
        tokens = [t for t in tokens if t not in stop_words]
        return {'text': ' '.join(tokens)}
    else:
        return {'text': text}

In [None]:
train_tfidf = train.map(preprocess)
test_tfidf = test.map(preprocess)

In [None]:
train_tfidf.save_to_disk('E:/datasets/train_tfidf')
test_tfidf.save_to_disk('E:/datasets/test_tfidf')

In [None]:
train_transformer = train.map(lambda x: preprocess(x, rm_stop_words=False))
test_transformer = test.map(lambda x: preprocess(x, rm_stop_words=False))

In [None]:
train_transformer.save_to_disk('E:/datasets/train_transformer')
test_transformer.save_to_disk('E:/datasets/test_transformer')

## Modeling

### Logistic Regression

In [None]:
# Load locally saved datasets
train_tfidf = load_from_disk('train_tfidf')
test_tfidf = load_from_disk('test_tfidf')

In [None]:
# Extract columns for X and y
X_train = train_tfidf['text']
y_train = train_tfidf['label']
X_test = test_tfidf['text']
y_test = test_tfidf['label']

In [None]:
# Create pipeline
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('clf', LogisticRegression(solver='liblinear'))
])

In [None]:
# Train model and evaluate
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

y_probs = model.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()

### DistilBERT base (uncased)

In [None]:
# Load locally stored datasets
train_tranformer = load_from_disk('train_transformer')
test_transformer = load_from_disk('test_transformer')

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Tokenize text function
def tokenize_function(data):
    return tokenizer(data['text'], truncation=True, padding='max_length')

In [None]:
# Tokenize text using distilBERT tokenizer
train_tokenized = train_transformer.map(tokenize_function, batched=True)
test_tokenized = test_transformer.map(tokenize_function, batched=True)

In [None]:
# Save datasets for later use
train_tokenized.save_to_disk('E:/datasets/train_dbert')
test_tokenized.save_to_disk('E:/datasets/test_dbert')

In [None]:
# Load datasets
train_dbert = load_from_disk('E:/datasets/train_dbert')
test_dbert = load_from_disk('E:/datasets/test_dbert')

In [11]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from transformers.trainer_utils import get_last_checkpoint
import numpy as np
import evaluate
from datasets import load_from_disk

# Load dataset from your E: drive (HDD)
train_dataset = load_from_disk("E:/datasets/train_dbert")
test_dbert = load_from_disk('E:/datasets/test_dbert')

# Subsample to ~344,000 examples to target ~43,000 iterations in 2 epochs
train_dataset = train_dataset.shuffle(seed=42).select(range(344_000))
train_dataset.set_format(type="torch")  # Efficient memory usage

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metrics
accuracy = evaluate.load('accuracy')
f1 = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy.compute(predictions=preds, references=labels)['accuracy'],
        'f1': f1.compute(predictions=preds, references=labels, average='weighted')['f1'],
    }

# Training arguments: no intermediate eval, save checkpoints
training_args = TrainingArguments(
    output_dir='E:/datasets/results',
    eval_strategy="no",              # ❌ No intermediate evaluation
    save_strategy="steps",                 # ✅ Save model periodically
    save_steps=5000,
    save_total_limit=1,                    # ✅ Keep only the latest checkpoint
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='E:/datasets/logs',
    logging_steps=100,
    report_to='none',
    load_best_model_at_end=False,          # ❌ No best model tracked without eval
    dataloader_num_workers=4,
    fp16=True,
    gradient_accumulation_steps=1,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dbert,       # Still used for final evaluation only
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
last_checkpoint = get_last_checkpoint(training_args.output_dir)

if last_checkpoint is not None:
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    trainer.train()

# Final evaluation
metrics = trainer.evaluate()
print(metrics)

Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.4568
200,0.2887
300,0.2559
400,0.2332
500,0.2283
600,0.2427
700,0.2348
800,0.1965
900,0.2211
1000,0.197


{'eval_loss': 0.14977984130382538, 'eval_accuracy': 0.9596925, 'eval_f1': 0.9596924780764868, 'eval_runtime': 4919.8766, 'eval_samples_per_second': 81.303, 'eval_steps_per_second': 2.541, 'epoch': 2.0}


In [15]:
# Save model
trainer.save_model('model_checkpoint/')
tokenizer.save_pretrained('model_checkpoint/')

('model_checkpoint/tokenizer_config.json',
 'model_checkpoint/special_tokens_map.json',
 'model_checkpoint/vocab.txt',
 'model_checkpoint/added_tokens.json',
 'model_checkpoint/tokenizer.json')

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("model_checkpoint/")
tokenizer = AutoTokenizer.from_pretrained("model_checkpoint/")