In [2]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download required NLTK resources (a no-op when they are already up to date locally):
#   - 'stopwords': corpus read via stopwords.words("english") during preprocessing
#   - 'punkt' / 'punkt_tab': tokenizer data, presumably what word_tokenize loads
#     (which of the two is needed likely depends on the NLTK version — TODO confirm)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /home/ian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ian/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [20]:

# Read the training split and the dev split (the dev split is used as the test
# set in this notebook). Missing "text" values are replaced with empty strings so
# the string-based preprocessing never receives NaN.
train_df = pd.read_csv("../data/train_set.csv").assign(
    text=lambda frame: frame["text"].fillna("")
)
test_df = pd.read_csv("../data/dev_set.csv").assign(
    text=lambda frame: frame["text"].fillna("")
)


# Preprocessing function
def _english_stopwords():
    """Return the English stop words as a frozenset, built on first use only.

    stopwords.words("english") yields a list; evaluating it for every token and
    scanning it linearly dominated preprocessing time. The set is cached on the
    function object so later calls are a constant-time lookup.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(text):
    """Normalise one text string for Bag-of-Words vectorisation.

    Steps: lowercase, replace every run of non-word characters with a single
    space, tokenize with word_tokenize, drop English stop words, and re-join
    the remaining tokens with single spaces.

    Args:
        text: The raw text as a ``str`` (callers replace NaN with "" beforehand).

    Returns:
        The cleaned, space-separated token string (possibly empty).
    """
    stop_words = _english_stopwords()
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation and special characters
    tokens = word_tokenize(text)  # Tokenize
    # Set membership keeps stop-word removal O(1) per token
    return ' '.join(word for word in tokens if word not in stop_words)

# Add a "processed_text" column holding the cleaned version of every text
train_df["processed_text"] = train_df["text"].map(preprocess_text)
test_df["processed_text"] = test_df["text"].map(preprocess_text)



In [22]:
# Shuffle rows with a fixed seed so that a fresh Restart & Run All always yields
# the same row order (and therefore the same row indices in the later analysis).
SEED = 42
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Features are the preprocessed texts; targets are the "label" column
X_train = train_df["processed_text"]
y_train = train_df["label"]

X_test = test_df["processed_text"]
y_test = test_df["label"]

# Show the class balance of the training labels
y_train.value_counts()

label
0    7581
1     794
Name: count, dtype: int64

In [23]:
# Create Bag-of-Words features.
# binary=False keeps the raw per-document term counts; binary=True would instead
# record only the presence/absence of each word.
vectorizer = CountVectorizer(binary=False)  # raw counts, not binary presence
# The vocabulary is learned from the training texts only and reused for the test texts
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

print("Vocabulary Size:", len(vectorizer.get_feature_names_out()))


Vocabulary Size: 26728


In [24]:
# Fit a Logistic Regression classifier (default hyper-parameters) on the BoW features
clf = LogisticRegression().fit(X_train_bow, y_train)

# Predict labels for the test texts
y_pred = clf.predict(X_test_bow)

# Compute the metrics first, then report them: overall accuracy, per-class
# precision/recall/F1, and the confusion matrix
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)
print("\nConfusion Matrix:\n", test_confusion)


Accuracy: 0.9006685768863419

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      1895
           1       0.44      0.18      0.26       199

    accuracy                           0.90      2094
   macro avg       0.68      0.58      0.60      2094
weighted avg       0.87      0.90      0.88      2094


Confusion Matrix:
 [[1850   45]
 [ 163   36]]


In [25]:
# Error analysis: inspect one test paragraph that the classifier got wrong.
FOCUS_PAR_ID = 9423  # paragraph singled out for closer inspection

# Align texts, gold labels and predictions on a fresh 0..n-1 index. test_df was
# re-indexed the same way when it was shuffled, so an index value of df_test also
# addresses the matching row of test_df.
df_test = X_test.reset_index(drop=True).to_frame()
df_test["true_label"] = y_test.reset_index(drop=True)
df_test["predicted_label"] = y_pred

# Keep only the rows whose prediction disagrees with the gold label
misclassified = df_test[df_test["true_label"] != df_test["predicted_label"]]

if not misclassified.empty:
    # Prefer the paragraph of interest when it is among the errors. It is matched
    # by par_id rather than by text, so duplicated texts cannot be confused and a
    # missing id no longer raises. Otherwise fall back to a random error.
    is_focus = test_df.loc[misclassified.index, "par_id"] == FOCUS_PAR_ID
    example = misclassified[is_focus].head(1) if is_focus.any() else misclassified.sample(1)
    print("\nMisclassified Example:")
    print("Text:", example["processed_text"].values[0])
    print("Original Text:", test_df.loc[example.index[0], "text"])
    print("True Label:", example["true_label"].values[0])
    print("Predicted Label:", example["predicted_label"].values[0])



Misclassified Example:
Text: christmas celebration birth merely child child changed destiny humans forever celebration fact god wanted part human race took flesh blood became human like us also show unconditional love good deeds helping need help care human merciful
Original Text: Christmas is celebration of the birth of not merely a child , but a child who changed the destiny of humans forever . It is celebration of the fact that God wanted to be a part of the human race and so he took on flesh and blood and became human like us . We can also show unconditional love through our good deeds and helping those who are in need of our help and care . Be human and merciful .
True Label: 0
Predicted Label: 1


In [19]:
# Compare how often selected words from the misclassified example occur in each
# class of the training data. One loop replaces five copy-pasted stanzas; the
# printed lines and the "<word>_count" columns added to train_df are unchanged.
# NOTE: rows of X_train_bow must still line up with train_df, i.e. train_df must
# not have been reshuffled since the vectorizer cell was run.
WORDS_OF_INTEREST = ["helping", "help", "love", "fact", "christmas"]

for word in WORDS_OF_INTEREST:
    # Column of the BoW matrix that holds this word's per-document counts
    word_index = vectorizer.vocabulary_[word]
    word_counts = X_train_bow[:, word_index].toarray().flatten()

    count_col = f"{word}_count"
    train_df[count_col] = word_counts

    # Mean count per document, separately for each label
    print(f"\nAverage '{word}' count for class 0:", train_df[train_df["label"] == 0][count_col].mean())
    print(f"Average '{word}' count for class 1:", train_df[train_df["label"] == 1][count_col].mean())



Average 'helping' count for class 0: 0.006463527239150508
Average 'helping' count for class 1: 0.03526448362720403

Average 'help' count for class 0: 0.04141933781822978
Average 'help' count for class 1: 0.13350125944584382

Average 'love' count for class 0: 0.009233610341643583
Average 'love' count for class 1: 0.022670025188916875

Average 'fact' count for class 0: 0.013190871916633689
Average 'fact' count for class 1: 0.011335012594458438

Average 'christmas' count for class 0: 0.001319087191663369
Average 'christmas' count for class 1: 0.031486146095717885


Misclassified Example:
Text: christmas celebration birth merely child child changed destiny humans forever celebration fact god wanted part human race took flesh blood became human like us also show unconditional love good deeds helping need help care human merciful
Original Text: Christmas is celebration of the birth of not merely a child , but a child who changed the destiny of humans forever . It is celebration of the fact that God wanted to be a part of the human race and so he took on flesh and blood and became human like us . We can also show unconditional love through our good deeds and helping those who are in need of our help and care . Be human and merciful .
True Label: 0
Predicted Label: 1

This example has the original label (0, 0), meaning that two annotators marked it as non-PCL.