# Bag of Words Model

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from dont_patronize_me import DontPatronizeMe

### load in raw data
dpm = DontPatronizeMe('./Data', '.')
dpm.load_task1()
# dpm.load_task2(return_one_hot=True)
data=dpm.train_task1_df

trids = pd.read_csv('Data/train_semeval_parids-labels.csv')
teids = pd.read_csv('Data/dev_semeval_parids-labels.csv')
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

# make train dataset
rows = [] 
for idx in range(len(trids)):  
  parid = trids.par_id[idx]
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })
trdf1 = pd.DataFrame(rows)

# make dev dataset
rows = []
for idx in range(len(teids)):  
  parid = teids.par_id[idx]
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })
tedf1 = pd.DataFrame(rows)

## Run BoW on raw dataset

In [3]:
# Step 1: Prepare dataset
train_df = trdf1.copy()
train_df = train_df.sample(frac=1).reset_index(drop=True)
dev_df = tedf1

# Step 2: Train-test split
X_train = train_df["text"]
y_train = train_df['label']
X_test = dev_df["text"]
y_test = dev_df["label"]

# Step 3: Extract features using BoW
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Step 4: Train the classifier
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# Step 5: Evaluate the classifier
y_pred = clf.predict(X_test_counts)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Append predictions to validation dataset
tedf1["prediction"] = y_pred

# extract example where prediction was PCL but it was actually no-PCL
new_df = tedf1[(tedf1["label"] == 0) & (tedf1["prediction"] == 1)]
print(new_df.iloc[0])
print(new_df["text"].iloc[0])

mispredict_example = new_df["text"].iloc[0]

Accuracy: 0.9054441260744985
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1895
           1       0.54      0.04      0.07       199

    accuracy                           0.91      2094
   macro avg       0.72      0.52      0.51      2094
weighted avg       0.87      0.91      0.87      2094

par_id                                                     8542
community                                              homeless
text          A homeless couple is seen by the roadside alon...
label                                                         0
prediction                                                    1
Name: 345, dtype: object
A homeless couple is seen by the roadside along Jalan Tuanku Abdul Rahman . -- Picture by Choo Choy May


In [None]:
# extract log probabilities 
log_probabilities = clf.feature_log_prob_

feature_names = vectorizer.get_feature_names_out()
class_0_probabilities = log_probabilities[0]
class_1_probabilities = log_probabilities[1]

# build a dataframe containing all words and their probabilities
probabilities_df = pd.DataFrame({
    'Feature': feature_names,
    'Class 0 Log Prob': class_0_probabilities,
    'Class 1 Log Prob': class_1_probabilities
}).round(3)

# "tokenise" mispredicted example
mispredict_example_list = mispredict_example.split(" ")
print(mispredict_example_list)

# extract the log probabilities of all the words mispredicted example
probs_of_example_df = probabilities_df[probabilities_df["Feature"].isin(mispredict_example_list)]
print(probs_of_example_df)

# calculate the total log probability of the sentence (for both class 0 and 1)
class_0_prob = probs_of_example_df["Class 0 Log Prob"].sum()
class_1_prob = probs_of_example_df["Class 1 Log Prob"].sum()

# print
print("class 0 prob",class_0_prob)
print("class 1 prob", class_1_prob)

['A', 'homeless', 'couple', 'is', 'seen', 'by', 'the', 'roadside', 'along', 'Jalan', 'Tuanku', 'Abdul', 'Rahman', '.', '--', 'Picture', 'by', 'Choo', 'Choy', 'May']
        Feature  Class 0 Log Prob  Class 1 Log Prob
1546      along            -8.358            -9.270
3986         by            -5.354            -5.926
5878     couple            -9.141            -9.963
11332  homeless            -6.232            -6.058
12578        is            -4.678            -5.021
20719  roadside           -11.366           -11.062
21582      seen            -8.462            -8.664
24164       the            -2.915            -3.393
class 0 prob -56.506
class 1 prob -59.357


# Upsample Dataset

## Upsampled train-train vs train val

In [None]:
# Calculate the number of rows for the 20% testing split
test_size = int(len(data) * 0.2)

# Step 1: Shuffle and split raw data
shuffled_data = data.sample(frac=1).reset_index(drop=True)
val_df = shuffled_data.iloc[:test_size]  # First 20%
train_df = shuffled_data.iloc[test_size:]  # Last 80%

# Separate positive and negative samples in TRAIN data only (don't touch TEST data)
positive_samples = train_df[train_df['label'] == 1]
negative_samples = train_df[train_df['label'] == 0]

# Calculate resampling ratio
ratio = len(negative_samples) / len(positive_samples)
print("ratio", ratio)

# Upsample positive samples
upsampled_positive = positive_samples.sample(frac=ratio, replace=True, random_state=42)

# Concatenate upsampled positive samples with original negative samples
upsampled_df = pd.concat([negative_samples, upsampled_positive])

# Shuffle the DataFrame
upsampled_df = upsampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check dataset is balanced
num_positive = len(upsampled_df[upsampled_df['label'] == 1])
num_negative = len(upsampled_df[upsampled_df['label'] == 0])
assert num_positive == num_negative, "Training data is not balanced"
print("Training data is balanced")
print("Training data length",len(upsampled_df))

# Step 2: Train-test split
X_train = upsampled_df["text"]
y_train = upsampled_df['label']
X_val = val_df["text"]
y_val = val_df["label"]

# Step 3: Extract features using BoW
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_val_counts = vectorizer.transform(X_val) # only transform, dont fit

# Step 4: Train the classifier
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# Step 5: Evaluate the classifier
y_pred = clf.predict(X_val_counts)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Append predictions to validation dataset
val_df["prediction"] = y_pred
print(val_df)

ratio 9.54911838790932
Training data is balanced
Training data length 15164
Accuracy: 0.8375537505972288
              precision    recall  f1-score   support

           0       0.95      0.87      0.91      1894
           1       0.31      0.57      0.40       199

    accuracy                           0.84      2093
   macro avg       0.63      0.72      0.65      2093
weighted avg       0.89      0.84      0.86      2093

     par_id      art_id        keyword country  \
0      9946   @@9395009     vulnerable      ke   
1      2421  @@15077401       disabled      za   
2     10138   @@3695746      immigrant      gb   
3      6976  @@25505334        refugee      lk   
4      9410   @@2424134     vulnerable      ie   
...     ...         ...            ...     ...   
2088   5482   @@4538978       disabled      pk   
2089   1119  @@16163174     vulnerable      pk   
2090   7540  @@24149675  poor-families      ke   
2091   4137   @@4553170          women      jm   
2092   3830  @@216

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df["prediction"] = y_pred


## Upsampled Train vs Dev

In [None]:
# Step 1: Shuffle and split raw data
train_df = trdf1.copy()
train_df = train_df.sample(frac=1).reset_index(drop=True)
dev_df = tedf1

# Separate positive and negative samples in TRAIN data only (don't touch TEST data)
positive_samples = train_df[train_df['label'] == 1]
negative_samples = train_df[train_df['label'] == 0]

# Calculate resampling ratio
ratio = len(negative_samples) / len(positive_samples)
print("ratio", ratio)

# Upsample positive samples
upsampled_positive = positive_samples.sample(frac=ratio, replace=True, random_state=42)

# Concatenate upsampled positive samples with original negative samples
upsampled_df = pd.concat([negative_samples, upsampled_positive])

# Shuffle the DataFrame
upsampled_df = upsampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check dataset is balanced
num_positive = len(upsampled_df[upsampled_df['label'] == 1])
num_negative = len(upsampled_df[upsampled_df['label'] == 0])
assert num_positive == num_negative, "Training data is not balanced"
print("Training data is balanced")
print("Training data length",len(upsampled_df))

# Step 2: Train-test split
X_train = upsampled_df["text"]
y_train = upsampled_df['label']
X_test = dev_df["text"]
y_test = dev_df["label"]

# Step 3: Extract features using BoW
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test) # only transform, dont fit

# Step 4: Train the classifier
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# Step 5: Evaluate the classifier
y_pred = clf.predict(X_test_counts)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Append predictions to validation dataset
tedf1["prediction"] = y_pred
print(tedf1)

ratio 9.547858942065492
Training data is balanced
Training data length 15162


ValueError: Found input variables with inconsistent numbers of samples: [2093, 2094]

## Downsampling Dataset

## Downsample train-train vs train-val

In [None]:
# Calculate the number of rows for the 20% testing split
test_size = int(len(data) * 0.2)

# Step 1: Shuffle and split raw data
shuffled_data = data.sample(frac=1).reset_index(drop=True)
test_df = shuffled_data.iloc[:test_size]  # First 20%
train_df = shuffled_data.iloc[test_size:]  # Last 80%

# Separate positive and negative samples in TRAIN data only (don't touch TEST data)
positive_samples = train_df[train_df['label'] == 1]
negative_samples = train_df[train_df['label'] == 0]

# Determine the number of positive samples
num_positive_samples = len(positive_samples)

# Downsample negative samples to match the number of positive samples
downsampled_negative_samples = negative_samples.iloc[0:num_positive_samples]  # Adjust random_state if needed

# Combine positive and downsampled negative samples
balanced_train_df = pd.concat([positive_samples, downsampled_negative_samples])

# Shuffle the DataFrame
balanced_train_df = balanced_train_df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle for randomness, adjust random_state if needed

# Check dataset is balanced
num_positive = len(balanced_train_df[balanced_train_df['label'] == 1])
num_negative = len(balanced_train_df[balanced_train_df['label'] == 0])
assert num_positive == num_negative, "Training data is not balanced"
print("Training data is balanced")
print("Training data length",len(upsampled_df))

# Step 2: Train-test split
X_train = balanced_train_df["text"]
y_train = balanced_train_df['label']
X_val = test_df["text"]
y_val = test_df["label"]

# Step 3: Extract features using BoW
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_val_counts = vectorizer.transform(X_val) # only transform, dont fit

# Step 4: Train the classifier
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# Step 5: Evaluate the classifier
y_pred = clf.predict(X_val_counts)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Training data is balanced
Training data length 15162
Accuracy: 0.5962732919254659
              precision    recall  f1-score   support

           0       0.98      0.57      0.72      1907
           1       0.16      0.86      0.27       186

    accuracy                           0.60      2093
   macro avg       0.57      0.72      0.50      2093
weighted avg       0.90      0.60      0.68      2093



## Downsample train vs dev

In [None]:
train_df = trdf1.copy()
train_df = train_df.sample(frac=1).reset_index(drop=True)
dev_df = tedf1

# Separate positive and negative samples in TRAIN data only (don't touch TEST data)
positive_samples = train_df[train_df['label'] == 1]
negative_samples = train_df[train_df['label'] == 0]

# Determine the number of positive samples
num_positive_samples = len(positive_samples)

# Downsample negative samples to match the number of positive samples
downsampled_negative_samples = negative_samples.iloc[0:num_positive_samples]  # Adjust random_state if needed

# Combine positive and downsampled negative samples
balanced_train_df = pd.concat([positive_samples, downsampled_negative_samples])

# Shuffle the DataFrame
balanced_train_df = balanced_train_df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle for randomness, adjust random_state if needed

# Check dataset is balanced
num_positive = len(balanced_train_df[balanced_train_df['label'] == 1])
num_negative = len(balanced_train_df[balanced_train_df['label'] == 0])
assert num_positive == num_negative, "Training data is not balanced"
print("Training data is balanced")
print("Training data length",len(upsampled_df))

# Step 2: Train-test split
X_train = balanced_train_df["text"]
y_train = balanced_train_df['label']
X_test = dev_df["text"]
y_test = dev_df["label"]

# Step 3: Extract features using BoW
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test) # only transform, dont fit

# Step 4: Train the classifier
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# Step 5: Evaluate the classifier
y_pred = clf.predict(X_test_counts)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training data is balanced
Training data length 15162
Accuracy: 0.5830945558739254
              precision    recall  f1-score   support

           0       0.98      0.55      0.70      1895
           1       0.17      0.90      0.29       199

    accuracy                           0.58      2094
   macro avg       0.58      0.72      0.50      2094
weighted avg       0.90      0.58      0.67      2094

