In [1]:
# This Python 3 environment was run through kaggle notebooks
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# The image is functionally a jupyter notebook, although items were saved to subfolders on the kaggle cloud storage system

# In order to run this code on another system, please make sure that Python 3 is installed correctly, and change any path names to match
# your computer system

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
from collections import defaultdict

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
subfolder_path = 'kaggle/output/outputdata'

if not os.path.exists(subfolder_path):
    os.makedirs(subfolder_path)

/kaggle/input/amar-protest/protes.csv
/kaggle/input/short-articles/training_data_for_short_article_testing.csv
/kaggle/input/short-articles/100_short_articles_testing.csv
/kaggle/input/indices-amar/test_indices_fold_0.csv
/kaggle/input/indices-amar/train_indices_fold_1.csv
/kaggle/input/indices-amar/downsampled_index.csv
/kaggle/input/indices-amar/train_indices_fold_0.csv
/kaggle/input/indices-amar/test_indices_fold_1.csv
/kaggle/input/indices-amar/train_indices_fold_4.csv
/kaggle/input/indices-amar/test_indices_fold_2.csv
/kaggle/input/indices-amar/train_indices_fold_2.csv
/kaggle/input/indices-amar/indices.txt
/kaggle/input/indices-amar/data_downsampled.csv
/kaggle/input/indices-amar/train_indices_fold_3.csv
/kaggle/input/indices-amar/test_indices_fold_4.csv
/kaggle/input/indices-amar/test_indices_fold_3.csv


In [2]:
#!pip uninstall -y transformers
!pip install --upgrade accelerate
!pip install transformers==4.28.0

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.26.1
[0mCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.29.2
    Uninstalling transformers-4.29.2:
      Successfully uninstalled transformers-4.29.2
Successfully installed transformers-4.28.0
[0m

In [3]:
# For machine learning tools and evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

# For deep learning
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
import torch

# For plotting and data visualization
#import matplotlib

#import matplotlib.pyplot as plt
#import seaborn as sns
#from matplotlib import ticker

#sns.set(style='ticks', font_scale=1.2)

# transformers
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

model_name = 'bert-base-cased'

device_name = 'cuda'

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [5]:
# This code loads the news articles, removes short articles, cleans the data frame to fit the transormers format and saves the data to csv
df = pd.read_csv("/kaggle/input/amar-protest/protes.csv")
df['Protest']
df['labels'] = df['Protest'].apply(lambda x: 1 if x == 'Protest' else 0)
df = df.rename(columns={'body': 'text'})
df = df[['labels', 'text']]
df = df[df['text'].notna() & (df['text'] != '')]
df = df[df['text'].str.len() >= 50]
# Separate the majority and minority classes
df_majority = df[df['labels'] == 0]
df_minority = df[df['labels'] == 1]

# Down-sample the majority class
df_majority_downsampled = df_majority.sample(n=len(df_minority), random_state=0)
downsampled_index = df_majority_downsampled.index
# Combine the down-sampled majority class with the minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Save files
df_downsampled.to_csv('/output/outputdata/data_downsampled.csv', index=False)
index_series = pd.Series(downsampled_index)
index_series.to_csv('/output/outputdata/downsampled_index.csv', index=False, header=['index'])

In [7]:
seed = 41
n = 5
skf = StratifiedKFold(n_splits=n)

skf.get_n_splits(df_downsampled['text'], df_downsampled['labels'])

# Save the indices to a CSV file
for i, (train_index, test_index) in enumerate(skf.split(df_downsampled['text'], df_downsampled['labels'])):
    np.savetxt(f'/output/outputdata/train_indices_fold_{i}.csv', train_index, delimiter=',')
    np.savetxt(f'/output/outputdata/test_indices_fold_{i}.csv', test_index, delimiter=',')
tokenizer = AutoTokenizer.from_pretrained(model_name)  # The model_name needs to match our pre-trained model.

X = df['text'].values
Y = df['labels'].values

# Convert the labels to a numpy array
Y = np.array(Y)

# Initialize results storage
results = []

In [4]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
    }

In [7]:
results = []
confusion_matrices = []  # Store confusion matrices for each fold
classification_reports = [] #store classification reports
n_splits = 5  # Assuming you have 5 folds

# This fold runs the BERT model across each fold
for i in range(n_splits):
    # Define the file paths for train and test indices
    train_indices_file = f'/kaggle/input/indices-amar/train_indices_fold_{i}.csv'
    test_indices_file = f'/kaggle/input/indices-amar/test_indices_fold_{i}.csv'
    # Load train and test indices from CSV files
    train_index = np.loadtxt(train_indices_file, delimiter=',', dtype=int)
    test_index = np.loadtxt(test_indices_file, delimiter=',', dtype=int)

    # Subset your DataFrame to create train and test sets
    X_train, X_test = df_downsampled['text'].iloc[train_index], df_downsampled['text'].iloc[test_index]
    Y_train, Y_test = df_downsampled['labels'].iloc[train_index], df_downsampled['labels'].iloc[test_index]

    # Convert train and test sets to lists for tokenization
    X_train = X_train.tolist()
    X_test = X_test.tolist()

    # Tokenize the text data
    train_encodings = tokenizer(X_train, truncation=True, padding=True)
    test_encodings = tokenizer(X_test, truncation=True, padding=True)

    # Create torch datasets using the custom dataset class MyDataset
    train_dataset = MyDataset(train_encodings, Y_train.to_numpy())
    test_dataset = MyDataset(test_encodings, Y_test.to_numpy())


    # Load pre-trained BERT model and send it to the GPU
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device_name)

    # Define training arguments
    training_args = TrainingArguments(
        num_train_epochs=3,  # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=20,  # batch size for evaluation
        learning_rate=5e-5,  # initial learning rate for Adam optimizer
        warmup_steps=100,  # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
        weight_decay=0.01,  # strength of weight decay
        output_dir=f'./results/fold_{i}',  # Change output directory for each fold
        logging_dir=f'./logs/fold_{i}',  # Change logging directory for each fold
        logging_steps=100,  # number of steps to output logging (set lower because of small dataset size)
        evaluation_strategy='steps',  # evaluate during fine-tuning so that we can see progress
    )

    # Initialize the Trainer
    trainer = Trainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics  # our custom evaluation function
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    fold_results = trainer.evaluate()
    
    # Save the results for this fold
    results.append(fold_results)

    # Clean up CUDA memory
    torch.cuda.empty_cache()
    
    # Predict on the test set
    test_predictions = trainer.predict(test_dataset)
    preds = np.argmax(test_predictions.predictions, axis=-1)

    # Save predictions to a CSV file
    preds_df = pd.DataFrame(preds, columns=['predictions'])
    preds_df.to_csv(f'predictions_fold_{i+1}.csv', index=False)
    
    # Compute and store the confusion matrix
    cm = confusion_matrix(Y_test, preds)
    confusion_matrices.append(cm)
    
    # Compute and store the classification report
    report = classification_report(Y_test, preds, output_dict=True)
    classification_reports.append(report)

    print(f'Fold {i} Classification Report:')
    print(f'Precision: {report["weighted avg"]["precision"]}')
    print(f'Recall: {report["weighted avg"]["recall"]}')
    print(f'F1-score: {report["weighted avg"]["f1-score"]}')

    # Evaluate the model and append results
    fold_results = trainer.evaluate()
    results.append(fold_results)

    # Clean up CUDA memory
    torch.cuda.empty_cache()

# After all folds, you can process and display the stored confusion matrices
for fold, cm in enumerate(confusion_matrices):
    print(f'Fold {fold} Confusion Matrix:')
    print(cm)

    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  train_index = np.loadtxt(train_indices_file, delimiter=',', dtype=int)
    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  test_index = np.loadtxt(test_indices_file, delimiter=',', dtype=int)
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.trans

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Accuracy
100,0.6179,0.359679,0.846154


Fold 0 Classification Report:
Precision: 0.8561868686868688
Recall: 0.8406593406593407
F1-score: 0.8389036413026891


    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  train_index = np.loadtxt(train_indices_file, delimiter=',', dtype=int)
    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  test_index = np.loadtxt(test_indices_file, delimiter=',', dtype=int)
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.trans

Step,Training Loss,Validation Loss,Accuracy
100,0.6243,0.490899,0.730769


Fold 1 Classification Report:
Precision: 0.8716853085333668
Recall: 0.8571428571428571
F1-score: 0.8557317073170732


    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  train_index = np.loadtxt(train_indices_file, delimiter=',', dtype=int)
    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  test_index = np.loadtxt(test_indices_file, delimiter=',', dtype=int)
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.trans

Step,Training Loss,Validation Loss,Accuracy
100,0.5867,0.475064,0.774725


Fold 2 Classification Report:
Precision: 0.7817337461300309
Recall: 0.7637362637362637
F1-score: 0.7599018254333488


    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  train_index = np.loadtxt(train_indices_file, delimiter=',', dtype=int)
    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  test_index = np.loadtxt(test_indices_file, delimiter=',', dtype=int)
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.trans

Step,Training Loss,Validation Loss,Accuracy
100,0.6088,0.583337,0.734807


Fold 3 Classification Report:
Precision: 0.7674289768438849
Recall: 0.7403314917127072
F1-score: 0.7332409546231636


    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  train_index = np.loadtxt(train_indices_file, delimiter=',', dtype=int)
    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  test_index = np.loadtxt(test_indices_file, delimiter=',', dtype=int)
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.trans

Step,Training Loss,Validation Loss,Accuracy
100,0.5552,1.051015,0.513812


Fold 4 Classification Report:
Precision: 0.7015139556576021
Recall: 0.6574585635359116
F1-score: 0.6384799163804689


Fold 0 Confusion Matrix:
[[67 24]
 [ 5 86]]
Fold 1 Confusion Matrix:
[[69 22]
 [ 4 87]]
Fold 2 Confusion Matrix:
[[81 10]
 [33 58]]
Fold 3 Confusion Matrix:
[[82  9]
 [38 52]]
Fold 4 Confusion Matrix:
[[80 10]
 [52 39]]


In [None]:
# save results
!zip -r file.zip outputs

In [13]:
# This code trains a model just on the clean text used when prompt engineering GPT-4

df_test = pd.read_csv("/kaggle/input/short-articles/100_short_articles_testing.csv")
df_train = pd.read_csv("/kaggle/input/short-articles/training_data_for_short_article_testing.csv")
df_train['text'] = df_train['body']
df_test['text'] = df_test['body']

'bert-base-cased'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)  # The model_name needs to match our pre-trained model.

# Subset your DataFrame to create train and test sets
X_train, X_test = df_train['text'], df_test['text']
Y_train, Y_test = df_train['labels'], df_test['labels']

# Convert train and test sets to lists for tokenization
X_train = X_train.tolist()
X_test = X_test.tolist()

# Tokenize the text data
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

# Create torch datasets using the custom dataset class MyDataset
train_dataset = MyDataset(train_encodings, Y_train.to_numpy())
test_dataset = MyDataset(test_encodings, Y_test.to_numpy())

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device_name)

# Define training arguments
training_args = TrainingArguments(
    num_train_epochs=3,  # total number of training epoch
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,  # batch size for evaluatio
    learning_rate=5e-5,  # initial learning rate for Adam optimizer
    warmup_steps=100,  # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,  # strength of weight decay
    output_dir=f'./results/short_articles',  # Change output directory for each fold
    logging_dir=f'./logs/short_articles',  # Change logging directory for each fold
    logging_steps=100,  # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',  # evaluate during fine-tuning so that we can see progress
)

# Initialize the Trainer
trainer = Trainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics  # our custom evaluation function
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
    
# Clean up CUDA memory
torch.cuda.empty_cache()

In [20]:
# Predict on the test set
test_predictions = trainer.predict(test_dataset)
preds = np.argmax(test_predictions.predictions, axis=-1)

# Save predictions to a CSV file
preds_df = pd.DataFrame(preds, columns=['predictions'])
preds_df.to_csv(f'predictions_short_articles.csv', index=False)
# Compute and store the confusion matrix
cm = confusion_matrix(Y_test, preds)
    
# Compute and store the classification report
report = classification_report(Y_test, preds, output_dict=True)

print(f'Short article Classification Report:')
print(f'Precision: {report["weighted avg"]["precision"]}')
print(f'Recall: {report["weighted avg"]["recall"]}')
print(f'F1-score: {report["weighted avg"]["f1-score"]}')

cm

Short article Classification Report:
Precision: 0.920673076923077
Recall: 0.92
F1-score: 0.919967987194878


array([[47,  3],
       [ 5, 45]])