In [None]:
!pip install transformers
!pip install datasets


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    RobertaModel,
    RobertaPreTrainedModel,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)

# Read the labelled Reddit arguments
df = pd.read_csv('/content/reddit_appeal_analysis_1.csv')

# The three appeal scores are handled identically, so process them in a loop
score_columns = ['logos_score', 'pathos_score', 'ethos_score']

# Force each score column to a numeric dtype; unparseable entries become NaN
for column in score_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Discard every row that has a NaN in any column
df = df.dropna()

# Rescale the scores by a factor of 1/100
for column in score_columns:
    df[column] = df[column] / 100.0

# Show the first rows as a sanity check of the rescaling
print(df.head())

# Load the pretrained RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize text
def tokenize_function(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The tokenizer is asked to pad to ``max_length``, truncate, use a maximum
    length of 512 and return PyTorch tensors.

    Args:
        text: The text to encode.

    Returns:
        Whatever the tokenizer call returns; callers in this notebook index it
        with ``'input_ids'`` and ``'attention_mask'``.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
        'return_tensors': 'pt',
    }
    return tokenizer(text, **encoding_options)

# Encode every argument exactly once and reuse the result for both columns.
# (Calling tokenize_function separately per column tokenised each text twice.)
encodings = df['argument_text'].apply(tokenize_function)

# squeeze() drops size-1 axes so each cell holds a flat token tensor
# (assumes the tokenizer returns a (1, 512) tensor for a single text — TODO confirm)
df['input_ids'] = encodings.apply(lambda encoding: encoding['input_ids'].squeeze())
df['attention_mask'] = encodings.apply(lambda encoding: encoding['attention_mask'].squeeze())

                                       argument_text  logos_score  \
0  I've been bothered by this for years. Thank yo...          0.0   
1  “Those who cannot conceive Friendship as a sub...          0.7   
2  I remember in the late 90s reading an article ...          0.3   
3  I’m going to sound like an old fart here but.....          0.6   
4  Scholars have been debating Achellies and Patr...          0.7   

   pathos_score  ethos_score  \
0          0.50         0.30   
1          0.40         0.80   
2          0.70         0.20   
3          0.25         0.65   
4          0.00         0.85   

                                          logos_expl  \
0  The statement does not present any logical arg...   
1  The argument presents a logical premise by dif...   
2  The speaker makes a logical argument about the...   
3  The argument includes an effort to engage in l...   
4  The argument uses logic by pointing out that d...   

                                         pathos_expl  \

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Define the Dataset class
class ArgumentDataset(Dataset):
    """Dataset pairing tokenised arguments with their three appeal scores.

    Args:
        df: DataFrame providing ``input_ids`` and ``attention_mask`` columns
            (one token sequence per row) and the ``logos_score``,
            ``pathos_score`` and ``ethos_score`` columns used as labels.
    """

    def __init__(self, df):
        self.input_ids = df['input_ids'].tolist()
        self.attention_masks = df['attention_mask'].tolist()
        # Label matrix with columns ordered logos, pathos, ethos
        self.labels = df[['logos_score', 'pathos_score', 'ethos_score']].values

    def __len__(self):
        """Return the number of examples in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` as long tensors
            and ``'labels'`` as a float tensor of the three scores.
        """
        # The stored token sequences may already be tensors; torch.tensor() on
        # a tensor emits a copy-construct UserWarning for every item fetched.
        # torch.as_tensor accepts tensors and plain sequences alike without it.
        return {
            'input_ids': torch.as_tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.as_tensor(self.attention_masks[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Create the dataset and split it 80/20 into train and test subsets
dataset = ArgumentDataset(df)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)

# Seed the split so the same rows land in train/test on every run; without a
# generator the partition (and therefore the reported metrics) changes each time.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Define the Model
class RobertaForArgumentScoring(RobertaPreTrainedModel):
    """RoBERTa encoder with a linear head that regresses three appeal scores.

    The class derives from ``RobertaPreTrainedModel`` and owns a single
    encoder, ``self.roberta``. Deriving from ``RobertaModel`` while also
    creating ``self.roberta`` gave the model a second, unused encoder whose
    ``embeddings.*`` / ``encoder.*`` weights were reported as newly
    initialised on load and were written into every saved checkpoint.

    Args:
        config: Model configuration; ``hidden_dropout_prob`` and
            ``hidden_size`` are read from it.
    """

    def __init__(self, config):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.linear = nn.Linear(config.hidden_size, 3)  # 3 scores: logos, pathos, ethos
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Score a batch of encoded arguments.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds: Forwarded unchanged to the RoBERTa encoder.
            labels: Optional target scores; when given, an MSE loss against
                the predicted scores is computed.

        Returns:
            dict with ``'loss'`` (``None`` when ``labels`` is ``None``) and
            ``'logits'`` (the output of the 3-unit linear head).
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,  # previously accepted but silently dropped
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        # Second element of the encoder output: the pooled sentence-level
        # representation (presumably derived from the first token — see the
        # transformers RobertaModel documentation).
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        scores = self.linear(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()
            loss = loss_fct(scores, labels)

        return {'loss': loss, 'logits': scores}

# Define training arguments
# NOTE(review): warmup_steps=500 may exceed the total number of optimizer
# steps on a small dataset (the recorded loss table stops at step 100), in
# which case the learning rate would never leave warmup — confirm, or consider
# expressing warmup as a ratio of total steps instead.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Initialize the model from the pretrained RoBERTa checkpoint
model = RobertaForArgumentScoring.from_pretrained('roberta-base')

# Define the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset            # evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model. The metrics must be printed explicitly: only the last
# expression of a cell is displayed, so a bare trainer.evaluate() here would
# discard its result.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the trained model and tokenizer
model.save_pretrained('./argument_scoring_model')
tokenizer.save_pretrained('./argument_scoring_model')


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForArgumentScoring were not initialized from the model checkpoint at roberta-base and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'enco

Step,Training Loss
10,0.3285
20,0.3027
30,0.2516
40,0.1906
50,0.1287
60,0.104
70,0.0779
80,0.0682
90,0.0647
100,0.0617


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


('./argument_scoring_model/tokenizer_config.json',
 './argument_scoring_model/special_tokens_map.json',
 './argument_scoring_model/vocab.json',
 './argument_scoring_model/merges.txt',
 './argument_scoring_model/added_tokens.json')

In [None]:
from transformers import BertTokenizer # Load BertForArgumentScoring instead of BertForSequenceClassification

# Reload the fine-tuned tokenizer and the custom RobertaForArgumentScoring
# model from the directory they were saved to
saved_model_dir = '/content/argument_scoring_model'
tokenizer = RobertaTokenizer.from_pretrained(saved_model_dir)
model = RobertaForArgumentScoring.from_pretrained(saved_model_dir)

# Put the model in evaluation mode for inference
model.eval()

def prepare_input(text):
    """Encode ``text`` for inference with the module-level ``tokenizer``.

    Uses the same tokenizer options as training: pad to ``max_length``,
    truncate, maximum length 512, PyTorch tensors.
    """
    tokenizer_kwargs = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
        'return_tensors': 'pt',
    }
    return tokenizer(text, **tokenizer_kwargs)

# Sample argument to score
text = "magine a mother who has to choose between paying for her child’s life-saving medication or keeping the lights on at home. Every day, countless families face this agonizing decision because the cost of healthcare in the U.S. is so high. This isn’t just a statistic—it’s a heartbreaking reality that affects real people. The emotional toll of watching loved ones suffer due to unaffordable care is immense and unacceptable. We need to act now to ensure that every individual, regardless of their financial situation, has access to the healthcare they need. It’s time to put compassion into action and make healthcare affordable for everyone."

# Encode the sample
inputs = prepare_input(text)

# Forward pass without gradient tracking; the encoded tensors are moved to
# whichever device the model lives on first
target_device = model.device
input_ids = inputs['input_ids'].to(target_device)
attention_mask = inputs['attention_mask'].to(target_device)
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Flatten the head output into a plain Python list of scores
scores = outputs['logits'].squeeze().tolist()

# Report the scores in head order: logos, pathos, ethos
for position, appeal in enumerate(("Logos", "Pathos", "Ethos")):
    print(f"{appeal} Score: {scores[position]:.2f}")

Logos Score: 0.56
Pathos Score: 0.93
Ethos Score: 0.34


In [None]:
# Recursively archive the saved model directory into rhetorical_model_2.zip
# (written to the current working directory)
!zip -r rhetorical_model_2.zip /content/argument_scoring_model

  adding: content/argument_scoring_model/ (stored 0%)
  adding: content/argument_scoring_model/vocab.json (deflated 68%)
  adding: content/argument_scoring_model/config.json (deflated 49%)
  adding: content/argument_scoring_model/tokenizer_config.json (deflated 76%)
  adding: content/argument_scoring_model/merges.txt (deflated 53%)
  adding: content/argument_scoring_model/model.safetensors (deflated 10%)
  adding: content/argument_scoring_model/special_tokens_map.json (deflated 84%)


In [None]:
from google.colab import files
# Hand the zip archive to the browser for download (Colab-only helper)
files.download('rhetorical_model_2.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>