# Title and Summary extraction via web scraping from sites


In [1]:
import feedparser
import pandas as pd


In [2]:
def parse_rss_feeds(feed_links):
    feed_data = []
    for link in feed_links:
        feed = feedparser.parse(link)
        for entry in feed.entries:
            feed_data.append({
                'feed_link': link,
                'title': entry.get('title', ''),
                'published': entry.get('published', ''),
                'summary': entry.get('summary', ''),
                'link': entry.get('link', '')
            })
    return feed_data

In [3]:
rss_feed_links = [
    'https://www.justice.gov/news/rss?m=1',
    'https://www.sec.gov/news/pressreleases.rss',
    'https://www.cftc.gov/RSS/RSSENF/rssenf.xml',
    'https://www.ftc.gov/feeds/press-release.xml',
    'http://feeds.finra.org/FINRANotices',
    'https://www.cftc.gov/RSS/RSSGP/rssgp.xml',
    'https://www.justice.gov/news/rss?type=press_release&groupname=441&field_component=1981&search_api_language=en&require_all=1',
    'https://www.justice.gov/news/rss?type=press_release&groupname=236&field_component=1751&search_api_language=en&require_all=1',
    'https://www.justice.gov/news/rss?type=press_release&groupname=431&field_component=1971&search_api_language=en&require_all=1',
    'https://www.justice.gov/news/rss?type=press_release&groupname=291&field_component=1821&search_api_language=en&require_all=1',
    'https://www.justice.gov/news/rss?type=press_release&groupname=201&field_component=1721&search_api_language=en&require_all=1',
    'https://www.law360.com/whitecollar/rss',
    'https://www.whitecollarbriefly.com/feed/',
    'https://wp.nyu.edu/compliance_enforcement/feed/',
    'https://www.justice.gov/news/rss?end_date=05/08/2024&search_api_fulltext=&sort_by=field_date&start_date=05/01/2024&type=press_release&groupname=291&field_component=1821&search_api_language=en&require_all=0'
    # '',
    # Add more feed links as needed
]

# Parse RSS feeds
feed_data = parse_rss_feeds(rss_feed_links)

# Convert data to DataFrame
df = pd.DataFrame(feed_data)

# Display DataFrame
print(df.head(5))

                              feed_link  \
0  https://www.justice.gov/news/rss?m=1   
1  https://www.justice.gov/news/rss?m=1   
2  https://www.justice.gov/news/rss?m=1   
3  https://www.justice.gov/news/rss?m=1   
4  https://www.justice.gov/news/rss?m=1   

                                               title  \
0  Attorney General Merrick B. Garland Statement ...   
1  Former CIA Officer Pleads Guilty to Conspiracy...   
2  OVW Fiscal Year 2024 Healing and Response Team...   
3  California Man Arrested for Making Violent Thr...   
4  Readout of Acting Associate Attorney General B...   

                         published  \
0  Sat, 25 May 2024 12:00:00 +0000   
1  Fri, 24 May 2024 12:00:00 +0000   
2  Fri, 24 May 2024 12:00:00 +0000   
3  Fri, 24 May 2024 12:00:00 +0000   
4  Fri, 24 May 2024 12:00:00 +0000   

                                             summary  \
0  The Justice Department issued the following st...   
1  Alexander Yuk Ching Ma, 71, of Honolulu, a for...   
2      

In [4]:
excel_file = 'rss_feeds.xlsx'
df.to_excel(excel_file, index=False)

# DOJ Article scraping and conversion to dataframes


In [5]:
# Run command pip install openpyxl
import feedparser
import pandas as pd
import numpy as np

from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup, NavigableString

In [6]:
def parse_rss_feeds(feed_links, scrape_article=False):
    
    feed_data = []
    
    for link in feed_links:
        
        # Get hostname of provided rss link
        urlparser = urlparse(link)
        link_hostname = urlparser.hostname
        # Get source name of link
        link_source = links_dict[link_hostname]

        # Parse the rss feed link content
        feed = feedparser.parse(link)
        
        # Iterate feed entries
        for entry in feed.entries:
            
            # Format datetime to yyyy-mm-dd format
            date_time = pd.to_datetime(entry.get('published', ''))
            date_time = date_time.date()
            date_time = pd.Timestamp(date_time.strftime('%Y-%m-%d'))
            
            # Article source url
            url = entry.get('link', '')

            # Initialize article content variable
            article_content = np.nan

            # Check scraping parameter boolean
            if scrape_article:
                # Check if rss feed source is DOJ (Department of Justice)
                if link_source == 'DOJ':
                    # Function call to parse article conetent
                    article_content = get_rss_article(url, 'DOJ')

            # Append feed data objects
            feed_data.append({
                'feed_link': link,
                'feed_source': link_source,
                'title': entry.get('title', ''),
                'date_published': date_time,
                'summary': entry.get('summary', ''),
                'link': entry.get('link', ''),
                'article_content': article_content
            })

    # Return feed data objects
    return feed_data


In [7]:
def get_rss_article(url, rss_source):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Check RSS feed source
    if rss_source == 'DOJ':
        # Scrape the article content via bs4
        content_body = soup.find(class_='field_body')
        paragraphs = content_body.find_all('p')

        # Variable for an observed ending text string
        text_strip = 'More information can be found at'
        
        # Initialize content string
        article_content = ""
        
        # Iterate content paragraph tags (i.e <p></p>)
        for p in paragraphs:
            # Check for any anchor tags (i.e. <a>some link</a>)
            for tag in p.findAll('a'):
                tag.replaceWithChildren()

            # Set variable with paragraph text
            p_text = p.text

            # Check for ending text string
            if text_strip in p_text:
                # Get the the index
                idx = p_text.index(text_strip)
                # Truncate the string on the index
                p_text = p_text[:idx]

            # Strip whitespace and append to content string 
            p_text = p_text.strip()
            article_content += p_text + " "
            
    # Return article content string
    return article_content

In [8]:
links_dict = {
    'www.justice.gov': 'DOJ',
    'www.sec.gov': 'SEC',
    'www.cftc.gov': 'CFTC',
    'www.ftc.gov': 'FTC',
    'feeds.finra.org': 'FINRA',
    'www.law360.com': 'Law360',
    'www.whitecollarbriefly.com': 'White Collar Briefly',
    'wp.nyu.edu': 'NYU'
}

In [9]:
%%time

# Parse RSS feeds
feed_data = parse_rss_feeds(rss_feed_links, scrape_article=True)

# Convert data to DataFrame
df = pd.DataFrame(feed_data)

Wall time: 4min 32s


In [10]:
# Display DataFrame
df.head(5)

Unnamed: 0,feed_link,feed_source,title,date_published,summary,link,article_content
0,https://www.justice.gov/news/rss?m=1,DOJ,Attorney General Merrick B. Garland Statement ...,2024-05-25,The Justice Department issued the following st...,https://www.justice.gov/opa/pr/attorney-genera...,The Justice Department issued the following st...
1,https://www.justice.gov/news/rss?m=1,DOJ,Former CIA Officer Pleads Guilty to Conspiracy...,2024-05-24,"Alexander Yuk Ching Ma, 71, of Honolulu, a for...",https://www.justice.gov/opa/pr/former-cia-offi...,"Alexander Yuk Ching Ma, 71, of Honolulu, a for..."
2,https://www.justice.gov/news/rss?m=1,DOJ,OVW Fiscal Year 2024 Healing and Response Team...,2024-05-24,,https://www.justice.gov/ovw/video/ovw-fiscal-y...,OVW conducted a live web-based pre-application...
3,https://www.justice.gov/news/rss?m=1,DOJ,California Man Arrested for Making Violent Thr...,2024-05-24,"A Huntington Beach, California, man was arrest...",https://www.justice.gov/opa/pr/california-man-...,"A Huntington Beach, California, man was arrest..."
4,https://www.justice.gov/news/rss?m=1,DOJ,Readout of Acting Associate Attorney General B...,2024-05-24,"On Tuesday, May 21, and Wednesday, May 22, Act...",https://www.justice.gov/opa/pr/readout-acting-...,"WASHINGTON – On Tuesday, May 21, and Wednesday..."


In [11]:
excel_file = 'rss_feeds.xlsx'
df.to_excel(excel_file, index=False)

# Advanced Text Summarization with T5


This Jupyter Notebook demonstrates advanced text summarization using the T5 model, which is known for its effectiveness in generating accurate and concise summaries. Key phrase extraction is also performed using the `spaCy` library to enrich our text analysis.


In [21]:
import spacy
from transformers import T5Tokenizer, T5ForConditionalGeneration

print("Imports successful!")
# Load spaCy for key phrase extraction
nlp = spacy.load("en_core_web_sm")

# Initialize the tokenizer with explicit parameters
tokenizer = T5Tokenizer.from_pretrained('t5-large', model_max_length=512, legacy=False)

# Initialize the model
model = T5ForConditionalGeneration.from_pretrained('t5-large')



TypeError: Plain typing.NoReturn is not valid as type argument

NameError: name 'spacy' is not defined

In [16]:
def summarize_and_extract_key_phrases(text):
    """Summarize the text and extract key phrases."""
    # Generate summary
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary

def parse_rss_feeds(feed_links):
    all_summaries = []
    for link in feed_links:
        feed = feedparser.parse(link)
        for entry in feed.entries:
            if 'summary' in entry:
                summary = summarize_and_extract_key_phrases(entry['summary'])
                all_summaries.append(summary)
    return all_summaries



In [17]:
# Example feed links to parse
import feedparser
import pandas as pd
import spacy
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
rss_feed_links = [
    'https://www.justice.gov/news/rss?m=1',
    'https://www.sec.gov/news/pressreleases.rss'
    # Add more links as needed
]

# Parse RSS feeds and print results
feed_data = parse_rss_feeds(rss_feed_links)
df = pd.DataFrame(feed_data)
print(df.head())  # Display first few rows of the DataFrame



                                                   0
0  a federal jury in Charlotte, north carolina, c...
1  a national of the Dominican republic was sente...
2  seven members and associates of the sinaloa ca...
3  a former Mississippi bureau of investigations ...
4  seven defendants were sentenced yesterday and ...


# TF-IDF Analysis of Summarized Texts

This notebook applies TF-IDF analysis to the summaries generated from a set of texts. The frequency of keywords in each summary is produced helping to highlight the most significant words in each summarized document.




In [18]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import numpy as np

# You may need to download NLTK's stopword list
nltk.download('stopwords')
from nltk.corpus import stopwords

# Assuming the summarize_text and extract_key_phrases functions are defined as per previous discussions


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sreya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import feedparser

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# RSS feed links
rss_feed_links = [
    'https://www.justice.gov/news/rss?m=1',
    'https://www.sec.gov/news/pressreleases.rss'
]

# Function to parse RSS feeds and extract summaries
def parse_rss_feeds(links):
    summaries = []
    for link in links:
        feed = feedparser.parse(link)
        for entry in feed.entries:
            if 'summary' in entry:
                summaries.append(entry.summary)
            elif 'description' in entry:
                summaries.append(entry.description)
    return summaries

# Parse RSS feeds and obtain summaries
all_summaries = parse_rss_feeds(rss_feed_links)

# Transform summaries to TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(all_summaries)

# Function to display top keywords for each summary
def display_top_keywords(tfidf_matrix, feature_names, top_n=10):
    for idx, summary in enumerate(all_summaries):
        print(f"Summary {idx+1}:")
        feature_index = tfidf_matrix[idx, :].nonzero()[1]
        tfidf_scores = zip(feature_index, [tfidf_matrix[idx, x] for x in feature_index])
        sorted_items = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
        for item in sorted_items[:top_n]:
            print(feature_names[item[0]], ' - ', round(item[1], 2))
        print("\n")

# Get feature names and display keywords
feature_names = np.array(vectorizer.get_feature_names_out())
display_top_keywords(tfidf_matrix, feature_names)


Summary 1:
bils  -  0.31
russell  -  0.31
diego  -  0.31
san  -  0.31
violence  -  0.16
crime  -  0.16
relation  -  0.16
firearm  -  0.16
discharging  -  0.16
officers  -  0.16


Summary 2:
scams  -  0.25
proceeds  -  0.25
launder  -  0.25
roles  -  0.25
leading  -  0.25
played  -  0.25
alleging  -  0.25
nationals  -  0.25
chinese  -  0.25
central  -  0.25


Summary 3:
mortgage  -  0.28
clients  -  0.28
documents  -  0.28
creating  -  0.28
defraud  -  0.28
builder  -  0.28
obtain  -  0.25
conspiring  -  0.25
home  -  0.25
help  -  0.23


Summary 4:
man  -  0.52
murder  -  0.26
related  -  0.26
hawaii  -  0.26
appearance  -  0.26
initial  -  0.26
trafficking  -  0.24
connection  -  0.24
face  -  0.24
drug  -  0.22


Summary 5:
korea  -  0.45
seizures  -  0.25
dprk  -  0.23
democratic  -  0.23
generation  -  0.23
revenue  -  0.23
illicit  -  0.23
disrupt  -  0.23
actions  -  0.23
authorized  -  0.23


Summary 6:
procedures  -  0.32
medicare  -  0.32
claims  -  0.29
suitability  -  0.16
p

# Text Classification with RoBERTa

This notebook demonstrates how to use the RoBERTa model for classifying text into predefined categories such as animals, travel, technology, fashion, culture, weather, and food. We aim for high accuracy in classification by leveraging the powerful capabilities of the RoBERTa model.


In [1]:
# Try importing the libraries
try:
    from datasets import load_dataset
    from huggingface_hub import HfApi  # Not directly used here, but good to test
    print("Success: Libraries are properly installed.")
except ImportError as e:
    print("Failed to import libraries:", str(e))


Success: Libraries are properly installed.


In [27]:
import pandas as pd
from transformers import RobertaTokenizer
from datasets import Dataset

# Load the CSV file
file_path = 'C:\\Users\\sreya\\OneDrive\\Desktop\\FADS\\Regulatortrendscode\\Corpoarate Compliance Terms.csv'
data = pd.read_csv(file_path, header=None)

# Rename the column
data.columns = ['labels']

# Display the renamed DataFrame
print(data.head())


                   labels
0        Accounting fraud
1    Antitrust violations
2  Asset misappropriation
3         Asset stripping
4         Bait and switch


In [9]:
# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(data)

# Load a pre-trained tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Function to tokenize the summaries
def tokenize_function(examples):
    return tokenizer(examples['summary'], padding="max_length", truncation=True)

# Apply the tokenization to your dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Display the tokenized datasets
print(tokenized_datasets)




Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Dataset({
    features: ['summary', 'input_ids', 'attention_mask'],
    num_rows: 67
})


In [20]:
import transformers
import torch
import accelerate

print("Transformers version:", transformers.__version__)
print("Torch version:", torch.__version__)
print("Accelerate version:", accelerate.__version__)


Transformers version: 4.35.2
Torch version: 2.1.0+cpu
Accelerate version: 0.30.1


# test

In [2]:
import feedparser
import pandas as pd

# RSS feed links
rss_feed_links = [
    'https://www.justice.gov/news/rss?m=1',
    'https://www.sec.gov/news/pressreleases.rss'
]

# Function to parse RSS feeds and extract summaries
def parse_rss_feeds(links):
    summaries = []
    links_list = []
    for link in links:
        feed = feedparser.parse(link)
        for entry in feed.entries:
            if 'summary' in entry:
                summaries.append(entry.summary)
                links_list.append(entry.link)
            elif 'description' in entry:
                summaries.append(entry.description)
                links_list.append(entry.link)
    return summaries, links_list

# Parse RSS feeds and obtain summaries
all_summaries, all_links = parse_rss_feeds(rss_feed_links)


In [3]:
import re

# Load the compliance terms CSV file
compliance_file_path = 'C:\\Users\\sreya\\OneDrive\\Desktop\\FADS\\Regulatortrendscode\\Corpoarate Compliance Terms.csv'
compliance_terms = pd.read_csv(compliance_file_path, header=None)[0].tolist()

# Function to check if a summary is compliant
def is_compliant(summary, terms):
    for term in terms:
        if re.search(r'\b' + re.escape(term) + r'\b', summary, re.IGNORECASE):
            return 1
    return 0

# Generate labels for all summaries
labels = [is_compliant(summary, compliance_terms) for summary in all_summaries]


In [6]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load the modified CSV file without column names
file_path = 'C:\\Users\\sreya\\OneDrive\\Desktop\\FADS\\Regulatortrendscode\\summaries_with_labels.csv'
data = pd.read_csv(file_path)

# Ensure the 'summary' column is of type string and 'label' column is of type int
data['summary'] = data['summary'].astype(str)
data['label'] = data['label'].astype(int)

# Split data into training and evaluation sets
train_data = data.sample(frac=0.8, random_state=42)
eval_data = data.drop(train_data.index)

# Convert the DataFrame to a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)
datasets = DatasetDict({"train": train_dataset, "eval": eval_dataset})


In [7]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer

# Load a pre-trained tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Function to tokenize the summaries
def tokenize_function(examples):
    return tokenizer(examples['summary'], padding="max_length", truncation=True)

# Apply the tokenization to your dataset
tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Load a pre-trained RoBERTa model for sequence classification with 2 labels
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size per device during evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluate at the end of each epoch
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()






Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should 

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.7868456840515137, 'eval_runtime': 32.2853, 'eval_samples_per_second': 0.31, 'eval_steps_per_second': 0.062, 'epoch': 1.0}
{'loss': 0.7738, 'learning_rate': 1.0000000000000002e-06, 'epoch': 2.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.7214387655258179, 'eval_runtime': 36.2457, 'eval_samples_per_second': 0.276, 'eval_steps_per_second': 0.055, 'epoch': 2.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.6529731750488281, 'eval_runtime': 29.5856, 'eval_samples_per_second': 0.338, 'eval_steps_per_second': 0.068, 'epoch': 3.0}
{'train_runtime': 2113.6851, 'train_samples_per_second': 0.057, 'train_steps_per_second': 0.007, 'train_loss': 0.7565571626027425, 'epoch': 3.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.6529731750488281,
 'eval_runtime': 29.8236,
 'eval_samples_per_second': 0.335,
 'eval_steps_per_second': 0.067,
 'epoch': 3.0}

# test over


In [4]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

# Load the compliance terms CSV file without column names
file_path = 'C:\\Users\\sreya\\OneDrive\\Desktop\\FADS\\Regulatortrendscode\\Corpoarate Compliance Terms.csv'
data = pd.read_csv(file_path, header=None)

# Rename the single column to 'label'
data.columns = ['label']

# Load the summaries CSV file
summaries_file_path = 'C:\\Users\\sreya\\OneDrive\\Desktop\\FADS\\Regulatortrendscode\\summaries.csv'
summaries_df = pd.read_csv(summaries_file_path)

# Ensure the number of summaries matches the number of labels in your CSV
if len(summaries_df) < len(data):
    print("Not enough summaries for all labels. Please provide more summaries or fewer labels.")
    exit()

# Add the summaries to the DataFrame
data['summary'] = summaries_df['summary'][:len(data)]

# Reorder columns to have 'summary' first and 'label' second
data = data[['summary', 'label']]

# Save the modified CSV file
modified_file_path = 'C:\\Users\\sreya\\OneDrive\\Desktop\\FADS\\Regulatortrendscode\\modified_compliance_terms.csv'
data.to_csv(modified_file_path, index=False, header=False)

# Load the modified CSV file without column names
data = pd.read_csv(modified_file_path, header=None)

# Rename the columns
data.columns = ['summary', 'label']

# Split data into training and evaluation sets
train_data = data.sample(frac=0.8, random_state=42)
eval_data = data.drop(train_data.index)

# Convert the DataFrame to a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)
datasets = DatasetDict({"train": train_dataset, "eval": eval_dataset})

# Load a pre-trained tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Function to tokenize the summaries
def tokenize_function(examples):
    return tokenizer(examples['summary'], padding="max_length", truncation=True)

# Apply the tokenization to your dataset
tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Load a pre-trained RoBERTa model for sequence classification with 2 labels
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size per device during evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluate at the end of each epoch
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


Not enough summaries for all labels. Please provide more summaries or fewer labels.




Map:   0%|          | 0/54 [00:00<?, ? examples/s]

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

: 

In [None]:
# Save the model
model.save_pretrained('./my_roberta_model')

# To evaluate the model, you can add an evaluation dataset in the Trainer and use `trainer.evaluate()`
