In [4]:
# Transformers
from transformers import BartTokenizer, BartForConditionalGeneration            # BERT Tokenizer and architecture
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments               # fine-tune model
from transformers import pipeline                                               # pipeline
from transformers import DataCollatorForSeq2Seq                                 # DataCollator to batch the data
import torch                                                                    # PyTorch
import evaluate                                                                 # Hugging Face's library for model evaluation
from datasets import Dataset

import numpy as np
import pandas as pd
import nltk
# from textblob import TextBlob                                                   # fix spelling mistakes in texts
from sklearn.feature_extraction.text import TfidfVectorizer                     # identify the most common terms in the corpus
import re                                                                       # clean text data
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
from datasets import load_dataset

# Load the multi_lexsum dataset
multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20220616",trust_remote_code=True)

# Access the first instance of the validation set
example = multi_lexsum["validation"][0]

# Print the sources of the first example
print(example["sources"])

# Print the summaries of different lengths
for sum_len in ["long", "short", "tiny"]:
    print(f"{sum_len} summary: {example['summary/' + sum_len]}")


train.json:   0%|          | 0.00/15.7M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


dev.json:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/4.27M [00:00<?, ?B/s]

sources.json:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

['Page 1\nLEXSEE 2003 U.S. DIST. CT. PLEADINGS 3030 View U.S. District Court Opinion View Original Source Image of This Document\nSUSAN STOCKING, for herself and all other similarly situated, Plaintiff, v. AT&T CORP., Defendant.\nCase No. 03-0421-CV-W-HFS UNITED STATES DISTRICT COURT FOR THE WESTERN DISTRICT OF\nMISSOURI, WESTERN DIVISION 2003 U.S. Dist. Ct. Pleadings 3030; 2004 U.S. Dist. Ct. Pleadings LEXIS 9181\nJanuary 23, 2004 Complaint\nVIEW OTHER AVAILABLE CONTENT RELATED TO THIS DOCUMENT: U.S. District Court: Motion(s) COUNSEL: [**1] Sylvester "Sly" James, Jr. MO # 33617, Michael J. Mohlman KS # 19084, The Sly James Firm Trial Lawyers, P.C., 802 Broadway, 7th Floor, Kansas City, MO 64105, 816-472-6800, 816-472-6805 facsimile and Rex A. Sharp KS # 12350, Gunderson, Sharp & Rhein, P.C., 4121 W. 83rd St., Ste. 256, Prairie Village, KS 66208, 913-901-0500, 913-901-0419 facsimile and Rick D. Holtsclaw MO # 32866, Holtsclaw & Kendall, LC, 312 West 8th Street, Kansas City, MO 64105, 8

In [6]:
import torch                                                                    # PyTorch

# Checking if GPU is available
if torch.cuda.is_available():
    print("GPU is available - Using GPU")
    device = torch.device('cuda')
else:
    print("GPU is not available - Using CPU")
    device = torch.device('cpu')

GPU is available - Using GPU


In [7]:
# allow to fully read the dialogues and its summary
pd.set_option('display.max_colwidth', 1000)

In [8]:
# Access the validation set
validation_set = multi_lexsum["validation"]

# Display the column names
print(validation_set.column_names)

['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny']


In [9]:
def display_feature_list(features, feature_type):

    '''
    This function displays the features within each list for each type of data
    '''

    print(f"\n{feature_type} Features: ")
    print(', '.join(features) if features else 'None')

def describe_df(df):

    global categorical_features
    categorical_features = [col for col in df.columns if df[col].dtype == 'object']

    print(f"\n{type(df).__name__} shape: {df.shape}")
    print(f"\n{df.shape[0]:,.0f} samples")
    print(f'\nMissing Data: \n{df.isnull().sum()}')
    print(f'\nDuplicates: {df.duplicated().sum()}')

    display_feature_list(categorical_features, 'Categorical')

    # print(f'\n{type(df).__name__} Head: \n')
    # display(df.head(5))
    # print(f'\n{type(df).__name__} Tail: \n')
    # display(df.tail(5))

In [10]:
print(multi_lexsum)


DatasetDict({
    train: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 3177
    })
    validation: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 454
    })
    test: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 908
    })
})


In [11]:
# Access the training, validation, and test sets
train_dataset = multi_lexsum["train"]
val_dataset = multi_lexsum["validation"]
test_dataset = multi_lexsum["test"]


In [12]:
# Convert the Hugging Face Dataset to a Pandas DataFrame
train_df = train_dataset.to_pandas()
val_df = val_dataset.to_pandas()
test_df = test_dataset.to_pandas()


In [13]:
# Step 1: Replace None values with np.nan
train_df.replace(to_replace=[None], value=np.nan, inplace=True)
val_df.replace(to_replace=[None], value=np.nan, inplace=True)
test_df.replace(to_replace=[None], value=np.nan, inplace=True)


In [14]:
# Remove rows that have None or NaN values in 'summary/short' and 'summary/tiny' columns
train_df = train_df.dropna(subset=['summary/short', 'summary/tiny','summary/long'])
val_df = val_df.dropna(subset=['summary/short', 'summary/tiny','summary/long'])
test_df = test_df.dropna(subset=['summary/short', 'summary/tiny','summary/long'])

In [15]:
train_df.head()

Unnamed: 0,id,sources,summary/long,summary/short,summary/tiny
7,PB-WV-0002,"[Case 3:13-cv-24068 Document 8 Filed 10/01/13 Page 1 of 30 PageID #: 16\n\nUNITED STATES DISTRICT COURT FOR THE SOUTHERN DISTRICT OF WEST VIRGINIA Huntington Division\n\nCASIE JO MCGEE and SARAH ELIZABETH ADKINS; JUSTIN MURDOCK and WILLIAM GLAVARIS; and NANCY ELIZABETH MICHAEL and JANE LOUISE FENTON, individually and as next friends of A.S.M., a minor child;\n\nPlaint[f{s,\nv.\nKAREN S. COLE, in her official capacity as CABELL COUNTY CLERK; and VERA J. MCCORMICK, in her official capacity as KANAWHA COlJNTY CLERK;\n\nCivil\n\nAction\n\nNo.\n\n3:13-cv-24068\n----\n\nDefendants.\n\nCOMPLAINT FOR DECLARATORY AND INJUNCTIVE RELIEF\n\nPlaintiffs CASTE JO MCGEE and SARAH ELIZABETH ADKINS; JUSTIN\n\nMURDOCK and WILLIAM GLAVARIS; and, NANCY ELIZABETH MICHAEL and JANE\n\nLOUISE FENTON, individually and as next friends of A.S.M., a minor child, all by and through\n\ntheir attorneys, file this Complaint against Defendants KAREN S. COLE, in her official capacity\n\nas CABELL COUNTY CLERK, and V...","On October 1, 2013, three same-sex couples and the minor child of one of the couples filed a lawsuit against Kanawha County, West Virginia in the U.S. District Court for the Southern District of West Virginia. The plaintiffs, represented by Lambda Legal Defense & Education Fund, asked the court to declare unconstitutional any West Virginia laws banning same-sex marriage, to enjoin West Virginia from refusing to recognize same-sex marriages undertaken in other states, and to award the plaintiffs reasonable attorney's fees and costs of suit.\n\nThe plaintiffs claimed that their rights under the Due Process and Equal Protection clauses of the United States Constitution were violated. They further asserted that the ban on same-sex marriage discriminated on the basis of sexual orientation, sex, and parental status. The plaintiffs also alleged that they were being denied a multitude of other social and legal rights that marriage provides.\n\nOn December 2, 2013, the Court (Judge Robert C...","On October 1, 2013, three same-sex couples and the minor child of one of the couples filed a lawsuit against Kanawha County, West Virginia. The plaintiffs asked the court to declare unconstitutional any West Virginia laws banning same-sex marriage, enjoin West Virginia from refusing to recognize same-sex marriages undertaken in other states, and to award the plaintiffs reasonable attorney's fees and costs of suit. The case was decided in favor of the plaintiffs, who were granted summary judgment in November 2014 and awarded attorneys’ fees, costs, and expenses.","Three same-sex couples, and the minor child of one of the couples won this same-sex marriage lawsuit in WV"
8,PN-MI-0008,"[Case 1:13-cv-00469-PLM Doc #1 Filed 05/01/13 Page 1 of 23 Page ID#1\n\nIN THE UNITED STATES DISTRICT COURT FOR THE WESTERN DISTRICT OF MICHIGAN\n\nGILBERT WEBER and TYRONE HIGHTOWER,\nPlaintiffs,\nvs.\nCITY OF GRAND RAPIDS; KEVIN BELK, Chief of Police of the Grand Rapids Police Department, in his official capacity; Officer JOHN GUERRERO, in his individual capacity; Officer THOMAS MCCARTHY, in his individual capacity; Officer GREGORY REKUCKI, in his individual capacity; and Officer ANTHONY LEONARD, in his individual capacity,\nDefendants.\n\nHon. Case No.\nJURY TRIAL DEMANDED\n\nCOMPLAINT\n\nCase 1:13-cv-00469-PLM Doc #1 Filed 05/01/13 Page 2 of 23 Page ID#2\nINTRODUCTORY STATEMENT 1. This civil rights case challenges the Grand Rapids Police Department’s ongoing practice of charging individuals with criminal trespassing at gas stations, bars and other commercial businesses open to the public, even though the individuals have done nothing wrong, and even though no one has asked the...","On May 1, 2013, two men who were arrested for trespassing on property open to the public filed this lawsuit in the U.S. District Court for the Western District of Michigan. The plaintiffs sued the City of Grand Rapids, its chief of police, and two individual officers under 42 U.S.C. § 1983. The plaintiffs, represented by the National ACLU and ACLU of Michigan, asked the court for a declaratory judgment, damages, and injunctive relief concerning the use of ""No Trespass Letters."" The plaintiffs claimed that the City of Grand Rapids, its chief of police, and police officers violated their Fourth Amendment rights. \n \nAccording to the amended complaint, the Grand Rapids Police Department (""GRPD"") had arrested individuals for trespassing based on a City trespass ordinance, under which the City solicits No Trespass Letters from area businesses indicating their intent to prosecute trespassers. The plaintiffs alleged they were arrested for trespassing while sitting in their vehicles in a...","Two men who were arrested for trespassing on property of businesses open to the public filed a lawsuit in the U.S. District Court for the Western District of Michigan against the city of Grand Rapids, its chief of police, and two individual officers. The plaintiffs claimed that the Grand Rapids Police Department's policy and practice of arresting individuals for trespass -- without probable cause and based on general Letters of Intent to Prosecute signed by Grand Rapids businesses -- results in unreasonable searches and seizures in violation of the Fourth Amendment. The parties came to a private settlement agreement for damages and attorney's fees in late 2019. The Judge dismissed the case in early 2020.","Settlement reached in 2019 for @ACLU case on arrests for trespassing without a warning under Grand Rapids, MI's ""No Trespass Letters"" policy (W.D. Mich.)"
12,PC-MI-0036,"[2:15-cv-11222-SFC-DRG Doc # 1 Filed 03/31/15 Pg 1 of 51 Pg ID 1\n\nIN THE UNITED STATES DISTRICT COURT FOR THE EASTERN DISTRICT OF MICHIGAN\n\nMARY ANN MCBRIDE; BRIAN STANLEY WITTMAN; and RALPH WILLIAMS, on behalf of themselves and all others similarly situated,\nPIa intiffs ,\nv.\nMICHIGAN DEPARTMENT OF CORRECTIONS; DANIEL H. HEYNS, in his official capacity as Director of the Michigan Department of Corrections; THOMAS FINCO, in his official capacity as Deputy Director of the Correctional Facilities Administration; RANDALL TREACHER, in his official capacity as Chief Deputy Director of the Michigan Department of Corrections; ANTHONY STEWART, in his official capacity as Warden of the Women's Huron Valley Correctional Facility; JEFFREY WOODS, in his official capacity as Warden of the Chippewa Correctional Facility; and CATHLEEN STODDARD, in her official capacity as Warden of the Carson City Correctional Facility,\nDefendants.\n\nCase No. --------\n\nHon.\n\n_\n\nCLASS ACTION COMPLAIN...","On March 31, 2015, three prisoner-plaintiffs filed this putative class action lawsuit against the Michigan Department of Corrections (MDOC), alleging that MDOC was discriminating against them and other deaf and hard of hearing prisoners. The case was filed in the U.S. District Court for the Eastern District of Michigan, and was assigned to Judge Sean Cox. The plaintiffs were represented by Michigan Protection and Advocacy, the Washington Lawyers’ Committee and private counsel. \n\nThe plaintiffs alleged that MDOC consistently failed to provide them with effective communication opportunities, depriving them of full participation in prison programs, services, and activities, including visitation, religious activities, and disciplinary and parole proceedings. They also alleged that since they couldn’t hear guards’ orders, they were sometimes unable to obey prison regulations and were then unfairly disciplined. The plaintiffs alleged that this treatment violated the Americans with Disa...","This class-action lawsuit in the U.S. District Court for the Eastern District of Michigan was filed March 31, 2015 by three prisoner-plaintiffs alleging that the Michigan Department of Corrections was discriminating against them and other deaf and hard of hearing prisoners. MDOC’s summary judgment motion was denied by the court in 2016. The court certified the plaintiffs’ class on July 20, 2017. On March 9, 2018, the court ordered MDOC to provide necessary aids to inmates who are deaf or hard of hearing. The court approved a settlement agreement on March 29, 2019. Two class members later filed motions to enforce the settlement agreement, both of which were denied; as of July 20, 2020, one of the class members has an appeal pending before the Sixth Circuit.","In March 2019, a federal court approved a settlement agreement which required the Michigan Department of Corrections to make substantial changes to accommodate deaf and hard of hearing inmates’ communication needs. (E.D. Mich.)"
21,NS-UT-0001,"[Case 2:15-cv-00584-RJS-DBP Document 1 Filed 08/18/15 Page 1 of 38\n\nROSS C. ANDERSON (#0109) OF COUNSEL, WINDER & COUNSEL, P.C. 460 South 400 East Salt Lake City, UT 84111 Telephone: (801) 322-2222 Facsimile: (801) 322-2282 randerson@winderfirm.com\nAttorneys for Plaintiffs\n\nIN THE UNITED STATES DISTRICT COURT DISTRICT OF UTAH, CENTRAL DIVISION\n\nMARY JOSEPHINE (JOSIE) VALDEZ; HOWARD STEPHENSON; DEEDA SEED; DANIEL DARGER; WILLIAM GRANT BAGLEY, and THOMAS NELSON HUCKIN, on behalf of themselves and all others similarly situated,\n\nCOMPLAINT FOR CONSTITUTIONAL, COMMON LAW, AND STATUTORY VIOLATIONS, SEEKING DAMAGES, DECLARATORY, AND INJUNCTIVE RELIEF\n\nPlaintiffs,\n\nPROPOSED CLASS ACTION\n\nv.\n\nJURY DEMANDED\n\nNATIONAL SECURITY AGENCY; FEDERAL BUREAU OF INVESTIGATION; GEORGE W. BUSH, in his personal capacity; MICHAEL V. HAYDEN, in his personal capacity; RICHARD B. CHENEY, in his personal capacity; DAVID ADDINGTON; DOES #150, inclusive,\n\nCase No. _2_:_1_5_-_cv_-_0_0_5_8_4_-...","On August 8, 2015, six U.S. citizens who had their email, text message, and telephone call metadata collected by the government during the 2002 Winter Olympic Games in Salt Lake City, Utah, filed suit against the National Security Agency (""NSA"") and the Federal Bureau of Investigation (""FBI""). The plaintiffs challenged the legality of the domestic surveillance program and sought declaratory and injunctive relief, as well as statutory, actual, and punitive damages. The plaintiffs brought suit in the U.S. District Court for the district of Utah under the Administrative Procedure Act (""APA""), the Foreign Intelligence Surveillance Act (""FISA""), the Wiretap Act, the Stored Communications, and the Privacy Act. They alleged violations of these acts and of the 4th Amendment of the U.S. Constitution and of Article I, § 14 of the Utah Constitution. The plaintiffs were represented by private counsel.\n\nIn October of 2001, then President George W. Bush authorized the NSA, in conjunction with ...","During the 2002 Winter Olympic Games, the NSA and FBI collected the content of every email and text message, as well the metadata of every telephone call, moving to and from individuals in Salt Lake City, UT. in 2015, a group of Salt Lake City area residents filed this suit in the U.S. District Court for Utah, challenging the legality of this surveillance program. The court has yet to rule on a motion to dismiss by the defendants.",Lawsuit accuses National Security Agency of spying on every cell phone user in Utah during the 2002 Winter Olympics.
27,JC-CA-0118,"[Case5:14-cv-05481-PSG Document1 Filed12/16/14 Page1 of 22\n\n1 Dan Stormer, Esq. [S.B. # 101967] Josh Piovia-Scott, Esq. [S.B. #222364]\n\n2 Mohammad Tajsar, Esq. [S.B. #280152]\n\nHADSELL STORMER & RENICK LLP 3 128 N. Fair Oaks Avenue\n\n4\n\nPasadena, California 91103 Telephone: (626) 585-9600\n\n5 Facsimile: (626) 577-7079\n\nEmails: dstormer@hadsellstormer.com\n\n6\n\njps@hadsellstormer.com\n\n7\n\nmtajsar@hadsellstormer.com\n\n8\n\nLori Rifkin, Esq. [S.B. # 244081] RIFKIN LAW OFFICE\n\n9 P.O. Box 19169 Oakland, California 94619\n10 Telephone: (415) 685-3591\nEmail: lrifkin@rifkinlawoffice.com 11\n\n12 Attorneys for Plaintiffs\n\n13\n\nUNITED STATES DISTRICT COURT\n\n14\n\nNORTHERN DISTRICT OF CALIFORNIA\n\n15\n\nEstate of JACOB PARENTI, deceased, by and through MATTHEW\n\nCASE NO:\n\n16 OBERHOLTZ; D.P-O., a minor, by\n\n17 18\n\nand through his guardian, MATTHEW OBERHOLTZ; and SUSAN PARENTI,\n\nCOMPLAINT FOR DAMAGES 1. Failure To Provide Medical Care In\nViolation Of Eighth A...","On December 16, 2014, the estate of an inmate who died in jail filed this lawsuit in the U.S. District Court for the Northern District of California. The plaintiffs sued Monterey County, the Monterey County Sheriff, the Monterey County Jail, and the California Forensic Medical Group under 42 U.S.C. §1983. Represented by private counsel, the plaintiffs sought monetary and injunctive relief, claiming a failure to provide medical care in violation of the Eighth and Fourteenth Amendments and a deprivation of substantive due process in violation of the First and Fourteenth Amendments. The plaintiffs also claimed negligence and wrongful death under California state law. Specifically, the plaintiffs claimed that the defendants left the inmate lying unconscious, helpless, and untreated in his bed to die from viral influenza syndrome complicated by pneumonia—a treatable condition. At the time of the inmate's death, the jail was already the subject of a class action lawsuit regarding systemi...","In 2014, the estate of an inmate who died in jail filed this lawsuit in the U.S. District Court for the Northern District of California. The plaintiffs sued Monterey County and other defendants under 42 U.S.C. §1983. The plaintiffs claimed a failure to provide medical care in violation of the Eighth and Fourteenth Amendments and a deprivation of substantive due process in violation of the First and Fourteenth Amendments. The parties settled on March 18, 2019 and case was dismissed on April 29, 2019.",Estate of inmate who died from the flu in jail sues jail for Constitutional violations and wrongful death (N.D. Cal.)


In [16]:
# Print the number of samples in each set
print(f"Train set: {len(train_dataset)} samples")
print(f"Validation set: {len(val_dataset)} samples")
print(f"Test set: {len(test_dataset)} samples")

Train set: 3177 samples
Validation set: 454 samples
Test set: 908 samples


In [17]:
train_df.info()  # Check data types of each column


<class 'pandas.core.frame.DataFrame'>
Index: 1129 entries, 7 to 3175
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             1129 non-null   object
 1   sources        1129 non-null   object
 2   summary/long   1129 non-null   object
 3   summary/short  1129 non-null   object
 4   summary/tiny   1129 non-null   object
dtypes: object(5)
memory usage: 52.9+ KB


In [18]:
# Ensure that columns containing lists or arrays are converted to strings
train_df['sources'] = train_df['sources'].apply(str)
train_df['summary/long'] = train_df['summary/long'].apply(str)
train_df['summary/short'] = train_df['summary/short'].apply(str)
train_df['summary/tiny'] = train_df['summary/tiny'].apply(str)


In [19]:
# Ensure that columns containing lists or arrays are converted to strings
val_df['sources'] = val_df['sources'].apply(str)
val_df['summary/long'] = val_df['summary/long'].apply(str)
val_df['summary/short'] = val_df['summary/short'].apply(str)
val_df['summary/tiny'] = val_df['summary/tiny'].apply(str)


In [20]:
# Ensure that columns containing lists or arrays are converted to strings
test_df['sources'] = test_df['sources'].apply(str)
test_df['summary/long'] = test_df['summary/long'].apply(str)
test_df['summary/short'] = test_df['summary/short'].apply(str)
test_df['summary/tiny'] = test_df['summary/tiny'].apply(str)


In [21]:

describe_df(train_df)
describe_df(val_df)
describe_df(test_df)




DataFrame shape: (1129, 5)

1,129 samples

Missing Data: 
id               0
sources          0
summary/long     0
summary/short    0
summary/tiny     0
dtype: int64

Duplicates: 0

Categorical Features: 
id, sources, summary/long, summary/short, summary/tiny

DataFrame shape: (161, 5)

161 samples

Missing Data: 
id               0
sources          0
summary/long     0
summary/short    0
summary/tiny     0
dtype: int64

Duplicates: 0

Categorical Features: 
id, sources, summary/long, summary/short, summary/tiny

DataFrame shape: (312, 5)

312 samples

Missing Data: 
id               0
sources          0
summary/long     0
summary/short    0
summary/tiny     0
dtype: int64

Duplicates: 0

Categorical Features: 
id, sources, summary/long, summary/short, summary/tiny


In [22]:
# Drop 'id' and 'sources' columns from the train dataset
train_df = train_df.drop(['id','sources'], axis=1)

# Drop 'id' and 'sources' columns from the validation dataset
val_df = val_df.drop(['id','sources'], axis=1)

# Drop 'id' and 'sources' columns from the test dataset
test_df = test_df.drop(['id','sources'], axis=1)


In [23]:

print("Long Summary:\n", train_df['summary/long'].iloc[1])
print("\nShort Summary:\n", train_df['summary/short'].iloc[1])
print("\nTiny Summary:\n", train_df['summary/tiny'].iloc[1])


Long Summary:
 On May 1, 2013, two men who were arrested for trespassing on property open to the public filed this lawsuit in the U.S. District Court for the Western District of Michigan. The plaintiffs sued the City of Grand Rapids, its chief of police, and two individual officers under 42 U.S.C. § 1983. The plaintiffs, represented by the National ACLU and ACLU of Michigan, asked the court for a declaratory judgment, damages, and injunctive relief concerning the use of "No Trespass Letters." The plaintiffs claimed that the City of Grand Rapids, its chief of police, and police officers violated their Fourth Amendment rights. 
  
Amendment. 

On December 3, 2013, the defendants filed a motion to dismiss the plaintiffs' claims for injunctive and declaratory relief. They argued that the plaintiffs lacked standing to seek declaratory or injunctive relief because they were not suffering an imminent threat of repeated future misconduct. Additionally, defendants argued that the plaintiffs' cl

In [24]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [25]:
print(model) # Visualizing model's architecture


BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [26]:
summarizer = pipeline('summarization', model = 'facebook/bart-large-cnn')


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [27]:
text='''On May 1, 2013, two men who were arrested for trespassing on property open to the public filed this lawsuit in the U.S. District Court for the Western District of Michigan. The plaintiffs sued the City of Grand Rapids, its chief of police, and two individual officers under 42 U.S.C. § 1983. The plaintiffs, represented by the National ACLU and ACLU of Michigan, asked the court for a declaratory judgment, damages, and injunctive relief concerning the use of "No Trespass Letters." The plaintiffs claimed that the City of Grand Rapids, its chief of police, and police officers violated their Fourth Amendment rights.

According to the amended complaint, the Grand Rapids Police Department ("GRPD") had arrested individuals for trespassing based on a City trespass ordinance, under which the City solicits No Trespass Letters from area businesses indicating their intent to prosecute trespassers. The plaintiffs alleged they were arrested for trespassing while sitting in their vehicles in a business's parking lot, without any warning or complaint from the business itself. The plaintiffs claimed their arrests were without probable cause and therefore in violation of the Fourth Amendment. Additionally, the plaintiffs claimed that the ordinance violated the void-for-vagueness doctrine of the Due Process Clause of the Fourteenth
Amendment.

On December 3, 2013, the defendants filed a motion to dismiss the plaintiffs' claims for injunctive and declaratory relief. They argued that the plaintiffs lacked standing to seek declaratory or injunctive relief because they were not suffering an imminent threat of repeated future misconduct. Additionally, defendants argued that the plaintiffs' claims for declaratory and injunctive relief were not ripe because the City had changed the No Trespass Letters following the plaintiffs' filling of the Complaint. Defendants did not challenge the Court's jurisdiction to hear the plaintiffs' claim for damages.

On August 4, 2014, the Judge Paul L. Maloney dismissed the claims of one of the plaintiffs against the defendants according to the parties' stipulation. However, the other plaintiffs' claims remained.

While the motion to dismiss was still pending, the defendants and the plaintiffs both filed motions for summary judgment. In April 2015, the court postponed the trial date pending the resolution of multiple motions made by both the defendants and the plaintiffs.

Following a hearing on the defendants' motion to dismiss, as well as both parties' motions for summary judgment, Judge Maloney granted the defendants' motion to dismiss the plaintiffs' claims for injunctive and declaratory relief on June 21, 2017. 256 F.Supp.3d 742. The Court held that, because the plaintiffs had not alleged sufficient facts to present a threat of an imminent, as opposed to a speculative, injury, they lacked standing to seek declaratory and injunctive relief. The Court also noted that the City's changes to the No Trespass Letters following the filing of this case rendered the requests for prospective relief unripe. The Court did not, however, grant the defendants' motion to dismiss the plaintiffs' claim for damages.

Notably, the Michigan Court of Appeals addressed the same issue of whether the Grand Rapids ordinance is constitutional and held that it is unconstitutional. People v. Maggitt, 903 N.W.2d 868 (Mich. Ct. App. 2017). Following the Michigan Court of Appeals' ruling, the City effectively ended its practice of arresting individuals pursuant to the No Trespass Letters.

On October 17, 2018, Judge Maloney held that both sides were entitled to partial summary judgment. 407 F.Supp.3d 707.

The Court granted summary judgment for plaintiffs on their municipal liability claim against the City of Grand Rapids. The Court found that the City had an unconstitutional policy or custom whereby police officers arrested individuals for trespassing on property covered by a no-trespass letter without first informing the suspect that he or she must leave the property. However, the Court found that the plaintiffs had failed to show that the City's trespass ordinance was unconstitutionally vague.'''
summary1 = summarizer(text)
print(summary1)

[{'summary_text': 'On May 1, 2013, two men who were arrested for trespassing on property open to the public filed this lawsuit in the U.S. District Court for the Western District of Michigan. The plaintiffs sued the City of Grand Rapids, its chief of police, and two individual officers under 42 U.N.C. § 1983. On October 17, 2018, Judge Maloney held that both sides were entitled to partial summary judgment.'}]


In [28]:
# Filter out rows where 'summary/long' or 'summary/short' are None
train_dataset = train_dataset.filter(lambda example: example['summary/long'] is not None and example['summary/short'] is not None)
val_dataset = val_dataset.filter(lambda example: example['summary/long'] is not None and example['summary/short'] is not None)
test_dataset = test_dataset.filter(lambda example: example['summary/long'] is not None and example['summary/short'] is not None)


Filter:   0%|          | 0/3177 [00:00<?, ? examples/s]

Filter:   0%|          | 0/454 [00:00<?, ? examples/s]

Filter:   0%|          | 0/908 [00:00<?, ? examples/s]

In [29]:
def preprocess_function(examples):
    # Ensure that the source text is 'summary/long' and the target is 'summary/short'
    # We skip rows with None or NaN values
    inputs = [doc for doc in examples["summary/long"] if doc is not None]
    targets = [summary for summary in examples["summary/short"] if summary is not None]

    # Tokenize the input text (summary/long)
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Setup the tokenizer for targets (summary/short)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/2210 [00:00<?, ? examples/s]



Map:   0%|          | 0/312 [00:00<?, ? examples/s]

Map:   0%|          | 0/616 [00:00<?, ? examples/s]

In [30]:
# Printing results
print('\n' * 3)
print('Preprocessed Training Dataset:\n')
print(train_dataset)
print('\n' * 2)
print('Preprocessed Test Dataset:\n')
print(val_dataset)
print('\n' * 2)
print('Preprocessed Validation Dataset:\n')
print(test_dataset)





Preprocessed Training Dataset:

Dataset({
    features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2210
})



Preprocessed Test Dataset:

Dataset({
    features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 312
})



Preprocessed Validation Dataset:

Dataset({
    features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 616
})


In [31]:
# Selecting a sample from the dataset
sample = train_dataset[0]

# Printing its features
print("input_ids:")                    # These are the token IDs
print(sample['input_ids'])
print("\n")
print("attention_mask:")               # indicates which tokens the model should pay attention to and which tokens should be ignored
print(sample['attention_mask'])
print("\n")
print("sample:")                       # token IDs obtained from the words and subwords in the summaries
print(sample['labels'])
print("\n")

input_ids:
[0, 4148, 772, 379, 6, 4013, 6, 5, 25235, 17820, 19469, 1463, 36, 9993, 4571, 43, 1658, 3235, 136, 446, 9, 3378, 6, 603, 482, 15, 4137, 9, 41, 3200, 54, 21, 2346, 2277, 142, 79, 21, 5283, 4, 9402, 5775, 8, 30272, 879, 32740, 3500, 13, 5, 3200, 36, 8529, 776, 1880, 6, 4660, 13, 3722, 4798, 6, 8, 21987, 8357, 238, 5, 27397, 4571, 1146, 3235, 223, 13497, 33559, 9, 5, 5280, 3941, 1783, 9, 17616, 13, 12286, 6886, 15, 5, 1453, 9, 2099, 4, 20, 27397, 4571, 67, 2952, 7, 5312, 63, 1042, 4, 50118, 50118, 42567, 940, 4778, 6, 5, 3200, 1658, 10, 4298, 7, 13192, 11, 5, 3235, 6, 61, 21, 6885, 4159, 71, 5, 675, 13, 3386, 16006, 1595, 396, 1160, 4, 20, 3200, 1146, 1449, 223, 13497, 33559, 8, 194, 488, 8, 2952, 12246, 5, 276, 3500, 25, 5, 27397, 4571, 6, 4682, 14, 5, 3674, 4010, 2952, 9493, 29963, 4, 50118, 50118, 42280, 5, 1799, 376, 7, 10, 4221, 1288, 6, 61, 5, 837, 36, 40145, 8051, 118, 229, 4, 5620, 387, 3876, 43, 2867, 25, 10, 7132, 20156, 15, 1133, 158, 6, 2338, 4, 20, 1110, 9, 5, 2015

In [32]:
# Instantiating Data Collator to batch the data
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [33]:
# Loading ROUGE Score
metric = evaluate.load('rouge')

In [34]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred  # Obtaining predictions and true labels

    # Decoding predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Handling masked tokens in labels (i.e., label = -100)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute the Rouge score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Check if `result` has the expected structure, and safely access fmeasure
    if isinstance(result, dict):
        result = {key: (value['fmeasure'] * 100 if isinstance(value, dict) and 'fmeasure' in value else value)
                  for key, value in result.items()}
    else:
        print("Unexpected result structure:", result)

    # Add mean-generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [39]:
# Defining parameters for training
training_args = Seq2SeqTrainingArguments(
    output_dir = 'Bart_law',
    evaluation_strategy = "epoch",
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss',
    seed = 42,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

In [40]:
from transformers import EarlyStoppingCallback


In [42]:
# Defining Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [43]:
# Training model
trainer.train()

  0%|          | 0/1104 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'eval_loss': 1.0617969036102295, 'eval_rouge1': 0.5686, 'eval_rouge2': 0.3818, 'eval_rougeL': 0.4518, 'eval_rougeLsum': 0.5264, 'eval_gen_len': 120.1607, 'eval_runtime': 14041.7482, 'eval_samples_per_second': 0.044, 'eval_steps_per_second': 0.011, 'epoch': 1.0}


KeyboardInterrupt: 