In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# **Problem 2: Sentiment Analysis Using BERT**

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, AutoModel
from torch.utils.data import DataLoader, TensorDataset
from datasets import Dataset
from evaluate import load
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# import train and test data
train_df = pd.read_csv('/twitter_training.csv',
                 header=None,
                 names=['id', 'entity', 'sentiment', 'text'])
test_df = pd.read_csv('/twitter_validation.csv',
                 header=None,
                 names=['id', 'entity', 'sentiment', 'text'])

# create labels to input for the model
labels = list(train_df['sentiment'].unique())
id2label = {k:v for k,v in enumerate(labels)}
label2id = {v:k for k,v in enumerate(labels)}
train_df['label'] = train_df['sentiment'].map(label2id)
test_df['label'] = test_df['sentiment'].map(label2id)

### Pre-trained Bert Model

In [6]:
from sklearn.linear_model import LogisticRegression

# load the pre-trained bert model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModel.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment",num_labels=4)

# validation/test dataset
X_test = test_df['text'].astype(str).tolist()
y_test = test_df['label']

# tokenize the text data
encoded_inputs = tokenizer(X_test, padding=True, truncation=True, return_tensors="pt")

# CLS embeddings
with torch.no_grad():
    outputs = model(**encoded_inputs)
    cls_embeddings = outputs.last_hidden_state[:, 0, :].numpy()

# logistic regression
clf = LogisticRegression()
clf.fit(cls_embeddings, y_test)

# predict and generate classification report
from sklearn.metrics import classification_report

predicted_labels = clf.predict(cls_embeddings)
report = classification_report(y_test, predicted_labels, target_names=labels)
print(report)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


              precision    recall  f1-score   support

    Positive       0.88      0.86      0.87       277
     Neutral       0.80      0.82      0.81       285
    Negative       0.83      0.90      0.86       266
  Irrelevant       0.83      0.70      0.76       172

    accuracy                           0.83      1000
   macro avg       0.83      0.82      0.83      1000
weighted avg       0.83      0.83      0.83      1000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Fine Tuned Bert Model

In [None]:
# create a seperate validation dataset to prevent overfitting
train = train_df.sample(frac=0.9, random_state=42)
valid = train_df.drop(train.index)

# convert pandas dataframes into hugging face dataframe
train_ds = Dataset.from_pandas(train)
valid_ds = Dataset.from_pandas(valid)
test_ds = Dataset.from_pandas(test_df)

# use accuracy
metric = load("accuracy")
def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions,
                        references=labels)

# load the pre-trained bert model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# make sure input sequences in a batch have the same length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# extract text in dataframe and tokenize it
def preprocess_function(data):
    text = [str(text) for text in data["text"]]
    return tokenizer(text, truncation=True, max_length=128)

train_ds = train_ds.map(preprocess_function, batched=True)
valid_ds = valid_ds.map(preprocess_function, batched=True)
test_ds = test_ds.map(preprocess_function, batched=True)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

  state_dict = torch.load(resolved_archive_file, map_location="cpu")
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/67214 [00:00<?, ? examples/s]

Map:   0%|          | 0/7468 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
training_args = TrainingArguments(
    # directory where model checkpoints are saved
    output_dir="twitter-sentiment-detector",
    # learning rate
    learning_rate=0.00002,
    # batch size for training for each gpu
    per_device_train_batch_size=32,
    # batch size for evaluation for each gpu
    per_device_eval_batch_size=32,
    # 5 epochs
    num_train_epochs=5,
    # evaluate the model at the end of epoch
    evaluation_strategy="epoch",
    # save model checkpoint at the end of epoch
    save_strategy="epoch",
    # disable logging into external services
    report_to="none",
    # regularization to prevent overfitting
    weight_decay=0.01,
)

trainer = Trainer(
    # sentiment classification model
    model=model,
    # training arguments
    args=training_args,
    # train dataset
    train_dataset=train_ds,
    # validation dataset
    eval_dataset=valid_ds,
    # tokenizer
    tokenizer=tokenizer,
    # handling padding of sequences
    data_collator=data_collator,
    # using accuracy as the metric
    compute_metrics=compute_metrics,
)

# train it
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentiment, entity, id, text, __index_level_0__.
***** Running training *****
  Num examples = 67214
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 10505


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6466,0.555471,0.788297
2,0.3291,0.336222,0.88203
3,0.1959,0.305566,0.903321
4,0.1369,0.319003,0.912694
5,0.0987,0.337188,0.912694


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentiment, entity, id, text, __index_level_0__.
***** Running Evaluation *****
  Num examples = 7468
  Batch size = 32
Saving model checkpoint to twitter-sentiment-detector/checkpoint-2101
Configuration saved in twitter-sentiment-detector/checkpoint-2101/config.json
Model weights saved in twitter-sentiment-detector/checkpoint-2101/pytorch_model.bin
tokenizer config file saved in twitter-sentiment-detector/checkpoint-2101/tokenizer_config.json
Special tokens file saved in twitter-sentiment-detector/checkpoint-2101/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentiment, entity, id, text, __index_level_0__.
***** Running Evaluation *****
  Num examples = 7468
  Batch size = 32
Saving model checkpoint to twitter-sen

TrainOutput(global_step=10505, training_loss=0.32495463856171, metrics={'train_runtime': 1314.5186, 'train_samples_per_second': 255.66, 'train_steps_per_second': 7.992, 'total_flos': 1.3795241546318544e+16, 'train_loss': 0.32495463856171, 'epoch': 5.0})

In [None]:
# predict by extracting the class labels
predictions = trainer.predict(test_ds).predictions.argmax(axis=1)

# true label
y_true = test_ds["label"]

# classification report
print(classification_report(y_true, predictions, target_names=labels))

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: entity, sentiment, text, id.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 32


              precision    recall  f1-score   support

    Positive       0.94      0.99      0.96       277
     Neutral       0.96      0.95      0.96       285
    Negative       0.97      0.98      0.98       266
  Irrelevant       0.99      0.93      0.96       172

    accuracy                           0.96      1000
   macro avg       0.97      0.96      0.96      1000
weighted avg       0.96      0.96      0.96      1000



### Analysis

The pre-trained BERT model is used as a feature extractor to transform text into CLS embeddings—numerical features with semantic information. These embeddings are suitable for machine learning models, such as logistic regression, to predict sentiment. The F1-scores across all classes exceed 0.8, demonstrating the pre-trained model's adequacy for sentiment analysis.

For the fine-tuning model, the pre-trained cardiffnlp/twitter-roberta-base-sentiment model is used. It is a roBERTa-base model trained on ~58M tweets and finetuned for sentiment analysis with the TweetEval benchmark. The f1-score for the fine-tuned model is higher by 0.1 compared to the pre-trained model, which is quite a difference.

Because fine-tuning models adjust the weights specifically for the dataset and understands task-specific nuances, it generally yields a higher precision and recall score.

# **Problem 3: News Text Summarization with T5**

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
pip install 'pip<24.1'

Collecting pip<24.1
  Downloading pip-24.0-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.1 MB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m1.3/2.1 MB[0m [31m19.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.0


In [None]:
!pip install simplet5

[33mDEPRECATION: pytorch-lightning 1.5.10 has a non-standard dependency specifier torch>=1.7.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [None]:
import pandas as pd
import torch
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np
import pandas as pd

# import train and test data
train_df = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')
validation_df = pd.read_csv('/content/validation.csv')

# reducing dataset because of time and low computing units
train_df = train_df.sample(frac=0.1, random_state=42)
test = test.sample(frac=0.01, random_state=42)
validation_df = validation_df.sample(frac=0.1, random_state=42)

def preprocess_text(row):
    # add summarize
    row['article'] = "summarize: " + row["article"]
    return row

# apply preprocess_text
train_dataset = train_df.apply(preprocess_text, axis=1)
val_dataset = validation_df.apply(preprocess_text, axis=1)
test = test.apply(preprocess_text, axis=1)

In [None]:
from simplet5 import SimpleT5
model = SimpleT5()

# pretrained T5 model
model.from_pretrained(model_type="t5", model_name="t5-base")

# rename the columns
train_dataset.rename(columns={"article": "source_text",
                              "highlights": "target_text"}, inplace=True)
val_dataset.rename(columns={"article": "source_text",
                            "highlights": "target_text"}, inplace=True)

# train
model.train(train_df=train_dataset,
            eval_df=val_dataset,
            source_max_token_len=300,
            target_max_token_len=100,
            batch_size=16, max_epochs=2, use_gpu=True,
            early_stopping_patience_epochs=2)

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

  state_dict = torch.load(resolved_archive_file, map_location="cpu")
  from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook
INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.seed:Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved. New best score: 1.668


Validating: 0it [00:00, ?it/s]

In [None]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
[33mDEPRECATION: pytorch-lightning 1.5.10 has a non-standard dependency specifier torch>=1.7.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# load model and tokenizer using the previous model
model_path = "/content/outputs/simplet5-epoch-1-train-loss-1.7162-val-loss-1.6853"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Maximum input length for the model
max_input_length = 512
max_output_length = 150 

# test data
test_articles = test['article'].tolist()
highlights = test['highlights'].tolist()

# list for generated summaries
text_summaries = []

# generating summaries
for article in test_articles:
    # tokenize
    inputs = tokenizer.encode(
        article, max_length=max_input_length, truncation=True, return_tensors="pt"
    )

    # summary
    summary_ids = model.generate(
        inputs,
        max_length=max_output_length,
        num_beams=4,
        early_stopping=True
    )

    # decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    text_summaries.append(summary)

  state_dict = torch.load(resolved_archive_file, map_location="cpu")


In [None]:
pip install rouge-score evaluate

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24951 sha256=879c8d2ba93925b8a18ccc54a2b14c9c3e3982af2aab33dacb8003ec816b6e68
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
[33mDEPRECATION: pytorch-lightning 1.5.10 has a non-standard dependency specifier torch>=1.7.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: rouge-score
Successfully inst

In [None]:
from evaluate import load

# rouge metric
rouge = load("rouge")

# compute 
results = rouge.compute(predictions=text_summaries, references=highlights)

# results
for metric, score in results.items():
    print(f"{metric}: {score:.4f}")

rouge1: 0.4033
rouge2: 0.1878
rougeL: 0.2860
rougeLsum: 0.3514


### Analysis

ROUGE-1 (0.4033)

Rouge 1 is based on unigram precision and recall to measure summarization quality. The score 0.4 shows a decent accuracy in generating important keywords but fails to miss other content.

ROUGE-2 (0.1878)

Rouge 2 is same as rouge 1 but is based on bigrams. The significant drop in rouge 2 when compared to rouge 1 may indicate the model struggles with coherence or context captured by bigrams.

ROUGE-L (0.2860)

Rouge L doesn't compare n-grams but instead treats each summary as a sequence of words and then searches for the longest common subsequence. Score of 0.286 suggests the model is adequate in maintaining the sequence and structure of key content but needs improvement for coherency and readability.

ROUGE-Lsum (0.3514)

Rouge-Lsum splits the summaries into sentences and performs rouge L calculations for each sentence individually. Higher score in rouge lsum when compared with rouge L indicates the generated summary is better at sentence level than overall text level.

The T5 summarization model performs well in generating salient keywords and each single sentences but contains weaknesses in generating relevant phrases and global summarization quality.