In [26]:
!pip install emoji
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
import emoji
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive so later cells can write results under /content/drive
drive.mount('/content/drive')

# Load dataset, keeping at most the first 10,000 rows
reviews = pd.read_csv("/content/prompts.csv").head(10000)

# Preprocessing resources: fetch the NLTK data packages used by this notebook,
# then build the English stop-word set used when filtering tokens.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words('english'))




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [27]:
# Slang conversion function
def convert_slang(text):
    """Expand common chat/SMS abbreviations in ``text``.

    The text is split on whitespace and every token whose core (the token with
    leading/trailing punctuation stripped) is a key of the slang table is
    replaced by its expansion. Surrounding punctuation is preserved, so
    ``"gr8!"`` becomes ``"great!"``. Matching is case-sensitive: the keys are
    lowercase, so callers should lowercase the text first.

    Args:
        text: The string to process. Non-string values (e.g. ``None`` or the
            float NaN pandas uses for missing text) are treated as empty.

    Returns:
        The text with slang expanded and tokens re-joined by single spaces,
        or ``""`` when ``text`` is not a string.
    """
    # Missing values reach this function as float NaN when it is applied to a
    # pandas column; return an empty string instead of failing on ``.split``.
    if not isinstance(text, str):
        return ""

    slang_dict = {
        'u': 'you', 'r': 'are', 'ur': 'your', 'n': 'and', 'l8r': 'later',
        'gr8': 'great', 'b4': 'before', '2nite': 'tonight', 'plz': 'please',
        'thx': 'thanks', 'omg': 'oh my god', 'btw': 'by the way',
        'bff': 'best friends forever', 'idk': "I don't know", 'imo': 'in my opinion', 'lol': 'laugh out loud'
    }

    expanded = []
    for word in text.split():
        # Separate the token into leading punctuation, core, and trailing
        # punctuation so that "omg," or "(thx)" still match the table.
        without_prefix = word.lstrip(string.punctuation)
        prefix = word[:len(word) - len(without_prefix)]
        core = without_prefix.rstrip(string.punctuation)
        suffix = without_prefix[len(core):]
        expanded.append(prefix + slang_dict.get(core, core) + suffix)
    return ' '.join(expanded)

In [28]:
# Emoji conversion function
def convert_emojis(text):
    """Return ``text`` with emojis replaced by their textual names.

    Thin wrapper that delegates to ``emoji.demojize`` and returns its result
    unchanged.
    """
    demojized = emoji.demojize(text)
    return demojized

# Apply preprocessing
# Build the cleaned 'Text' column from the raw 'act' column in a single chain:
#   1. replace missing values with "" and force string dtype, so the helper
#      functions below never receive a float NaN (such rows end up empty and
#      are removed by the later "Remove empty rows" step),
#   2. lowercase, because the slang table only contains lowercase keys,
#   3. expand slang, then convert emojis to text.
# NOTE(review): assumes the text column of prompts.csv is named 'act' - confirm.
reviews['Text'] = (
    reviews['act']
    .fillna('')
    .astype(str)
    .str.lower()
    .apply(convert_slang)
    .apply(convert_emojis)
)

In [29]:
# Tokenization, stopword removal, and punctuation removal
tqdm.pandas()

# Set of single punctuation characters. A token is treated as punctuation when
# EVERY character is in this set; the previous `token not in string.punctuation`
# was a substring test against one long string, so multi-character tokens such
# as "...", "``", "''" or "--" slipped through.
punctuation_chars = set(string.punctuation)


def _clean_tokens(text):
    """Tokenize ``text`` and re-join it without stop words or punctuation tokens.

    Args:
        text: A (lowercased) string to tokenize with ``word_tokenize``.

    Returns:
        The remaining tokens joined by single spaces ("" if nothing is left).
    """
    kept = [
        token
        for token in word_tokenize(text)
        if token not in stop_words
        and not all(ch in punctuation_chars for ch in token)
    ]
    return ' '.join(kept)


# Single pass over the column (tokenize -> filter -> join) with a progress bar.
reviews['Text'] = reviews['Text'].progress_apply(_clean_tokens)


100%|██████████| 170/170 [00:00<00:00, 7837.84it/s]


In [30]:
# Remove empty rows
# Keep only rows whose cleaned text is present (not NaN) and not the empty string.
has_text = reviews['Text'].notna() & (reviews['Text'] != '')
reviews = reviews[has_text]

In [31]:
# Split data into training and validation sets
# 10% of the rows are held out for validation; random_state is fixed at 42.
split_frames = train_test_split(reviews, test_size=0.1, random_state=42)
train_df, val_df = split_frames

In [32]:
# Load PEGASUS tokenizer and model
# Both objects are created from the same pretrained checkpoint name so the
# tokenizer vocabulary matches the model.
model_name = "google/pegasus-xsum"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Summarization function
# (Single definition: an earlier truncated copy of this function, which
# returned None, has been folded into this one.)
def summarize_text(text, tokenizer, model, max_sentences=2, max_input_length=512,
                   max_summary_length=50, num_beams=6):
    """Generate a short summary of ``text`` with a seq2seq model.

    Only the leading sentences of the text are summarized: the text is split
    with ``sent_tokenize``, the first ``max_sentences`` sentences are re-joined
    and passed to the model.

    Args:
        text: The text to summarize.
        tokenizer: Tokenizer matching ``model``; called to encode the input and
            used to decode the generated ids.
        model: Model exposing ``generate``; if it has a ``device`` attribute the
            encoded inputs are moved to that device first.
        max_sentences: Number of leading sentences to keep (``None`` keeps all).
        max_input_length: Maximum number of input tokens; longer inputs are
            truncated.
        max_summary_length: ``max_length`` passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        The decoded summary string, or ``""`` when ``text`` is not a string or
        contains no non-whitespace characters.
    """
    # Nothing to summarize: avoid running generation on an empty prompt, which
    # would return text unrelated to any input.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Extract key sentences: keep only the leading sentences of the input.
    key_sentences = sent_tokenize(text)[:max_sentences]
    refined_input = " ".join(key_sentences)

    # Encode via the tokenizer's __call__ so the attention mask is produced
    # alongside the input ids. No "summarize: " task prefix is added: that
    # prefix is a T5 convention, and here it would simply be treated as part
    # of the document being summarized.
    inputs = tokenizer(
        refined_input, return_tensors="pt", max_length=max_input_length, truncation=True
    )

    # Keep inputs on the same device as the model (no-op for a CPU model).
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    summary_ids = model.generate(
        **inputs,
        max_length=max_summary_length,
        num_beams=num_beams,
        length_penalty=1.0,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [35]:
# Generate summaries
# Summarize every training row, best-effort: a failure on one row is reported
# and recorded as an empty summary so the list stays aligned with train_df.
# NOTE(review): 'Text' has already had punctuation and stop words removed by the
# earlier preprocessing cells, so the sentence splitting inside summarize_text
# probably sees a single long "sentence" - consider summarizing the raw column.
generated_summaries = []
for text in tqdm(train_df['Text']):
    try:
        summary = summarize_text(text, tokenizer, model)
    except Exception as err:
        print(f"Error summarizing text: {text[:50]}... | Error: {err}")
        generated_summaries.append("")
    else:
        generated_summaries.append(summary)

100%|██████████| 153/153 [22:16<00:00,  8.73s/it]


In [36]:
# Add summaries to DataFrame
# The summaries list is positional, so it must have exactly one entry per row.
assert len(generated_summaries) == len(train_df), "summary count does not match train_df rows"
# Use assign() to build a new frame rather than writing a column into train_df
# in place: train_df was produced by train_test_split, and in-place column
# assignment on such a derived frame can trigger pandas' SettingWithCopyWarning.
# Re-running this cell simply overwrites the column.
train_df = train_df.assign(GeneratedSummary=generated_summaries)

In [37]:
# Save results to CSV
# Write the training frame to Google Drive, omitting the index column.
output_path = "/content/drive/MyDrive/fine_tuned_reviews_with_summaries.csv"
train_df.to_csv(path_or_buf=output_path, index=False)
print(f"Summaries saved to {output_path}")

Summaries saved to /content/drive/MyDrive/fine_tuned_reviews_with_summaries.csv


In [38]:
# Save model and tokenizer to the same Drive directory for reuse
# NOTE(review): no training/fine-tuning step is visible in these cells, so the
# saved weights appear to be the pretrained checkpoint despite the
# "fine_tuned" directory name - confirm before relying on it.
model_dir = "/content/drive/MyDrive/fine_tuned_pegasus_model"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)



('/content/drive/MyDrive/fine_tuned_pegasus_model/tokenizer_config.json',
 '/content/drive/MyDrive/fine_tuned_pegasus_model/special_tokens_map.json',
 '/content/drive/MyDrive/fine_tuned_pegasus_model/spiece.model',
 '/content/drive/MyDrive/fine_tuned_pegasus_model/added_tokens.json')

In [39]:
# Test with an example input
# Smoke test: summarize one raw (un-preprocessed) review and print the result.
input_text = "If you are looking for the secret ingredient in Robitussin I believe I have found it. I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda. The flavor is very medicinal."
generated_summary = summarize_text(text=input_text, tokenizer=tokenizer, model=model)
print("Generated Summary:", generated_summary)

Generated Summary: If you are looking for the secret ingredient in Robitussin I believe I have found it.
