In [None]:
!pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub --upgrade

In [None]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from datasets import load_dataset
import evaluate
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
from nltk.corpus import stopwords
from nltk.translate.bleu_score import corpus_bleu
from sklearn.metrics import f1_score

In [None]:
# Loading dataset
# The object returned here is indexed by split name in the following cells (dataset["train"]).
dataset = load_dataset("toughdata/quora-question-answer-dataset")

In [None]:
# Show the dataset object and the schema of its "train" split.
print(f"Dataset structure: {dataset}")

print("Dataset columns and sample data:")
print(dataset["train"].column_names)
print(dataset["train"].features)

# Materialise the train split as a pandas DataFrame for exploration and cleaning.
df = dataset["train"].to_pandas()

# Analyzing the structure and content of the dataset
print("Dataset structure:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in print()
# would append a stray "None" line to the output.
df.info()
print("\nDataset description:")
print(df.describe())

# Checking for missing values
print("\nMissing Values in Data:")
print(df.isnull().sum())


In [None]:
# Lower-case English contractions mapped to a single expansion each.
# Ambiguous contractions (e.g. "it's" = "it is" / "it has") are mapped to one reading only:
# an expansion written as "a / b" would be injected verbatim into the text and, once the
# cleaning step replaces "/" with a space, leave duplicated words such as "it has it is".
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that had",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [None]:
import re

def expand_contractions(text, contractions_dict):
    """Replace every contraction found in ``text`` with its expansion.

    Matching is case-insensitive ("Don't" and "don't" are both expanded) and
    only whole tokens are replaced: a contraction must not be directly
    preceded or followed by a word character.

    Args:
        text: Input string. Any falsy value (``None``, ``""``) yields ``""``.
        contractions_dict: Mapping of contraction -> expansion. The expansion
            is inserted exactly as written in the mapping.

    Returns:
        The text with all known contractions expanded.
    """
    if not text:
        return ""

    # Keys are matched case-insensitively, so index the expansions by lower-cased key.
    expansions = {key.lower(): value for key, value in contractions_dict.items() if key}
    if not expansions:
        return text

    # Longest keys first: otherwise "can't" would rewrite the head of "can't've"
    # and the longer contraction could never match.
    alternation = "|".join(
        re.escape(key) for key in sorted(expansions, key=len, reverse=True)
    )
    # Look-arounds instead of \b so that keys starting with an apostrophe
    # ("'cause") also match at the beginning of a word.
    pattern = re.compile(r"(?<!\w)(?:{})(?!\w)".format(alternation), re.IGNORECASE)

    def _expand(match):
        found = match.group(0)
        # Fall back to the matched text for the rare case-folding matches whose
        # lower-cased form is not a key.
        return expansions.get(found.lower(), found)

    # A single pass over the text replaces the original one-regex-per-key loop.
    return pattern.sub(_expand, text)

# Text preprocessing function
def text_preprocessing(text):
    """Normalise one raw question/answer string for modelling.

    Steps, in order: lower-case, expand contractions, drop HTML tags, replace
    every character other than ``a-z``, apostrophe and space with a space, and
    collapse runs of whitespace.

    Args:
        text: Raw text; missing values (``None``/NaN) are accepted.

    Returns:
        The cleaned string, or ``''`` for missing input.
    """
    if pd.isnull(text):  # Checking if the text is NaN
        return ''

    # Lower-case BEFORE expanding: the contraction keys are lower-case, so
    # capitalised forms ("Don't", "I'm") would otherwise be left unexpanded.
    text = str(text).lower()

    # Expanding contractions; lower-case again because some expansions contain
    # capitals ("I am") that the character filter below would otherwise strip.
    text = expand_contractions(text, contractions_dict).lower()

    # Removing HTML tags; substitute a space so the words on either side of a
    # tag (e.g. "first<br>second") are not fused together.
    text = re.sub(r'<.*?>', ' ', text)

    # Replacing non-alphabetic characters with spaces, but keep apostrophes intact
    text = re.sub(r'[^a-z\' ]', ' ', text)

    # Removing extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Function to preprocess texts
def preprocess_texts(df):
    """Return a copy of ``df`` whose 'question' and 'answer' columns are cleaned.

    The input frame is left untouched so the raw text stays available for
    comparison; previously the columns were overwritten in place, which made
    the caller's "raw" frame and the returned frame the same object.

    Args:
        df: DataFrame with 'question' and 'answer' columns.

    Returns:
        A new DataFrame with both columns passed through ``text_preprocessing``.
    """
    cleaned = df.copy()
    for column in ('question', 'answer'):
        cleaned[column] = cleaned[column].apply(text_preprocessing)
    return cleaned

# Cleaned question/answer frame; the plotting and training cells below read from df_train.
df_train = preprocess_texts(df)

In [None]:
# Preview a few cleaned rows (rich display) rather than printing the whole frame.
df_train.head()

In [None]:
# Sanity check of the text cleaning on one example row.
SAMPLE_INDEX = 7

# Read the untouched text straight from the loaded dataset so "Original" really is the raw
# question. This cell must run before the later cell that rebinds `dataset` to the
# train/test split.
orig = dataset["train"][SAMPLE_INDEX]["question"]
sample_text = df_train['question'].iloc[SAMPLE_INDEX]

# Expand the raw text: expanding the already-cleaned text would show no difference.
expanded_text = expand_contractions(orig, contractions_dict)
print(f"Original: {orig}")
print(f"Expanded: {expanded_text}")
print(f"Preprocessed: {sample_text}")

In [None]:
# Data Visualization
# Histogram of question lengths, measured in whitespace-separated tokens
df_train['question_length'] = df_train['question'].map(lambda q: len(q.split()))

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df_train['question_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Question Lengths')
ax.set_xlabel('Length of Questions')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of answer lengths, measured in whitespace-separated tokens
df_train['answer_length'] = df_train['answer'].map(lambda a: len(a.split()))

# Turn +/-inf into NaN, then drop every row that contains a NaN
without_inf = df_train.replace([np.inf, -np.inf], np.nan)
df_train = without_inf.dropna()

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df_train['answer_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Answer Lengths')
ax.set_xlabel('Length of Answers')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus
nltk.download('stopwords')

# Stop words come from two sources (NLTK and WordCloud) and are merged into one set
nltk_stopwords = set(stopwords.words('english'))
wordcloud_stopwords = set(STOPWORDS)
stop_words = nltk_stopwords | wordcloud_stopwords

# Word cloud built from all questions joined into a single string
text = ' '.join(df_train['question'])
wordcloud = WordCloud(
    stopwords=stop_words,
    width=800,
    height=400,
    background_color='white',
).generate(text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud for Questions')
plt.show()

In [None]:
# Word cloud built from all answers joined into a single string
text = ' '.join(df_train['answer'])
wordcloud = WordCloud(
    stopwords=stop_words,
    width=800,
    height=400,
    background_color='white',
).generate(text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud for Answers')
plt.show()

In [None]:
# Show the container types of the cleaned frame and of the loaded dataset, one per line
print(type(df_train), type(dataset), sep="\n")

In [None]:
# Split the dataset
from datasets import Dataset

# Keep only the two text columns; the length columns were added for plotting only.
df_train = df_train[['question', 'answer']]

# preserve_index=False: rows may have been dropped earlier, and a non-default pandas index
# would otherwise be carried into the Dataset as an extra column.
dataset = Dataset.from_pandas(df_train, preserve_index=False)

# Fixed seed so the train/test partition is the same on every run.
dataset = dataset.train_test_split(test_size=0.25, seed=42)

# Pretrained FLAN-T5 tokenizer and model, plus the collator that batches examples for them.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Show the dataset object produced by train_test_split in the previous cell.
print(dataset)

In [None]:
prefix = "answer the question: "

# Preprocessing function for model
def preprocess_function(examples):
    """Tokenize one batch of question/answer pairs.

    Each question is prepended with ``prefix`` and tokenized (truncated at 128
    tokens); each answer is tokenized as the target text (truncated at 512
    tokens) and its token ids are stored under "labels".
    """
    prompts = [prefix + question for question in examples["question"]]
    encoded = tokenizer(prompts, max_length=128, truncation=True)
    targets = tokenizer(text_target=examples["answer"], max_length=512, truncation=True)
    encoded["labels"] = targets["input_ids"]
    return encoded

# Apply the tokenization batch-wise to the split dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Tokenizer data needed by nltk.word_tokenize (used for BLEU below). Older NLTK releases
# look up "punkt"; newer ones look up "punkt_tab" and raise LookupError without it, so
# fetch both.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)
metric = evaluate.load("rouge")

from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import f1_score

# Initializing ROUGE scorer
# NOTE: this rebinds the name `rouge_scorer` from the imported module to a RougeScorer
# instance; from here on `rouge_scorer.score(...)` refers to the instance, and the module
# is no longer reachable under this name.
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def compute_metrics(eval_preds):
    """Compute averaged ROUGE, BLEU and token-level F1 for generated answers.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the token ids.

    Returns:
        dict with the keys "rouge1", "rouge2", "rougeL", "bleu" and "f1", each
        the mean over all evaluated examples (0.0 when there are none).
    """
    from collections import Counter

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a real token id. It can occur in the
    # predictions as well as in the labels, and decoding it fails, so map it to the pad
    # token in both arrays before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    metric_names = ("rouge1", "rouge2", "rougeL", "bleu", "f1")
    num_samples = len(decoded_preds)
    if num_samples == 0:
        # Nothing to score; avoid dividing by zero below.
        return {name: 0.0 for name in metric_names}

    totals = dict.fromkeys(metric_names, 0.0)
    for pred, label in zip(decoded_preds, decoded_labels):
        # ROUGE F-measures of the prediction against its reference.
        rouge = rouge_scorer.score(label, pred)
        for name in ("rouge1", "rouge2", "rougeL"):
            totals[name] += rouge[name].fmeasure

        pred_tokens = nltk.word_tokenize(pred)
        label_tokens = nltk.word_tokenize(label)

        # Sentence-level BLEU with a single reference.
        totals["bleu"] += sentence_bleu([label_tokens], pred_tokens)

        # Token-overlap F1. A classification F1 over the whole decoded strings would treat
        # each distinct sentence as a class label and only reward exact string matches.
        overlap = sum((Counter(pred_tokens) & Counter(label_tokens)).values())
        if overlap:
            precision = overlap / len(pred_tokens)
            recall = overlap / len(label_tokens)
            totals["f1"] += 2 * precision * recall / (precision + recall)

    return {name: total / num_samples for name, total in totals.items()}

In [None]:
# Fine-tuning configuration: 3 epochs, learning rate 3e-4, train/eval batch sizes 8/4 per
# device, output written to ./results.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Evaluation is scheduled per epoch.
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    # compute_metrics decodes the predictions as token ids, so evaluation presumably has
    # to produce generated sequences — confirm against the Seq2SeqTrainingArguments docs.
    predict_with_generate=True,
    push_to_hub=False,
    report_to="none"
)

# Set up trainer
# NOTE(review): recent transformers releases appear to deprecate the `tokenizer=` argument
# in favour of `processing_class=` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

In [None]:
# Write the trained model to ./t5_fine_tuned via the trainer.
trainer.save_model("./t5_fine_tuned")

In [None]:
import matplotlib.pyplot as plt

def plot_training_loss(log_history):
    """Plot the logged training loss against the training step.

    Only the entries of ``log_history`` that contain a 'loss' key are used;
    their 'step' value gives the x position.
    """
    loss_entries = [entry for entry in log_history if 'loss' in entry]
    steps = [entry['step'] for entry in loss_entries]
    losses = [entry['loss'] for entry in loss_entries]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(steps, losses, label='Training Loss', color='blue')
    ax.set_xlabel('Steps')
    ax.set_ylabel('Loss')
    ax.set_title('Training Loss Over Steps')
    ax.legend()
    plt.show()

# Pull the log history recorded by the trainer and plot it
log_history = trainer.state.log_history
plot_training_loss(log_history)

In [None]:
import matplotlib.pyplot as plt

def plot_training_loss(log_history):
    """Plot the logged training loss against the epoch value.

    Note: this redefines the step-based ``plot_training_loss`` from the
    previous cell; after this cell runs, the name refers to this version.
    Only entries that contain both a 'loss' and an 'epoch' key are used.
    """
    loss_entries = [
        entry for entry in log_history
        if 'loss' in entry and 'epoch' in entry
    ]
    epochs = [entry['epoch'] for entry in loss_entries]
    losses = [entry['loss'] for entry in loss_entries]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(epochs, losses, marker='o', label='Training Loss', color='blue')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.set_title('Training Loss Over Epochs')
    ax.legend()
    plt.show()

# Plotting training loss
log_history = trainer.state.log_history
plot_training_loss(log_history)

In [None]:
# Run evaluation and pick the ROUGE and BLEU values out of the result dict
eval_results = trainer.evaluate()

rouge_scores = {
    short_name: eval_results[result_key]
    for short_name, result_key in (
        ('rouge1', 'eval_rouge1'),
        ('rouge2', 'eval_rouge2'),
        ('rougeL', 'eval_rougeL'),
    )
}
bleu_score = eval_results['eval_bleu']

import matplotlib.pyplot as plt

# Function to plot ROUGE scores
def plot_rouge_scores(rouge_scores):
    """Draw one bar per entry of ``rouge_scores`` with the y-axis fixed to [0, 1]."""
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(rouge_scores.keys(), rouge_scores.values(), color=['blue', 'orange', 'green'])
    ax.set_title('ROUGE Scores')
    ax.set_xlabel('ROUGE Type')
    ax.set_ylabel('Score')
    ax.set_ylim(0, 1)
    plt.show()

# Function to plot BLEU score
def plot_bleu_score(bleu_score):
    """Draw ``bleu_score`` as a single bar with the y-axis fixed to [0, 1]."""
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(['BLEU Score'], [bleu_score], color='purple')
    ax.set_title('BLEU Score')
    ax.set_xlabel('Metric')
    ax.set_ylabel('Score')
    ax.set_ylim(0, 1)
    plt.show()

# Plot the metrics
# Both inputs were extracted from eval_results earlier in this cell.
plot_rouge_scores(rouge_scores)
plot_bleu_score(bleu_score)

In [None]:
# Full evaluation result dict, including the metrics that are not plotted above.
print(eval_results)

In [None]:
prefix = "answer the question: "
question = "How can I increase sales?"

# The model was fine-tuned on questions cleaned by text_preprocessing (lower-cased,
# contractions expanded, punctuation removed), so clean the query the same way to keep the
# inference input consistent with the training inputs.
input_text = prefix + text_preprocessing(question)
inputs = tokenizer(input_text, return_tensors="pt")

In [None]:
import torch

# Switch the model to evaluation mode for inference.
model.eval()

# Move the tokenized inputs to the device the model lives on (done once; the original cell
# repeated this step).
model_device = next(model.parameters()).device
inputs = {key: value.to(model_device) for key, value in inputs.items()}

# Generating the answer; **inputs passes the attention mask along with the input ids.
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=60, num_beams=2, early_stopping=True)

# Decoding the generated answer
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)
