<a href="https://colab.research.google.com/github/liitonsamppa/COMP.CS.060/blob/main/LLM_week2_tech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Synthetic Dataset Creation and Augmentation

- Create a synthetic dataset with at least 500 sentences (e.g., positive and
negative reviews).
- Use nlpaug to augment the dataset by inserting, deleting, and replacing words
with synonyms.


In [5]:
!pip install nlpaug nltk
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as naw
import random
import nltk

# Download required NLTK resources
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

# Define positive and negative review templates
positive_templates = [
    "I absolutely loved the {product}. It was {adjective} and exceeded my expectations!",
    "The {product} is fantastic! Highly recommend it to anyone looking for {feature}.",
    "Amazing experience with the {product}. It's {adjective} and works perfectly.",
    "I'm so happy with my {product}. It's {adjective} and worth every penny.",
    "The {product} is a game-changer. It's {adjective} and easy to use."
]

negative_templates = [
    "I was really disappointed with the {product}. It was {adjective} and didn't meet my expectations.",
    "The {product} is terrible. I wouldn't recommend it to anyone looking for {feature}.",
    "Horrible experience with the {product}. It's {adjective} and doesn't work as advertised.",
    "I regret buying the {product}. It's {adjective} and a waste of money.",
    "The {product} is a complete letdown. It's {adjective} and hard to use."
]

# Define placeholders
products = ["smartphone", "laptop", "headphones", "smartwatch", "camera"]
adjectives = ["great", "awesome", "terrible", "poor", "excellent"]
features = ["performance", "battery life", "sound quality", "durability", "design"]

# Generate synthetic dataset
def generate_synthetic_data(templates, label, num_samples):
    data = []
    for _ in range(num_samples):
        template = np.random.choice(templates)
        product = np.random.choice(products)
        adjective = np.random.choice(adjectives)
        feature = np.random.choice(features)
        sentence = template.format(product=product, adjective=adjective, feature=feature)
        data.append((sentence, label))
    return data

# Generate 250 positive and 250 negative reviews
positive_data = generate_synthetic_data(positive_templates, "positive", 250)
negative_data = generate_synthetic_data(negative_templates, "negative", 250)

# Combine into a DataFrame
df = pd.DataFrame(positive_data + negative_data, columns=["sentence", "label"])

print(df.head())


# Initialize augmenters
synonym_aug = naw.SynonymAug(aug_src='wordnet', aug_max=2)  # Fast
delete_aug = naw.RandomWordAug(action="delete", aug_max=2)  # Fast
insert_aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="insert", aug_max=2)  # Faster model

augmenters = [synonym_aug, delete_aug, insert_aug]

# Function to augment a sentence
def augment_sentence(sentence):
    augmenter = random.choice(augmenters)
    return augmenter.augment(sentence)

# Apply augmentation to the dataset
augmented_data = []
for _, row in df.iterrows():
    sentence, label = row["sentence"], row["label"]
    augmented_sentence = augment_sentence(sentence)
    augmented_data.append((augmented_sentence, label))

# Create augmented DataFrame
augmented_df = pd.DataFrame(augmented_data, columns=["sentence", "label"])

print(augmented_df.head(100))

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


                                            sentence     label
0  I'm so happy with my smartwatch. It's poor and...  positive
1  I'm so happy with my laptop. It's terrible and...  positive
2  The camera is fantastic! Highly recommend it t...  positive
3  I'm so happy with my laptop. It's awesome and ...  positive
4  Amazing experience with the smartphone. It's e...  positive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

                                             sentence     label
0   [and i ' ’ m so happy with my smartwatch. it '...  positive
1   [I ' m so happy with laptop. It ' s terrible a...  positive
2   [the camera is fantastic! highly recommend rea...  positive
3   [I ' m so happy with my laptop. It ' s awesome...  positive
4   [Amazing experience with the smartphone. It ' ...  positive
..                                                ...       ...
95  [The headphones is game - changer. It ' s and ...  positive
96  [Amazing experience with the tv camera. It ' s...  positive
97  [i ' m so happy with stealing my laptop. it ' ...  positive
98  [the camera is fantastic! highly experts recom...  positive
99  [setting the headphones is fantastic! highly r...  positive

[100 rows x 2 columns]


2. Handling Missing Values

- Simulate missing values by removing 10% of the sentences from a dataset.
- Use a sequence-to-sequence model (e.g., T5) to reconstruct the missing
sentences.

In [6]:
import pandas as pd
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load your dataset with missing values
df = pd.DataFrame(augmented_df)
np.random.seed(42)
missing_indices = np.random.choice(df.index, size=int(0.1 * len(df)), replace=False)
df_missing = df.copy()
df_missing.loc[missing_indices, "sentence"] = np.nan

print("Dataset with missing values:")
print(df_missing)

# Load the T5 model and tokenizer
model_name = "google/flan-t5-small"  # You can use "t5-base" or "t5-large" for better performance
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Function to reconstruct missing sentences using T5
def reconstruct_sentence(row):
    if pd.isna(row["sentence"]):
        if row["label"] == "positive":
            prompt = f"Generate a detailed positive review for a {random.choice(products)}."
        else:
            prompt = f"Write a comprehensive negative review about a {random.choice(products)}."


        # Tokenize the prompt
        input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

        # Generate the output
        output = model.generate(
    input_ids,
    max_length=125,  # Allow for longer outputs
    num_beams=10,  # Use a higher beam search for more diverse results
    do_sample=True,
    temperature=1.2,  # Adjust temperature to control randomness
    top_k=48,  # Limit to the top 50 most likely tokens
    top_p=0.96,  # Use nucleus sampling for diversity
    repetition_penalty=1.2,  # Penalize repetitive sequences
    early_stopping=True
)


        # Decode the output into a human-readable sentence
        reconstructed_sentence = tokenizer.decode(output[0], skip_special_tokens=True)


        return reconstructed_sentence
    else:
        return row["sentence"]

# Apply the reconstruction function to the dataset
df_missing["sentence"] = df_missing.apply(reconstruct_sentence, axis=1)

print("\nDataset after reconstructing missing sentences:")
print(df_missing)

Dataset with missing values:
                                              sentence     label
0                                                  NaN  positive
1    [I ' m so happy with laptop. It ' s terrible a...  positive
2                                                  NaN  positive
3    [I ' m so happy with my laptop. It ' s awesome...  positive
4    [Amazing experience with the smartphone. It ' ...  positive
..                                                 ...       ...
495                                                NaN  negative
496  [horrible job experience with improving the sm...  negative
497                                                NaN  negative
498  [The smartwatch is a complete letdown. Informa...  negative
499  [the smartphone is pretty terrible. i wouldn '...  negative

[500 rows x 2 columns]


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Dataset after reconstructing missing sentences:
                                              sentence     label
0    This is the best camera I've ever seen. It's a...  positive
1    [I ' m so happy with laptop. It ' s terrible a...  positive
2    This is a great smartwatch. It's a little pric...  positive
3    [I ' m so happy with my laptop. It ' s awesome...  positive
4    [Amazing experience with the smartphone. It ' ...  positive
..                                                 ...       ...
495  This is one of the worst laptops I've ever had...  negative
496  [horrible job experience with improving the sm...  negative
497  This is one of the worst laptops I've ever had...  negative
498  [The smartwatch is a complete letdown. Informa...  negative
499  [the smartphone is pretty terrible. i wouldn '...  negative

[500 rows x 2 columns]


Kaggle Dataset Preprocessing

- Use the Kaggle API to download a dataset (e.g., IMDB Sentiment Analysis or
SMS Spam Classification).
- Preprocess and tokenize the dataset using Hugging Face Tokenizers.
- Train a baseline model (e.g., Logistic Regression) and compare it to a
Transformer model (e.g., TinyBERT).

In [1]:


import os
from kaggle.api.kaggle_api_extended import KaggleApi

!ls -l ~/.config/kaggle/kaggle.json

# Get the current working directory
current_directory = os.getcwd()

# Build the full path to kaggle.json
kaggle_json_path = os.path.join(current_directory, "kaggle.json")

# Move the file using the correct path
!mv "$kaggle_json_path" ~/.config/kaggle/

# Set correct permissions for the kaggle.json file
!chmod 600 ~/.config/kaggle/kaggle.json

# Authenticate Kaggle API
api = KaggleApi()
api.authenticate()

# ... (Rest of your Kaggle dataset download code)

# Download the IMDB Sentiment Analysis dataset
dataset_name = "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
api.dataset_download_files(dataset_name, path="./data", unzip=True)

# Load the dataset
import pandas as pd
df = pd.read_csv("./data/IMDB Dataset.csv")
print(df.head())



OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/

In [None]:
import re

# Preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text

# Apply preprocessing
df["review"] = df["review"].apply(preprocess_text)

# Encode labels
df["sentiment"] = df["sentiment"].map({"positive": 1, "negative": 0})

from transformers import AutoTokenizer

# Load a tokenizer (e.g., TinyBERT)
tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")

# Tokenize the dataset
def tokenize_data(texts, max_length=128):
    return tokenizer(
        texts.tolist(),  # Convert pandas Series to list
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",  # Return PyTorch tensors
    )

# Tokenize the reviews
tokenized_data = tokenize_data(df["review"])

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    df["review"], df["sentiment"], test_size=0.2, random_state=42
)

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = lr_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")