Q1 -  Prepare/Pre-process a text corpus to make it more usable for NLP tasks using
tokenization, filtration of stop words, removal of punctuation, stemming and
lemmatization.

In [None]:
# pip install nltk
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# 1. Fetch the NLTK resources this cell relies on (only needs to succeed once per machine).
#    The ids are requested in the same order as before.
for _resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(_resource)

def preprocess_text_nltk(text):
    """Tokenize ``text`` and return stemmed and lemmatized forms of its content words.

    Pipeline: tokenize -> lowercase -> drop stop words and punctuation ->
    stem (Porter) and lemmatize (WordNet) the surviving tokens independently.

    Args:
        text: Raw input string. It is echoed to stdout before processing.

    Returns:
        A ``(stemmed_tokens, lemmatized_tokens)`` tuple of two lists of
        lowercase strings, both derived from the same filtered token list.
    """
    print(f"Original Text: \n{text}\n")

    # --- Step 1: Tokenization ---
    # Breaking the text into individual words/tokens
    tokens = word_tokenize(text)

    # --- Step 2: Noise Removal (Stop words & Punctuation) ---
    # Load stop words (e.g., "the", "is", "in")
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # Lower-case for consistency, then drop stop words and punctuation.
    # The tokenizer can emit punctuation tokens longer than one character
    # (e.g. "...", "--", "``", "''"), so a token counts as punctuation when
    # *every* character in it is a punctuation character; a plain
    # `word in punctuation` test would only catch single-character tokens.
    filtered_tokens = [
        word
        for word in (token.lower() for token in tokens)
        if word not in stop_words
        and not all(char in punctuation for char in word)
    ]

    # --- Step 3: Stemming ---
    # Reducing words to their root form by chopping off ends (e.g., "running" -> "run")
    ps = PorterStemmer()
    stemmed_tokens = [ps.stem(word) for word in filtered_tokens]

    # --- Step 4: Lemmatization ---
    # Reducing words to their dictionary form via WordNet. lemmatize() is called
    # without a POS tag, so it uses its default (noun): plural nouns are reduced
    # (e.g., "feet" -> "foot"), while verbs/adjectives are generally left as-is.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return stemmed_tokens, lemmatized_tokens

# Example Usage
# Note: the call below also prints the original text, because
# preprocess_text_nltk echoes its input before processing it.
sample_text = "The striped bats are hanging on their feet for best results!"
stems, lemmas = preprocess_text_nltk(sample_text)

# Show both normalizations of the same filtered tokens for comparison
print(f"Stemmed:    {stems}")
print(f"Lemmatized: {lemmas}")

Q3 -  Extract the usernames from the email addresses present in a given text.

In [None]:
import re

def extract_usernames(text):
    """Return the username (the part before '@') of every email address in ``text``.

    An address is recognised as:
      * one or more of letters, digits, '.', '_', '%', '+', '-'  (captured username)
      * a literal '@'
      * a domain made of letters, digits, '.' and '-'
      * a literal '.' followed by an extension of at least two letters

    Args:
        text: String to scan.

    Returns:
        List of username strings in order of appearance; empty if none match.
    """
    address = re.compile(r'([a-zA-Z0-9._%+-]+)@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

    # Only capture group 1 (the username) is kept from each match
    return [hit.group(1) for hit in address.finditer(text)]

# Test Data
# Covers: a dotted username, a username with digits/underscore followed by a
# sentence-ending period, an address with no dotted domain (must be skipped),
# and a '+' tag combined with a multi-part domain.
corpus = """
    Please contact support at help.desk@company.com for assistance.
    You can also reach the admin: admin_01@internal.net.
    Ignore invalid emails like bob@com or plain text.
    cc: jane.doe+test@gmail.co.uk
"""

result = extract_usernames(corpus)

# Expected: ['help.desk', 'admin_01', 'jane.doe+test']
print(f"Extracted Usernames: {result}")

Q7 - Extract all bigrams and trigrams using the ngrams function of the nltk library

In [None]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

# Ensure the tokenizer data is downloaded.
# 'punkt' serves older NLTK releases; newer releases make word_tokenize load
# 'punkt_tab' instead, so fetch both (matching the Q1 cell) to keep this cell
# runnable on its own regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

def extract_ngrams(text):
    """Tokenize ``text`` and return its bigrams and trigrams.

    Args:
        text: Input string; it is split into tokens with ``word_tokenize``
            because n-grams are built over a sequence of items.

    Returns:
        A ``(bigrams, trigrams)`` tuple of lists, each holding token tuples
        of length 2 and 3 respectively.
    """
    words = word_tokenize(text)

    # ngrams() is lazy, so each result is materialised into a list.
    pairs, triples = (list(ngrams(words, size)) for size in (2, 3))
    return pairs, triples

# Example Data
text = "The quick brown fox jumps over the lazy dog."

# extract_ngrams tokenizes the text once and returns (bigrams, trigrams)
bigrams_list, trigrams_list = extract_ngrams(text)

print(f"--- Original Text ---\n{text}\n")

# Print the count in the header, then one n-gram tuple per line
print(f"--- Bigrams (Total: {len(bigrams_list)}) ---")
for bg in bigrams_list:
    print(bg)

print(f"\n--- Trigrams (Total: {len(trigrams_list)}) ---")
for tg in trigrams_list:
    print(tg)

Q10 - Classify movie reviews as positive or negative from the IMDB movie dataset of
50K movie reviews. (Link for dataset:
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews )

In [None]:
# pip install pandas scikit-learn nltk
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the Dataset
# Ensure 'IMDB Dataset.csv' is in your working directory
try:
    df = pd.read_csv('IMDB Dataset.csv')
except FileNotFoundError:
    print("Error: 'IMDB Dataset.csv' not found. Please download it from Kaggle.")
    # Stop here with a non-zero status. `exit()` is an interactive helper
    # (provided by the `site` module / IPython), not meant for programs: it
    # reports success (status 0) from a script and may not halt a notebook cell,
    # which would let the code below fail later on the undefined `df`.
    raise SystemExit(1)
else:
    print("Dataset loaded successfully.")

# 2. Pre-processing Function
def clean_text(text):
    """Normalise a raw review into lowercase letters and whitespace only.

    Args:
        text: Raw review string, possibly containing HTML tags.

    Returns:
        The review with HTML tags replaced by a space, every character that
        is neither an ASCII letter nor whitespace removed, and the result
        lowercased.
    """
    # Replace HTML tags (e.g., <br />) with a space rather than deleting them:
    # the dataset often has "sentence.<br /><br />Next" with no surrounding
    # spaces, and plain deletion would fuse the two neighbouring words.
    text = re.sub(r'<.*?>', ' ', text)
    # Remove non-alphabetic characters (keep only letters and whitespace)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    return text.lower()

# Apply clean_text to every review; the cleaned text is stored in a new column
# so the raw 'review' column stays available.
print("Cleaning data... (this might take a moment)")
df['cleaned_review'] = df['review'].apply(clean_text)

# 3. Prepare Features (X) and Labels (y)
X = df['cleaned_review']
# Convert labels: 'positive' -> 1, 'negative' -> 0
# (any other value in 'sentiment' has no entry in the mapping and becomes NaN)
y = df['sentiment'].map({'positive': 1, 'negative': 0})

# 4. Split Data (80% Training, 20% Testing)
# random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Vectorization (TF-IDF)
# We limit to top 5000 words to keep the model fast
tfidf = TfidfVectorizer(max_features=5000)

# The vocabulary/IDF weights are fitted on the training split only; the test
# split is merely transformed so no test information leaks into the features.
print("Vectorizing data...")
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# 6. Train Model (Logistic Regression)
# Uses the estimator's default hyperparameters
print("Training model...")
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# 7. Evaluate
# Predict on the held-out split and compare against the true labels
predictions = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, predictions)

print("\n" + "="*30)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("="*30)
print("\nClassification Report:\n")
# target_names are listed in ascending label order: 0 -> 'Negative', 1 -> 'Positive'
print(classification_report(y_test, predictions, target_names=['Negative', 'Positive']))

# 8. Test on New Custom Reviews
new_reviews = [
    "The movie was fantastic! Great acting and plot.",
    "It was a complete waste of time. Boring and predictable."
]
# Run the new reviews through the same clean_text step that was applied to the
# training data, so inference input follows the identical preprocessing path
# before it reaches the fitted vectorizer.
new_vec = tfidf.transform([clean_text(review) for review in new_reviews])
preds = model.predict(new_vec)

# Report the original (uncleaned) review text next to its predicted label
print("\n--- Custom Tests ---")
for review, pred in zip(new_reviews, preds):
    label = "Positive" if pred == 1 else "Negative"
    print(f"Review: '{review}' -> Sentiment: {label}")

Q11 - Build and train a text classifier for the given data (using the TextBlob or
simpletransformers library)

In [None]:
# pip install textblob
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

# 1. Prepare Data
# TextBlob expects a list of tuples: [(text, label), ...]
# Labels are the strings 'pos' and 'neg'; the training set below has three of each.
train_data = [
    ("I love this movie, it's amazing!", 'pos'),
    ("The plot was boring and slow.", 'neg'),
    ("What a fantastic performance by the actor.", 'pos'),
    ("I fell asleep halfway through.", 'neg'),
    ("The cinematography was beautiful.", 'pos'),
    ("This is a waste of time and money.", 'neg')
]

# Held-out examples in the same (text, label) format, used for the accuracy check
test_data = [
    ("The movie was good.", 'pos'),
    ("I did not like the ending.", 'neg')
]

# 2. Fit a Naive Bayes classifier on the labelled training examples
print("Training TextBlob Classifier...")
cl = NaiveBayesClassifier(train_data)

# 3. Score the classifier on the held-out examples
test_accuracy = cl.accuracy(test_data)
print(f"Accuracy: {test_accuracy * 100:.2f}%")

# 4. Classify an unseen sentence and report the most probable label
#    together with the probability assigned to it
new_review = "The director did a horrible job."
prob_dist = cl.prob_classify(new_review)
top_label = prob_dist.max()
top_probability = round(prob_dist.prob(top_label), 2)

print(f"\nReview: '{new_review}'")
print(f"Predicted: {top_label}")
print(f"Confidence: {top_probability}")

Q12 - Generate text using a character-based RNN trained on an appropriate dataset. Given a
sequence of characters from the data (e.g. "Shakespear"), train a model to predict
the next character in the sequence ("e").

In [None]:
# pip install tensorflow numpy requests
import tensorflow as tf
import numpy as np
import os
import requests

# --- 1. Load Data ---
# Download Shakespeare dataset (tiny shakespeare)
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# Bound the wait with a timeout and fail loudly on an HTTP error status;
# otherwise a hung connection blocks forever, or an error page would be
# silently used as the training corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

print(f"Corpus length: {len(text)} characters")

# Take a smaller slice for quick training demonstration (first 100k chars)
# In a real scenario, use the whole text.
text = text[:100000]

# --- 2. Vectorize Text (Char to Int) ---
# Find all unique characters
# (sorted, so the char <-> index mapping is the same for the same text on every run)
vocab = sorted(set(text))
vocab_size = len(vocab)
print(f"Unique characters: {vocab_size}")

# Create mapping from char to index and index to char
# char2idx: dict, character -> its position in vocab
# idx2char: array indexed by that position -> character (the inverse lookup)
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

# Convert whole text to integers
text_as_int = np.array([char2idx[c] for c in text])

# --- 3. Create Training Sequences ---
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Number of (seq_length + 1)-character chunks that fit in the text.
# NOTE(review): not referenced again in the visible code — confirm it is still needed.
examples_per_epoch = len(text) // (seq_length + 1)

# Create training examples / targets
# One dataset element per character id
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Group the id stream into chunks of seq_length + 1 = 101 ids. The extra id lets
# split_input_target cut each chunk into a 100-id input and a 100-id target that
# is the input shifted by one position. drop_remainder=True discards a final
# chunk that would be shorter than 101 ids.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first, so ``target[i]`` is the element that follows
    ``input[i]`` — e.g. "Shakespeare" -> ("Shakespear", "hakespeare").

    Args:
        chunk: Any sliceable sequence.

    Returns:
        Tuple ``(chunk[:-1], chunk[1:])``.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
dataset = sequences.map(split_input_target)

# Batch size and Buffer size for shuffling
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs, then group them into batches.
# drop_remainder=True keeps every batch at exactly BATCH_SIZE pairs, which
# matches the fixed batch size the model below is built with.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# --- 4. Build the Model ---
# vocab_size is recomputed from vocab here (same value as len(vocab) above)
vocab_size = len(vocab)
# Width of each character embedding vector
embedding_dim = 256
# Number of units in the LSTM layer
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level network: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: Number of distinct characters; sizes both the embedding
            table and the output layer.
        embedding_dim: Width of each character embedding.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch size of the model input. The LSTM is
            stateful, so the batch dimension has to be known up front.

    Returns:
        An uncompiled ``tf.keras.Sequential``. The final Dense layer has no
        activation, so it emits ``vocab_size`` raw scores (logits) for every
        timestep (the LSTM uses ``return_sequences=True``).
    """
    model = tf.keras.Sequential([
        # Declare the input explicitly instead of passing `batch_input_shape`
        # to Embedding: newer Keras releases reject that keyword on layers,
        # while Input(shape=..., batch_size=...) is accepted by old and new
        # releases alike. `None` leaves the sequence length variable.
        tf.keras.Input(shape=(None,), batch_size=batch_size),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.LSTM(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

# Training model: built for the same fixed batch size the dataset is batched with
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

# --- 5. Train the Model ---
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` is required because the model's last layer applies
    no softmax.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

# Compile with the Adam optimizer and the logits-based loss function
model.compile(optimizer='adam', loss=loss)

# Directory to save checkpoints
checkpoint_dir = './training_checkpoints'
# "{epoch}" is left as a literal placeholder in the path template handed to ModelCheckpoint
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights (not the whole model); generate_text later reloads
# them via tf.train.latest_checkpoint(checkpoint_dir).
# NOTE(review): newer Keras releases appear to require a ".weights.h5" suffix
# for weights-only checkpoints — confirm against the installed TF/Keras version.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

print("Starting training... (This may take a few minutes)")
# Train for 5 epochs for demonstration (increase to 30+ for good results)
history = model.fit(dataset, epochs=5, callbacks=[checkpoint_callback])

# --- 6. Generate Text ---
def generate_text(model, start_string, num_generate=500, temperature=1.0):
    """Sample text from the most recent training checkpoint.

    A fresh copy of the network is built with ``batch_size=1`` and loaded
    from the newest checkpoint in ``checkpoint_dir``: the training model is
    built for a fixed batch size, so it cannot be fed a single sequence.

    Args:
        model: Not used; kept so existing calls keep working. The weights
            always come from the latest checkpoint on disk.
        start_string: Non-empty seed text. Every character must be a key of
            ``char2idx`` (otherwise the lookup raises ``KeyError``).
        num_generate: Number of characters to generate.
        temperature: Positive scaling factor applied to the logits before
            sampling. Low values give more predictable text, higher values
            more surprising text.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        FileNotFoundError: If no checkpoint exists in ``checkpoint_dir``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # latest_checkpoint returns None when nothing has been saved yet; fail
    # with a clear message instead of passing None to load_weights.
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    if latest is None:
        raise FileNotFoundError(
            f"No checkpoint found in {checkpoint_dir!r}; train the model first.")

    # Rebuild model with batch_size=1 for prediction
    # (separate name so the `model` argument is not shadowed)
    gen_model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
    gen_model.load_weights(latest)
    gen_model.build(tf.TensorShape([1, None]))

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    # Add the batch dimension expected by the model
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    gen_model.reset_states()
    for _ in range(num_generate):
        predictions = gen_model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Sample from the categorical distribution defined by the scaled
        # logits; only the draw for the last timestep is used.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state (kept by the stateful LSTM)
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

print("\n--- GENERATED TEXT ---\n")
print(generate_text(model, start_string=u"ROMEO: "))