In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


## Part 1

In [2]:
import pandas as pd
import numpy as np

# Load the IMDB dataset
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')  # file should have columns: review, sentiment

# Map sentiment to int
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Basic cleaning—optional: remove HTML tags
import re
df['clean_review'] = df['review'].apply(lambda x: re.sub(r'<.*?>', ' ', x))


In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameters
MAX_NUM_WORDS = 20000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['clean_review'])
sequences = tokenizer.texts_to_sequences(df['clean_review'])
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))

# Pad sequences for batching
X_data = pad_sequences(sequences, maxlen=MAX_LEN)
y_data = df['label'].values


Vocabulary size: 124245


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)


In [12]:
# Load GloVe Embeddings
# Build embedding matrix
embedding_index = {}
with open('/kaggle/input/glove6b50dtxt/glove.6B.50d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coeffs

EMBEDDING_DIM = 50
embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i < MAX_NUM_WORDS:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [13]:
# Vanilla RNN with GloVe (using PyTorch)
import torch
import torch.nn as nn
import torch.optim as optim

class RNNClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super(RNNClassifier, self).__init__()
        num_embeddings, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=True)
        self.rnn = nn.RNN(embedding_dim, 64, batch_first=True)
        self.fc = nn.Linear(64, 1)
    
    def forward(self, x):
        x = self.embedding(x)
        h_seq, _ = self.rnn(x)
        out = self.fc(h_seq[:, -1, :])
        return torch.sigmoid(out).squeeze()

# DataLoader setup for PyTorch
from torch.utils.data import TensorDataset, DataLoader

BATCH_SIZE = 128
train_data = TensorDataset(torch.LongTensor(X_train), torch.FloatTensor(y_train))
test_data = TensorDataset(torch.LongTensor(X_test), torch.FloatTensor(y_test))

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

# Model, loss, optimizer
model = RNNClassifier(embedding_matrix)
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

# Training Loop
n_epochs = 2
model.train()
for epoch in range(n_epochs):
    for xb, yb in train_loader:
        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_fn(preds, yb)
        loss.backward()
        optimizer.step()


In [14]:
# LSTM with GloVe
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super(LSTMClassifier, self).__init__()
        num_embeddings, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=True)
        self.lstm = nn.LSTM(embedding_dim, 64, batch_first=True)
        self.fc = nn.Linear(64, 1)
    
    def forward(self, x):
        x = self.embedding(x)
        h_seq, _ = self.lstm(x)
        out = self.fc(h_seq[:, -1, :])
        return torch.sigmoid(out).squeeze()

# Instantiate/Train as for RNN

In [15]:
#Vanilla RNN & LSTM with trainable torch.nn.Embedding (Random Initialization)
# For on-the-fly embeddings, replace the Embedding line as:
self.embedding = nn.Embedding(MAX_NUM_WORDS, EMBEDDING_DIM)
# Instantiate RNNClassifier or LSTMClassifier as above, and train.

NameError: name 'self' is not defined

In [17]:
EMBEDDING_DIM = 50
embedding_index = {}

with open('/kaggle/input/glove6b50dtxt/glove.6B.50d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i < MAX_NUM_WORDS:
        vec = embedding_index.get(word)
        if vec is not None:
            embedding_matrix[i] = vec


In [18]:
import torch
from torch.utils.data import TensorDataset, DataLoader

BATCH_SIZE = 128

X_train_torch = torch.LongTensor(X_train)
y_train_torch = torch.FloatTensor(y_train)
X_test_torch = torch.LongTensor(X_test)
y_test_torch = torch.FloatTensor(y_test)

train_ds = TensorDataset(X_train_torch, y_train_torch)
test_ds = TensorDataset(X_test_torch, y_test_torch)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)


In [19]:
import torch.nn as nn

class RNNClassifier(nn.Module):
    def __init__(self, embedding_matrix=None, trainable=False):
        super().__init__()
        num_emb, emb_dim = embedding_matrix.shape if embedding_matrix is not None else (MAX_NUM_WORDS, EMBEDDING_DIM)
        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=not trainable)
        else:
            self.embedding = nn.Embedding(num_emb, emb_dim)
        self.rnn = nn.RNN(emb_dim, 64, batch_first=True)
        self.fc = nn.Linear(64, 1)
    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        return torch.sigmoid(self.fc(out[:, -1, :])).squeeze()

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix=None, trainable=False):
        super().__init__()
        num_emb, emb_dim = embedding_matrix.shape if embedding_matrix is not None else (MAX_NUM_WORDS, EMBEDDING_DIM)
        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=not trainable)
        else:
            self.embedding = nn.Embedding(num_emb, emb_dim)
        self.lstm = nn.LSTM(emb_dim, 64, batch_first=True)
        self.fc = nn.Linear(64, 1)
    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        return torch.sigmoid(self.fc(out[:, -1, :])).squeeze()


In [20]:
def train_model(model, train_loader, n_epochs=2):
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = nn.BCELoss()
    model.train()
    for epoch in range(n_epochs):
        total_loss = 0
        for xb, yb in train_loader:
            optimizer.zero_grad()
            preds = model(xb)
            loss = loss_fn(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} - Loss: {total_loss / len(train_loader)}")


In [21]:
from sklearn.metrics import f1_score

def evaluate_model(model, loader):
    model.eval()
    preds, y_true = [], []
    with torch.no_grad():
        for xb, yb in loader:
            outputs = model(xb)
            preds += outputs.round().detach().cpu().numpy().tolist()
            y_true += yb.cpu().numpy().tolist()
    return f1_score(y_true, preds)


In [23]:
# RNN with GloVe
rnn_glove_model = RNNClassifier(embedding_matrix, trainable=False)
train_model(rnn_glove_model, train_loader, n_epochs=2)
# LSTM with GloVe
lstm_glove_model = LSTMClassifier(embedding_matrix, trainable=False)
train_model(lstm_glove_model, train_loader, n_epochs=2)
# RNN with trainable embeddings (random init)
rnn_onfly_model = RNNClassifier(None, trainable=True)
train_model(rnn_onfly_model, train_loader, n_epochs=2)
# LSTM with trainable embeddings (random init)
lstm_onfly_model = LSTMClassifier(None, trainable=True)
train_model(lstm_onfly_model, train_loader, n_epochs=2)


Epoch 1 - Loss: 0.6534062844876665
Epoch 2 - Loss: 0.6326016786570747
Epoch 1 - Loss: 0.6766700580858955
Epoch 2 - Loss: 0.6517319917297972
Epoch 1 - Loss: 0.6577003588691687
Epoch 2 - Loss: 0.6193911806463053
Epoch 1 - Loss: 0.6075078097585672
Epoch 2 - Loss: 0.42799272276342104


In [24]:
print("RNN + GloVe:", evaluate_model(rnn_glove_model, test_loader))
print("LSTM + GloVe:", evaluate_model(lstm_glove_model, test_loader))
print("RNN + trainable:", evaluate_model(rnn_onfly_model, test_loader))
print("LSTM + trainable:", evaluate_model(lstm_onfly_model, test_loader))


RNN + GloVe: 0.7229307540871353
LSTM + GloVe: 0.6967895362663495
RNN + trainable: 0.7060838362420486
LSTM + trainable: 0.8349203356274159


## Part 2

In [34]:
import re
from datetime import datetime

# Month mapping for name to number
month_map = {
    'jan': '01', 'january': '01',
    'feb': '02', 'february': '02',
    'mar': '03', 'march': '03',
    'apr': '04', 'april': '04',
    'may': '05',
    'jun': '06', 'june': '06',
    'jul': '07', 'july': '07',
    'aug': '08', 'august': '08',
    'sep': '09', 'sept': '09', 'september': '09',
    'oct': '10', 'october': '10',
    'nov': '11', 'november': '11',
    'dec': '12', 'december': '12'
}

def normalize_year(yy):
    yy = int(yy)
    if yy < 100:
        return str(2000 + yy) if yy <= 30 else str(1900 + yy)  # heuristic cutoff
    return str(yy)

def extract_date(text):
    patterns = [
        # 1. Day Month Year (e.g., 15th September 2021, 1 July 2023)
        r'(?P<day>\d{1,2})(?:st|nd|rd|th)?\s+(?:of\s+)?(?P<month>[a-zA-Z]+)[,]?\s+(?P<year>\d{2,4})',
        # 2. Month Day Year (e.g., July 2, 2023)
        r'(?P<month>[a-zA-Z]+)\s+(?P<day>\d{1,2})(?:st|nd|rd|th)?[,]?\s+(?P<year>\d{2,4})',
        # 3. yyyy-mm-dd or yyyy/mm/dd or yyyy.mm.dd
        r'(?P<year>\d{4})[-/.](?P<month>\d{1,2})[-/.](?P<day>\d{1,2})',
        # 4. dd/mm/yyyy or mm/dd/yyyy or dd.mm.yyyy etc.
        r'(?P<day>\d{1,2})[-/.](?P<month>\d{1,2})[-/.](?P<year>\d{2,4})',
        # 5. yyyy (only for ISO-style within text)
        # This is covered above, no isolated year capture here
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            day = match.group('day')
            month = match.group('month')
            year = match.group('year')

            # Normalize month if it's a name
            if month.isalpha():
                month_lower = month.lower()
                month = month_map.get(month_lower[:3], None)
                if not month:
                    continue 
            
            # Normalize year length (handle YY vs YYYY)
            year = normalize_year(year)
            
            # Pad day and month with zeros
            day = day.zfill(2)
            month = month.zfill(2)

            return f"{day}/{month}/{year}"
    return None

# Test inputs:
tests = [
    "The event will take place on March 5, 2023.",
    "Her birthday is on 07/08/1990",
    "The deadline is 2022-12-31.",
    "We met on 1st of January 2000.",
    "The concert is scheduled for 15th September, 2021.",
    "Let's catch up on 02.04.2022.",
    "The project started on 5/6/19.",
    "He was born on 1987/11/23.",
    "Christmas is on 25th Dec 2024.",
    "Submit your report by 08/31/2021.",
]

for t in tests:
    print(f"{t} -> {extract_date(t)}")


The event will take place on March 5, 2023. -> 05/03/2023
Her birthday is on 07/08/1990 -> 07/08/1990
The deadline is 2022-12-31. -> 31/12/2022
We met on 1st of January 2000. -> 01/01/2000
The concert is scheduled for 15th September, 2021. -> 15/09/2021
Let's catch up on 02.04.2022. -> 02/04/2022
The project started on 5/6/19. -> 05/06/2019
He was born on 1987/11/23. -> 23/11/1987
Christmas is on 25th Dec 2024. -> 25/12/2024
Submit your report by 08/31/2021. -> 08/31/2021


## Part 3

In [37]:
import spacy
import string

nlp = spacy.load("en_core_web_sm")

def switch_gender(text, target='female'):
    # Define mappings male->female and female->male pronouns
    pronoun_map = {
        'male': {
            'he': 'she', 'him': 'her', 'his': 'her', 'himself': 'herself',
            'himself': 'herself', 'i': 'i', 'me': 'me', 'my': 'my', 'mine': 'mine',
        },
        'female': {
            'she': 'he', 'her': ['him', 'his'], 'hers': 'his', 'herself': 'himself',
            'i': 'i', 'me': 'me', 'my': 'my', 'mine': 'mine',
        }
    }

    # Choose map depending on target
    if target == 'female':
        mapping = pronoun_map['male']
    else:
        mapping = pronoun_map['female']

    doc = nlp(text)
    result_tokens = []

    # helper function to match case of original token to replacement
    def match_case(original, replacement):
        if original.isupper():
            return replacement.upper()
        if original[0].isupper():
            return replacement.capitalize()
        return replacement

    for token in doc:
        text_lower = token.text.lower()
        replacement = None

        # Special case: 'her' can map to 'him' or 'his' depending on POS tag
        if target == 'female' and text_lower == 'her':
            if token.pos_ == 'PRON':
                replacement = 'him'  # objective pronoun
            elif token.pos_ in ['DET', 'ADJ']:
                replacement = 'his'  # possessive adjective
        elif target == 'male' and text_lower == 'him':
            # Reverse logic for 'him'
            if token.pos_ == 'PRON':
                replacement = 'her'
            elif token.pos_ in ['DET', 'ADJ']:
                replacement = 'her'

        if not replacement and text_lower in mapping:
            mapping_value = mapping[text_lower]
            # For female->male her can map to list of two, handle appropriately
            # Here we use first for pronoun substitution, fallback later
            if isinstance(mapping_value, list):
                # pick pronoun or possessive based on POS and dependency for better accuracy
                if token.dep_ in ['dobj', 'pobj']:  # object
                    replacement = mapping_value[0]
                else:
                    replacement = mapping_value[1]
            else:
                replacement = mapping_value

        if replacement:
            replacement = match_case(token.text, replacement)
            result_tokens.append(replacement)
        else:
            result_tokens.append(token.text)

    # Rebuild string with corrected spacing
    return spacy.tokens.Doc(doc.vocab, words=result_tokens).text_with_ws.strip()

examples = [
    ("He is going to the market.", "female"),
    ("His book is on the table.", "female"),
    ("I saw him yesterday.", "female"),
    ("He hurt himself.", "female"),
    ("I called him last night.", "female"),
    ("That is his car.", "female"),
    ("He told me about his trip.", "female"),
    ("The teacher gave him a warning.", "female"),
    ("He blames himself for the mistake.", "female"),
    ("He brought his laptop.", "female"),
    ("He made it himself.", "female"),
    ("I don’t like his attitude.", "female"),
    ("Tell him to come here.", "female"),
    ("She is going to the market.", "male"),
    ("Her book is on the table.", "male"),
    ("I saw her yesterday.", "male"),
    ("She hurt herself.", "male"),
    ("I called her last night.", "male"),
    ("That is her car.", "male"),
]

for text, gender in examples:
    print(f"Input: {text}\nTarget: {gender}\nOutput: {switch_gender(text, target=gender)}\n")


Input: He is going to the market.
Target: female
Output: She is going to the market .

Input: His book is on the table.
Target: female
Output: Her book is on the table .

Input: I saw him yesterday.
Target: female
Output: I saw her yesterday .

Input: He hurt himself.
Target: female
Output: She hurt herself .

Input: I called him last night.
Target: female
Output: I called her last night .

Input: That is his car.
Target: female
Output: That is her car .

Input: He told me about his trip.
Target: female
Output: She told me about her trip .

Target: female

Input: He blames himself for the mistake.
Target: female
Output: She blames herself for the mistake .

Input: He brought his laptop.
Target: female
Output: She brought her laptop .

Input: He made it himself.
Target: female
Output: She made it herself .

Input: I don’t like his attitude.
Target: female
Output: I do n’t like her attitude .

Input: Tell him to come here.
Target: female
Output: Tell her to come here .

Input: She is goi