In [189]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [190]:
class CustomDataset(Dataset):
    def __init__(self, csv_file, encoding='utf-8'):
        self.data = pd.read_csv(csv_file, encoding=encoding, header=None)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label = self.data.iloc[idx, 0]
        features = self.data.iloc[idx, 1:].values.astype(float)
        return features, label

loading data

In [191]:
data = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1", engine="python")


In [192]:
data.tail()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599998,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


naming columns

In [193]:
data.columns = ["label", "time", "date", "query", "username", "text"]

changing value of target values as 0 and 1 ( from 0 and 4) because target =2 case not found in dataset

In [194]:
# Assuming your label column is named 'label' and your DataFrame is named 'data'
data['label'] = data['label'].replace(4, 1)


removing urls and emails

In [197]:
import re

def remove_emails(text):
    # Define the pattern for matching emails
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'

    # Replace emails with an empty string
    clean_text = re.sub(email_pattern, 'email', text)

    return clean_text
data['text'] = data['text'].apply(remove_emails)
def remove_urls(text):
    # Define the pattern for matching URLs
    url_pattern = r'https?://\S+|www\.\S+'

    # Replace URLs with an empty string
    clean_text = re.sub(url_pattern, 'url', text)

    return clean_text
data['text'] = data['text'].apply(remove_urls)

elaborating short forms

In [195]:
abbreviations = {
    "$": " dollar ",
    "€": " euro ",
    "4ao": "for adults only",
    "a.m": "before midday",
    "a3": "anytime anywhere anyplace",
    "aamof": "as a matter of fact",
    "acct": "account",
    "adih": "another day in hell",
    "afaic": "as far as I am concerned",
    "afaict": "as far as I can tell",
    "afaik": "as far as I know",
    "afair": "as far as I remember",
    "afk": "away from keyboard",
    "app": "application",
    "approx": "approximately",
    "apps": "applications",
    "asap": "as soon as possible",
    "asl": "age, sex, location",
    "atk": "at the keyboard",
    "ave.": "avenue",
    "aymm": "are you my mother",
    "ayor": "at your own risk",
    "b&b": "bed and breakfast",
    "b+b": "bed and breakfast",
    "b.c": "before christ",
    "b2b": "business to business",
    "b2c": "business to customer",
    "b4": "before",
    "b4n": "bye for now",
    "b@u": "back at you",
    "bae": "before anyone else",
    "bak": "back at keyboard",
    "bbbg": "bye bye be good",
    "bbc": "british broadcasting corporation",
    "bbias": "be back in a second",
    "bbl": "be back later",
    "bbs": "be back soon",
    "be4": "before",
    "bfn": "bye for now",
    "blvd": "boulevard",
    "bout": "about",
    "brb": "be right back",
    "bros": "brothers",
    "brt": "be right there",
    "bsaaw": "big smile and a wink",
    "btw": "by the way",
    "bwl": "bursting with laughter",
    "c/o": "care of",
    "cet": "central european time",
    "cf": "compare",
    "cia": "central intelligence agency",
    "csl": "can not stop laughing",
    "cu": "see you",
    "cul8r": "see you later",
    "cv": "curriculum vitae",
    "cwot": "complete waste of time",
    "cya": "see you",
    "cyt": "see you tomorrow",
    "dae": "does anyone else",
    "dbmib": "do not bother me i am busy",
    "diy": "do it yourself",
    "dm": "direct message",
    "dwh": "during work hours",
    "e123": "easy as one two three",
    "eet": "eastern european time",
    "eg": "example",
    "embm": "early morning business meeting",
    "encl": "enclosed",
    "etc": "and so on",
    "faq": "frequently asked questions",
    "fawc": "for anyone who cares",
    "fb": "facebook",
    "fc": "fingers crossed",
    "fig": "figure",
    "fimh": "forever in my heart",
    "ft.": "feet",
    "ft": "featuring",
    "ftl": "for the loss",
    "ftw": "for the win",
    "fwiw": "for what it is worth",
    "fyi": "for your information",
    "g9": "genius",
    "gahoy": "get a hold of yourself",
    "gal": "get a life",
    "gcse": "general certificate of secondary education",
    "gfn": "gone for now",
    "gg": "good game",
    "gl": "good luck",
    "glhf": "good luck have fun",
    "gmt": "greenwich mean time",
    "gmta": "great minds think alike",
    "gn": "good night",
    "g.o.a.t": "greatest of all time",
    "goat": "greatest of all time",
    "goi": "get over it",
    "gps": "global positioning system",
    "gr8": "great",
    "gratz": "congratulations",
    "gyal": "girl",
    "h&c": "hot and cold",
    "hp": "horsepower",
    "hr": "hour",
    "hrh": "his royal highness",
    "ht": "height",
    "ibrb": "i will be right back",
    "ic": "i see",
    "icq": "i seek you",
    "icymi": "in case you missed it",
    "idc": "i do not care",
    "idgadf": "i do not give a damn fuck",
    "idgaf": "i do not give a fuck",
    "idk": "i do not know",
    "ie": "that is",
    "i.e": "that is",
    "ifyp": "i feel your pain",
    "IG": "instagram",
    "iirc": "if i remember correctly",
    "ilu": "i love you",
    "ily": "i love you",
    "imho": "in my humble opinion",
    "imo": "in my opinion",
    "imu": "i miss you",
    "iow": "in other words",
    "irl": "in real life",
    "j4f": "just for fun",
    "jic": "just in case",
    "jk": "just kidding",
    "jsyk": "just so you know",
    "l8r": "later",
    "lb": "pound",
    "lbs": "pounds",
    "ldr": "long distance relationship",
    "lmao": "laugh my ass off",
    "lmfao": "laugh my fucking ass off",
    "lol": "laughing out loud",
    "ltd": "limited",
    "ltns": "long time no see",
    "m8": "mate",
    "mf": "motherfucker",
    "mfs": "motherfuckers",
    "mfw": "my face when",
    "mofo": "motherfucker",
    "mph": "miles per hour",
    "mr": "mister",
    "mrw": "my reaction when",
    "ms": "miss",
    "mte": "my thoughts exactly",
    "nagi": "not a good idea",
    "nbc": "national broadcasting company",
    "nbd": "not big deal",
    "nfs": "not for sale",
    "ngl": "not going to lie",
    "nhs": "national health service",
    "nrn": "no reply necessary",
    "nsfl": "not safe for life",
    "nsfw": "not safe for work",
    "nth": "nice to have",
    "nvr": "never",
    "nyc": "new york city",
    "oc": "original content",
    "og": "original",
    "ohp": "overhead projector",
    "oic": "oh i see",
    "omdb": "over my dead body",
    "omg": "oh my god",
    "omw": "on my way",
    "p.a": "per annum",
    "p.m": "after midday",
    "pm": "prime minister",
    "poc": "people of color",
    "pov": "point of view",
    "pp": "pages",
    "ppl": "people",
    "prw": "parents are watching",
    "ps": "postscript",
    "pt": "point",
    "ptb": "please text back",
    "pto": "please turn over",
    "qpsa": "what happens",
    "ratchet": "rude",
    "rbtl": "read between the lines",
    "rlrt": "real life retweet",
    "rofl": "rolling on the floor laughing",
    "roflol": "rolling on the floor laughing out loud",
    "rotflmao": "rolling on the floor laughing my ass off",
    "rt": "retweet",
    "ruok": "are you ok",
    "sfw": "safe for work",
    "sk8": "skate",
    "smh": "shake my head",
    "sq": "square",
    "srsly": "seriously",
    "ssdd": "same stuff different day",
    "tbh": "to be honest",
    "tbs": "tablespoonful",
    "tbsp": "tablespoonful",
    "tfw": "that feeling when",
    "thks": "thank you",
    "tho": "though",
    "thx": "thank you",
    "tia": "thanks in advance",
    "til": "today i learned",
    "tl;dr": "too long i did not read",
    "tldr": "too long i did not read",
    "tmb": "tweet me back",
    "tntl": "trying not to laugh",
    "ttyl": "talk to you later",
    "u": "you",
    "u2": "you too",
    "u4e": "yours for ever",
    "utc": "coordinated universal time",
    "w/": "with",
    "w/o": "without",
    "w8": "wait",
    "wassup": "what is up",
    "wb": "welcome back",
    "wtf": "what the fuck",
    "wtg": "way to go",
    "wtpa": "where the party at",
    "wuf": "where are you from",
    "wuzup": "what is up",
    "wywh": "wish you were here",
    "yd": "yard",
    "ygtr": "you got that right",
    "ynk": "you never know",
    "zzz": "sleeping bored and tired"
}

In [198]:
def replace_abbreviations(text):
    words = text.split()
    new_words = []
    for word in words:
        if word.lower() in abbreviations:
            new_words.append(abbreviations[word.lower()])
        else:
            new_words.append(word)
    return ' '.join(new_words)


replacing emoticons with words


In [199]:


# Define a dictionary to map emoticons to their corresponding emotions
emoticon_emotion_mapping = {
    ':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'
}

# Function to replace emoticons with their corresponding emotions
def replace_emoticons_with_emotions(text):
    for emoticon, emotion in emoticon_emotion_mapping.items():
        text = text.replace(emoticon, emotion)
    return text

# Assuming your dataset is stored in a pandas DataFrame called 'df' and the text column is named 'text_column'
data['text'] = data['text'].apply(replace_emoticons_with_emotions)



removing username tags and non ascii characters

In [200]:
def remove_username_tags(text):
    # Define the pattern to match "@username" tags
    pattern = r'@\w+'
    # Use re.sub() to replace all occurrences of the pattern with an empty string
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

# data['text'] = data['text'].apply(replace_abbreviations)
import unicodedata

def remove_non_ascii(text):
    # Use unicodedata.normalize() to remove non-ASCII characters
    cleaned_text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')
    return cleaned_text



In [206]:
data['text'] = data['text'].apply(remove_username_tags)
data['text'] = data['text'].apply(remove_non_ascii)

In [207]:
# Assuming 'data' is your DataFrame with a 'text' column

# Convert text data to lowercase
data['text'] = data['text'].apply(lambda x: x.lower())

# Example: Display the first few rows of the DataFrame
print(data.head())


   label                                               text
0      0  is upset that he can't update his facebook by ...
1      0   i dived many times for the ball. managed to s...
2      0    my whole body feels itchy and like its on fire 
3      0   no, it's not behaving at all. i'm mad. why am...
4      0                                not the whole crew 


In [208]:
import gensim

In [209]:
data.drop(["time", "date", "query", "username"], axis=1, inplace=True)


KeyError: "['time', 'date', 'query', 'username'] not found in axis"

In [None]:
import pandas as pd

# Assuming 'data' is your DataFrame and it contains a 'text' column
empty_text_rows = data[data['text'].isnull() | (data['text'] == "")]

if not empty_text_rows.empty:
    print("Rows where 'text' column is empty or equal to the value in the first row of 'text' column:")
    print(empty_text_rows)
else:
    print("No rows where 'text' column is empty or equal to the value in the first row of 'text' column.")


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['text'].values, data['label'].values, test_size=0.2, random_state=42)

In [None]:
# Assuming X_train is a NumPy array or Pandas Series containing text data
documents = [text.split() for text in X_train]


In [None]:
test_documents=[text.split() for text in X_test]

In [None]:
print(test_documents[0])

In [None]:
print(len(test_documents))
print(len(documents))

In [None]:
w2v_model = gensim.models.word2vec.Word2Vec(vector_size=300, 
                                            window=7, 
                                            min_count=10, 
                                            workers=8)


In [None]:
w2v_model.build_vocab(documents)

In [None]:
words = w2v_model.wv.key_to_index.keys()
vocab_size = len(words)
print("Vocab size:", vocab_size)


In [None]:
# epoch_logger = EpochLogger()

w2v_model.train(documents, total_examples=len(documents), epochs=30)


In [None]:
w2v_model.wv.most_similar("good")

In [None]:
vocab_size = len(w2v_model.wv.key_to_index)
embedding_size = w2v_model.wv.vector_size
oov_vector = np.random.uniform(low=-0.05, high=0.05, size=(embedding_size,))

# Extend the embedding matrix with the OOV vector
new_vectors = np.vstack([w2v_model.wv.vectors, oov_vector.reshape(1, -1)])

# Update the model with the new embedding matrix
w2v_model.wv.vectors = new_vectors

# Update the vocabulary to include the OOV vector
w2v_model.wv.index_to_key.append('OOV_word')
w2v_model.wv.key_to_index['OOV_word'] = vocab_size

In [None]:
import torch
import numpy as np

def get_word_embeddings(word_tokens, w2v_model, oov_vector):
    # Initialize a list to store word embeddings
    word_embeddings = []
    
    # Iterate over each word token in the document
    for word in word_tokens:
        # Check if the word is in the vocabulary of the Word2Vec model
        if word in w2v_model.wv:
            # If the word is in the vocabulary, get its embedding from the model
            word_embedding = w2v_model.wv[word]
        else:
            # If the word is not in the vocabulary, use the OOV vector
            word_embedding = oov_vector
        # Append the word embedding to the list
        word_embeddings.append(word_embedding)
    
    # Convert the list of word embeddings to a NumPy array
    word_embeddings_np = np.array(word_embeddings)
    
    # Convert the NumPy array to a PyTorch tensor
    return torch.tensor(word_embeddings_np, dtype=torch.float32)

# Assuming 'documents' is a list of documents where each document is a list of word tokens
# Assuming 'w2v_model' is your Word2Vec model
# Assuming 'oov_vector' is your out-of-vocabulary vector

# Create a list to store the document representations
document_representations = []

# Iterate over each document in the training data
for doc in documents:
    # Get word embeddings for the document's word tokens
    word_embeddings = get_word_embeddings(doc, w2v_model, oov_vector)
    
    # Check if the list of word embeddings is empty
    if len(word_embeddings) == 0:
        # If the list is empty, assign the OOV vector as the document representation
        doc_representation = torch.tensor(oov_vector, dtype=torch.float32)
    else:
        # Otherwise, aggregate word embeddings (e.g., by averaging)
        doc_representation = torch.mean(word_embeddings, dim=0)  # Assuming average pooling
    
    # Append the document representation to the list
    document_representations.append(doc_representation)

# Convert the list of document representations into a tensor
# document_tensor = torch.stack(document_representations)


In [210]:
document_tensor = torch.stack(document_representations)

In [211]:
print(document_tensor.size(0))

1279999


In [212]:
print(documents[405])

[]


In [214]:
print(document_representations[293])

tensor([-0.0274, -0.0101, -0.0089,  0.0445, -0.0500,  0.0313, -0.0157, -0.0246,
        -0.0367, -0.0237, -0.0108,  0.0216, -0.0158,  0.0459, -0.0036,  0.0415,
        -0.0329,  0.0345, -0.0459, -0.0309, -0.0244,  0.0081,  0.0206,  0.0230,
         0.0486, -0.0483,  0.0408, -0.0174,  0.0208, -0.0392, -0.0042, -0.0044,
         0.0445, -0.0359,  0.0015,  0.0316,  0.0447,  0.0487, -0.0235,  0.0077,
         0.0280,  0.0334,  0.0249,  0.0108, -0.0106,  0.0336, -0.0173, -0.0174,
        -0.0145, -0.0249, -0.0070, -0.0452, -0.0039, -0.0214, -0.0370, -0.0397,
        -0.0013, -0.0487, -0.0361, -0.0294,  0.0365, -0.0353, -0.0397, -0.0117,
        -0.0059, -0.0416, -0.0156,  0.0015, -0.0213,  0.0160, -0.0355,  0.0464,
        -0.0309, -0.0272, -0.0055,  0.0286,  0.0014,  0.0355,  0.0261, -0.0250,
         0.0256, -0.0353, -0.0116,  0.0165,  0.0219,  0.0189,  0.0448, -0.0064,
        -0.0091, -0.0234,  0.0035, -0.0456,  0.0071, -0.0215, -0.0395,  0.0393,
         0.0196,  0.0169, -0.0451, -0.04

In [215]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Define your parameters
input_size = 300  # Dimension for word embeddings
hidden_size = 8
output_size = 2  # Binary classification
dropout_prob = 0.5
learning_rate = 0.001
num_epochs = 10
batch_size = 64

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_prob=0.5):
        super(LSTMClassifier, self).__init__()
        self.dropout = nn.Dropout(dropout_prob)
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.dropout(x)  # No embedding layer, directly use the input
        lstm_out, _ = self.lstm(embedded)
#         print("lstm_out shape:", lstm_out.shape)  # Print the shape of lstm_out
#         lstm_out = lstm_out[:, -1, :]  # Take the last hidden state of the sequence
#         lstm_out = lstm_out[:, -1, :]  # Remove the indexing for time steps

        output = self.fc(lstm_out)
        return output



# Convert document_representations into PyTorch tensor
document_tensor = torch.stack(document_representations)

# Convert labels into PyTorch tensor
labels_tensor = torch.tensor(y_train)

# Create DataLoader object for training
train_dataset = TensorDataset(document_tensor, labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Instantiate your LSTM classifier model
model = LSTMClassifier(input_size, hidden_size, output_size, dropout_prob)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader, 1):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
        if i % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Iteration [{i}/{len(train_loader)}], Training Loss: {loss.item():.4f}')
    
    average_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Training Loss: {average_loss:.4f}')



Epoch [1/10], Iteration [100/20000], Training Loss: 0.6744
Epoch [1/10], Iteration [200/20000], Training Loss: 0.6067
Epoch [1/10], Iteration [300/20000], Training Loss: 0.5591
Epoch [1/10], Iteration [400/20000], Training Loss: 0.5879
Epoch [1/10], Iteration [500/20000], Training Loss: 0.5811
Epoch [1/10], Iteration [600/20000], Training Loss: 0.6028
Epoch [1/10], Iteration [700/20000], Training Loss: 0.5851
Epoch [1/10], Iteration [800/20000], Training Loss: 0.5709
Epoch [1/10], Iteration [900/20000], Training Loss: 0.4936
Epoch [1/10], Iteration [1000/20000], Training Loss: 0.4986
Epoch [1/10], Iteration [1100/20000], Training Loss: 0.4803
Epoch [1/10], Iteration [1200/20000], Training Loss: 0.7243
Epoch [1/10], Iteration [1300/20000], Training Loss: 0.5631
Epoch [1/10], Iteration [1400/20000], Training Loss: 0.5096
Epoch [1/10], Iteration [1500/20000], Training Loss: 0.5366
Epoch [1/10], Iteration [1600/20000], Training Loss: 0.5468
Epoch [1/10], Iteration [1700/20000], Training Lo

In [216]:


# Assuming 'documents' is a list of documents where each document is a list of word tokens
# Assuming 'w2v_model' is your Word2Vec model
# Assuming 'oov_vector' is your out-of-vocabulary vector

# Create a list to store the document representations
test_representations = []

# Iterate over each document in the training data
for doc in test_documents:
    # Get word embeddings for the document's word tokens
    word_embeddings = get_word_embeddings(doc, w2v_model, oov_vector)
    
    # Check if the list of word embeddings is empty
    if len(word_embeddings) == 0:
        # If the list is empty, assign the OOV vector as the document representation
        doc_representation = torch.tensor(oov_vector, dtype=torch.float32)
    else:
        # Otherwise, aggregate word embeddings (e.g., by averaging)
        doc_representation = torch.mean(word_embeddings, dim=0)  # Assuming average pooling
    
    # Append the document representation to the list
    test_representations.append(doc_representation)

# Convert the list of document representations into a tensor
# document_tensor = torch.stack(document_representations)


In [217]:
# Save the model
torch.save(model.state_dict(), 'lstm_model.pth')


In [218]:
test_document_tensor = torch.stack(test_representations)

In [219]:
# Step 1: Preprocess your test data to convert it into word embeddings using word2vec

# Assuming test_data_word2vec is a list of word embeddings for each document in your test dataset

# Step 2: Convert the word embeddings into PyTorch tensors


# Step 3: Create DataLoader object for your test dataset
test_dataset = TensorDataset(test_document_tensor, torch.tensor(y_test)) 
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Step 4: Pass the test dataset through your trained LSTM model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Step 5: Calculate the accuracy of the model on the test dataset
print(f'Accuracy on test data: {100 * correct / total:.2f}%')


Accuracy on test data: 77.54%


**got a zero text dont know why?**

stacked lstm

In [223]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Define your parameters
input_size = 300  # Dimension for word embeddings
hidden_size = 8
output_size = 2  # Binary classification
dropout_prob = 0.5
learning_rate = 0.001
num_epochs = 15
batch_size = 64
num_layers = 2  # Number of LSTM layers

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_prob=0.5):
        super(LSTMClassifier, self).__init__()
        self.dropout = nn.Dropout(dropout_prob)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.dropout(x)  # No embedding layer, directly use the input
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out)  # Take the last hidden state of the last layer
        return output

# Convert document_representations into PyTorch tensor
document_tensor = torch.stack(document_representations)

# Convert labels into PyTorch tensor
labels_tensor = torch.tensor(y_train)

# Create DataLoader object for training
train_dataset = TensorDataset(document_tensor, labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Instantiate your LSTM classifier model
model = LSTMClassifier(input_size, hidden_size, output_size, num_layers, dropout_prob)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader, 1):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
        if i % 1000 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Iteration [{i}/{len(train_loader)}], Training Loss: {loss.item():.4f}')
    
    average_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Training Loss: {average_loss:.4f}')


Epoch [1/15], Iteration [1000/20000], Training Loss: 0.5746
Epoch [1/15], Iteration [2000/20000], Training Loss: 0.4172
Epoch [1/15], Iteration [3000/20000], Training Loss: 0.5321
Epoch [1/15], Iteration [4000/20000], Training Loss: 0.5623
Epoch [1/15], Iteration [5000/20000], Training Loss: 0.5236
Epoch [1/15], Iteration [6000/20000], Training Loss: 0.5114
Epoch [1/15], Iteration [7000/20000], Training Loss: 0.3965
Epoch [1/15], Iteration [8000/20000], Training Loss: 0.6051
Epoch [1/15], Iteration [9000/20000], Training Loss: 0.5456
Epoch [1/15], Iteration [10000/20000], Training Loss: 0.5383
Epoch [1/15], Iteration [11000/20000], Training Loss: 0.6306
Epoch [1/15], Iteration [12000/20000], Training Loss: 0.5273
Epoch [1/15], Iteration [13000/20000], Training Loss: 0.5204
Epoch [1/15], Iteration [14000/20000], Training Loss: 0.5263
Epoch [1/15], Iteration [15000/20000], Training Loss: 0.5094
Epoch [1/15], Iteration [16000/20000], Training Loss: 0.5450
Epoch [1/15], Iteration [17000/20

In [227]:
# Save the model
torch.save(model.state_dict(), 'stacked_lstm_model.pth')


In [229]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Step 5: Calculate the accuracy of the model on the test dataset
print(f'Accuracy on test data for stacked lstm: {100 * correct / total:.2f}%')


Accuracy on test data for stacked lstm: 77.51%


In [231]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Define your parameters
input_size = 300  # Dimension for word embeddings
hidden_size = 8
output_size = 2  # Binary classification
dropout_prob = 0.5
learning_rate = 0.001
num_epochs = 15
batch_size = 64
num_layers = 1  # Number of LSTM layers
bidirectional = True  # Use bidirectional LSTM

class BiLSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, bidirectional, dropout_prob=0.5):
        super(BiLSTMClassifier, self).__init__()
        self.dropout = nn.Dropout(dropout_prob)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, output_size)  # Multiply by 2 if bidirectional

    def forward(self, x):
        embedded = self.dropout(x)  # No embedding layer, directly use the input
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out)  # Take the last hidden state
        return output

# Convert document_representations into PyTorch tensor
document_tensor = torch.stack(document_representations)

# Convert labels into PyTorch tensor
labels_tensor = torch.tensor(y_train)

# Create DataLoader object for training
train_dataset = TensorDataset(document_tensor, labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Instantiate your BiLSTM classifier model
model = BiLSTMClassifier(input_size, hidden_size, output_size, num_layers, bidirectional, dropout_prob)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader, 1):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
        if i % 1000 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Iteration [{i}/{len(train_loader)}], Training Loss: {loss.item():.4f}')
    
    average_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Training Loss: {average_loss:.4f}')


Epoch [1/15], Iteration [1000/20000], Training Loss: 0.5228
Epoch [1/15], Iteration [2000/20000], Training Loss: 0.4955
Epoch [1/15], Iteration [3000/20000], Training Loss: 0.5524
Epoch [1/15], Iteration [4000/20000], Training Loss: 0.4869
Epoch [1/15], Iteration [5000/20000], Training Loss: 0.4482
Epoch [1/15], Iteration [6000/20000], Training Loss: 0.4667
Epoch [1/15], Iteration [7000/20000], Training Loss: 0.5073
Epoch [1/15], Iteration [8000/20000], Training Loss: 0.5122
Epoch [1/15], Iteration [9000/20000], Training Loss: 0.5444
Epoch [1/15], Iteration [10000/20000], Training Loss: 0.5255
Epoch [1/15], Iteration [11000/20000], Training Loss: 0.6131
Epoch [1/15], Iteration [12000/20000], Training Loss: 0.5082
Epoch [1/15], Iteration [13000/20000], Training Loss: 0.5113
Epoch [1/15], Iteration [14000/20000], Training Loss: 0.4812
Epoch [1/15], Iteration [15000/20000], Training Loss: 0.4838
Epoch [1/15], Iteration [16000/20000], Training Loss: 0.3793
Epoch [1/15], Iteration [17000/20

In [232]:
# Save the model
torch.save(model.state_dict(), 'bi_lstm_model.pth')


In [233]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Step 5: Calculate the accuracy of the model on the test dataset
print(f'Accuracy on test data for bilstm: {100 * correct / total:.2f}%')


Accuracy on test data for bilstm: 77.88%


In [234]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Define your parameters
input_size = 300  # Dimension for word embeddings
hidden_size = 8
output_size = 2  # Binary classification
dropout_prob = 0.5
learning_rate = 0.001
num_epochs = 15
batch_size = 64
num_layers = 2  # Number of LSTM layers
bidirectional = True  # Use bidirectional LSTM

class BiLSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, bidirectional, dropout_prob=0.5):
        super(BiLSTMClassifier, self).__init__()
        self.dropout = nn.Dropout(dropout_prob)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, output_size)  # Multiply by 2 if bidirectional

    def forward(self, x):
        embedded = self.dropout(x)  # No embedding layer, directly use the input
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out)  # Take the last hidden state
        return output

# Convert document_representations into PyTorch tensor
document_tensor = torch.stack(document_representations)

# Convert labels into PyTorch tensor
labels_tensor = torch.tensor(y_train)

# Create DataLoader object for training
train_dataset = TensorDataset(document_tensor, labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Instantiate your BiLSTM classifier model
model = BiLSTMClassifier(input_size, hidden_size, output_size, num_layers, bidirectional, dropout_prob)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader, 1):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
        if i % 1000 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Iteration [{i}/{len(train_loader)}], Training Loss: {loss.item():.4f}')
    
    average_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Training Loss: {average_loss:.4f}')


Epoch [1/15], Iteration [1000/20000], Training Loss: 0.4973
Epoch [1/15], Iteration [2000/20000], Training Loss: 0.5126
Epoch [1/15], Iteration [3000/20000], Training Loss: 0.5954
Epoch [1/15], Iteration [4000/20000], Training Loss: 0.4584
Epoch [1/15], Iteration [5000/20000], Training Loss: 0.5859
Epoch [1/15], Iteration [6000/20000], Training Loss: 0.4023
Epoch [1/15], Iteration [7000/20000], Training Loss: 0.4319
Epoch [1/15], Iteration [8000/20000], Training Loss: 0.5530
Epoch [1/15], Iteration [9000/20000], Training Loss: 0.5080
Epoch [1/15], Iteration [10000/20000], Training Loss: 0.5964
Epoch [1/15], Iteration [11000/20000], Training Loss: 0.4581
Epoch [1/15], Iteration [12000/20000], Training Loss: 0.5891
Epoch [1/15], Iteration [13000/20000], Training Loss: 0.5233
Epoch [1/15], Iteration [14000/20000], Training Loss: 0.5418
Epoch [1/15], Iteration [15000/20000], Training Loss: 0.4233
Epoch [1/15], Iteration [16000/20000], Training Loss: 0.4837
Epoch [1/15], Iteration [17000/20

In [237]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Step 5: Calculate the accuracy of the model on the test dataset
print(f'Accuracy on test data: {100 * correct / total:.2f}%')


Accuracy on test data: 78.01%


In [238]:
# Save the model
torch.save(model.state_dict(), 'stacked_bilstm_model.pth')
