In [111]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torchdata.datapipes as dp
import torchtext.transforms as T
import spacy
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torch.nn.functional as F
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

In [112]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tweets = pd.read_csv('/kaggle/input/twitter-airline-sentiment/Tweets.csv')
texts = tweets['text']
labels = tweets['airline_sentiment']
tweets = pd.DataFrame(list(zip(texts, labels)), columns=['texts', 'labels'])
tweets

Unnamed: 0,texts,labels
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
...,...,...
14635,@AmericanAir thank you we got on a different f...,positive
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative
14637,@AmericanAir Please bring American Airlines to...,neutral
14638,"@AmericanAir you have my money, you change my ...",negative


In [114]:
t = []  # Initialize an empty list to store all words
for text in tweets['texts']:
    text = text.lower().replace('@','').split()
    t.extend(text)  # Extend the list with individual words instead of appending the whole split text

t = set(t)  # Convert the list to a set to get unique words
word_to_idx = {word: i for i, word in enumerate(t)}  # Create word-to-index mapping
print("Length of vocabulary:", len(word_to_idx))

Length of vocabulary: 26764


In [115]:
def preprocess(df):
    df['texts'] = df['texts'].apply(lambda x: x.lower().replace('@',''))
    df['tokenized_texts'] = df['texts'].apply(lambda x: x.split())  # Tokenize the sentence
    return df

tweets = preprocess(tweets)

In [118]:
# Train Word2Vec model
model = Word2Vec(sentences=tweets['tokenized_texts'], vector_size=100, window=5, min_count=1, workers=4)

In [119]:
# Generate embeddings for each tokenized text
embeddings = [model.wv[tokenized_text] for tokenized_text in tweets['tokenized_texts']]

In [133]:
# Define a custom dataset class
class TextDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels
        
    def __len__(self):
        return len(self.embeddings)
    
    def __getitem__(self, idx):
        return torch.tensor(self.embeddings[idx]), torch.tensor(self.labels[idx])

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode labels
labels_encoded = label_encoder.fit_transform(labels)

# Split data into training and validation sets

train_embeddings, val_embeddings, train_labels, val_labels = train_test_split(embeddings, labels_encoded, test_size=0.2, random_state=42)

# Create datasets and dataloaders
train_dataset = TextDataset(train_embeddings, train_labels)
val_dataset = TextDataset(val_embeddings, val_labels)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: (pad_sequence([item[0] for item in batch]), torch.tensor([item[1] for item in batch])))
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [135]:
# Define the Generator and Discriminator classes
class Generator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        return self.model(x)

class Discriminator(nn.Module):
    def __init__(self, input_size):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),  # Adjust input_size to match the output size of the Generator
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

In [141]:
# Define the size of the embeddings
embed_size = 100

# Define the size of the noise vector
noise_size = 100

# Instantiate Generator and Discriminator
generator = Generator(input_size=noise_size, hidden_size=128, output_size=embed_size)
discriminator = Discriminator(input_size=embed_size)

In [137]:
# Define loss function and optimizers
criterion = nn.BCELoss()
optimizer_gen = torch.optim.Adam(generator.parameters(), lr=0.001)
optimizer_disc = torch.optim.Adam(discriminator.parameters(), lr=0.001)

In [143]:
# Training loop with tqdm
num_epochs = 50
for epoch in range(num_epochs):
    loop = tqdm(train_loader, leave=True)
    for real_data, _ in loop:
        batch_size = real_data.size(0)
        real_data = real_data.float()  # Convert to float
        real_data = real_data.to(device)

        # Train Discriminator
        optimizer_disc.zero_grad()
        disc_real = discriminator(real_data)
        loss_disc_real = criterion(disc_real, torch.ones_like(disc_real))  # Real data should be classified as 1
        loss_disc_real.backward()

        noise = torch.rand(batch_size, noise_size).to(device)
        fake_data = generator(noise)
        disc_fake = discriminator(fake_data.detach())
        loss_disc_fake = criterion(disc_fake, torch.zeros_like(disc_fake))  # Fake data should be classified as 0
        loss_disc_fake.backward()
        
        optimizer_disc.step()

        # Train Generator
        optimizer_gen.zero_grad()
        disc_fake = discriminator(fake_data)
        loss_gen = criterion(disc_fake, torch.ones_like(disc_fake))  # Generator wants to fool the discriminator, so labels are 1
        loss_gen.backward()
        optimizer_gen.step()

        loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
        loop.set_postfix(gen_loss=loss_gen.item(), disc_loss=loss_disc_real.item() + loss_disc_fake.item())

    if (epoch + 1) % 10 == 0:
        print(f'\nEpoch {epoch+1}/{num_epochs}:\t Generator loss: {loss_gen.item():.4f}\t Discriminator loss: {loss_disc_real.item() + loss_disc_fake.item():.4f}')

Epoch [1/50]: 100%|██████████| 366/366 [00:04<00:00, 74.22it/s, disc_loss=1.46, gen_loss=0.75] 
Epoch [2/50]: 100%|██████████| 366/366 [00:04<00:00, 81.72it/s, disc_loss=1.46, gen_loss=0.752]
Epoch [3/50]: 100%|██████████| 366/366 [00:05<00:00, 68.11it/s, disc_loss=1.46, gen_loss=0.75] 
Epoch [4/50]: 100%|██████████| 366/366 [00:04<00:00, 73.95it/s, disc_loss=1.45, gen_loss=0.753]
Epoch [5/50]: 100%|██████████| 366/366 [00:04<00:00, 79.01it/s, disc_loss=1.45, gen_loss=0.752]
Epoch [6/50]: 100%|██████████| 366/366 [00:04<00:00, 79.98it/s, disc_loss=1.45, gen_loss=0.75] 
Epoch [7/50]: 100%|██████████| 366/366 [00:04<00:00, 90.79it/s, disc_loss=1.46, gen_loss=0.75]  
Epoch [8/50]: 100%|██████████| 366/366 [00:04<00:00, 77.11it/s, disc_loss=1.46, gen_loss=0.751]
Epoch [9/50]: 100%|██████████| 366/366 [00:04<00:00, 77.04it/s, disc_loss=1.46, gen_loss=0.753]
Epoch [10/50]: 100%|██████████| 366/366 [00:04<00:00, 74.62it/s, disc_loss=1.44, gen_loss=0.753]



Epoch 10/50:	 Generator loss: 0.7526	 Discriminator loss: 1.4412


Epoch [11/50]: 100%|██████████| 366/366 [00:04<00:00, 77.84it/s, disc_loss=1.45, gen_loss=0.75] 
Epoch [12/50]: 100%|██████████| 366/366 [00:04<00:00, 81.23it/s, disc_loss=1.45, gen_loss=0.749]
Epoch [13/50]: 100%|██████████| 366/366 [00:04<00:00, 77.77it/s, disc_loss=1.46, gen_loss=0.75] 
Epoch [14/50]: 100%|██████████| 366/366 [00:05<00:00, 73.19it/s, disc_loss=1.45, gen_loss=0.75] 
Epoch [15/50]: 100%|██████████| 366/366 [00:04<00:00, 86.26it/s, disc_loss=1.46, gen_loss=0.751]
Epoch [16/50]: 100%|██████████| 366/366 [00:04<00:00, 76.92it/s, disc_loss=1.47, gen_loss=0.749]
Epoch [17/50]: 100%|██████████| 366/366 [00:04<00:00, 75.80it/s, disc_loss=1.45, gen_loss=0.752]
Epoch [18/50]: 100%|██████████| 366/366 [00:04<00:00, 81.44it/s, disc_loss=1.45, gen_loss=0.754]
Epoch [19/50]: 100%|██████████| 366/366 [00:04<00:00, 83.94it/s, disc_loss=1.46, gen_loss=0.748] 
Epoch [20/50]: 100%|██████████| 366/366 [00:04<00:00, 76.94it/s, disc_loss=1.45, gen_loss=0.756]



Epoch 20/50:	 Generator loss: 0.7561	 Discriminator loss: 1.4529


Epoch [21/50]: 100%|██████████| 366/366 [00:04<00:00, 78.55it/s, disc_loss=1.45, gen_loss=0.753]
Epoch [22/50]: 100%|██████████| 366/366 [00:04<00:00, 82.11it/s, disc_loss=1.46, gen_loss=0.752]
Epoch [23/50]: 100%|██████████| 366/366 [00:04<00:00, 88.79it/s, disc_loss=1.45, gen_loss=0.751] 
Epoch [24/50]: 100%|██████████| 366/366 [00:04<00:00, 88.16it/s, disc_loss=1.45, gen_loss=0.752] 
Epoch [25/50]: 100%|██████████| 366/366 [00:04<00:00, 80.97it/s, disc_loss=1.45, gen_loss=0.751]
Epoch [26/50]: 100%|██████████| 366/366 [00:04<00:00, 77.43it/s, disc_loss=1.45, gen_loss=0.753]
Epoch [27/50]: 100%|██████████| 366/366 [00:04<00:00, 77.49it/s, disc_loss=1.45, gen_loss=0.755]
Epoch [28/50]: 100%|██████████| 366/366 [00:04<00:00, 85.87it/s, disc_loss=1.46, gen_loss=0.751]
Epoch [29/50]: 100%|██████████| 366/366 [00:04<00:00, 75.73it/s, disc_loss=1.45, gen_loss=0.752]
Epoch [30/50]: 100%|██████████| 366/366 [00:04<00:00, 80.85it/s, disc_loss=1.45, gen_loss=0.752] 



Epoch 30/50:	 Generator loss: 0.7520	 Discriminator loss: 1.4453


Epoch [31/50]: 100%|██████████| 366/366 [00:04<00:00, 86.19it/s, disc_loss=1.46, gen_loss=0.748]
Epoch [32/50]: 100%|██████████| 366/366 [00:04<00:00, 77.41it/s, disc_loss=1.45, gen_loss=0.751]
Epoch [33/50]: 100%|██████████| 366/366 [00:04<00:00, 81.81it/s, disc_loss=1.46, gen_loss=0.748]
Epoch [34/50]: 100%|██████████| 366/366 [00:04<00:00, 85.19it/s, disc_loss=1.46, gen_loss=0.749]
Epoch [35/50]: 100%|██████████| 366/366 [00:04<00:00, 89.34it/s, disc_loss=1.45, gen_loss=0.747]
Epoch [36/50]: 100%|██████████| 366/366 [00:03<00:00, 99.50it/s, disc_loss=1.46, gen_loss=0.75]  
Epoch [37/50]: 100%|██████████| 366/366 [00:04<00:00, 85.06it/s, disc_loss=1.45, gen_loss=0.75]  
Epoch [38/50]: 100%|██████████| 366/366 [00:04<00:00, 83.86it/s, disc_loss=1.45, gen_loss=0.753]
Epoch [39/50]: 100%|██████████| 366/366 [00:04<00:00, 89.00it/s, disc_loss=1.44, gen_loss=0.752] 
Epoch [40/50]: 100%|██████████| 366/366 [00:04<00:00, 79.95it/s, disc_loss=1.47, gen_loss=0.75] 



Epoch 40/50:	 Generator loss: 0.7498	 Discriminator loss: 1.4660


Epoch [41/50]: 100%|██████████| 366/366 [00:03<00:00, 93.50it/s, disc_loss=1.45, gen_loss=0.752] 
Epoch [42/50]: 100%|██████████| 366/366 [00:04<00:00, 78.61it/s, disc_loss=1.46, gen_loss=0.751]
Epoch [43/50]: 100%|██████████| 366/366 [00:04<00:00, 79.31it/s, disc_loss=1.46, gen_loss=0.751]
Epoch [44/50]: 100%|██████████| 366/366 [00:04<00:00, 79.23it/s, disc_loss=1.45, gen_loss=0.754] 
Epoch [45/50]: 100%|██████████| 366/366 [00:04<00:00, 85.30it/s, disc_loss=1.44, gen_loss=0.754] 
Epoch [46/50]: 100%|██████████| 366/366 [00:04<00:00, 88.01it/s, disc_loss=1.45, gen_loss=0.75] 
Epoch [47/50]: 100%|██████████| 366/366 [00:05<00:00, 70.73it/s, disc_loss=1.47, gen_loss=0.751]
Epoch [48/50]: 100%|██████████| 366/366 [00:04<00:00, 84.09it/s, disc_loss=1.45, gen_loss=0.751]
Epoch [49/50]: 100%|██████████| 366/366 [00:03<00:00, 97.92it/s, disc_loss=1.45, gen_loss=0.754] 
Epoch [50/50]: 100%|██████████| 366/366 [00:04<00:00, 90.67it/s, disc_loss=1.46, gen_loss=0.751] 


Epoch 50/50:	 Generator loss: 0.7511	 Discriminator loss: 1.4592





In [144]:
# Generate noise vector
noise_vector = torch.rand(1, noise_size).to(device)

# Pass noise vector through generator
generated_embeddings = generator(noise_vector)

# Convert embeddings to text
# For example, if you have a Word2Vec model, you can find the closest words to the generated embeddings
# and construct a sentence from those words
generated_text = []
for embedding in generated_embeddings:
    # Find closest word to each embedding
    closest_word = model.wv.similar_by_vector(embedding.cpu().detach().numpy(), topn=1)[0][0]
    generated_text.append(closest_word)

# Join words to form a sentence
generated_sentence = ' '.join(generated_text)

print("Generated Sentence:", generated_sentence)


Generated Sentence: #whyabcwhy


In [145]:
# Generate 10 sentences
num_sentences = 10
generated_sentences = []

for _ in range(num_sentences):
    # Generate noise vector
    noise_vector = torch.rand(1, noise_size).to(device)

    # Pass noise vector through generator
    generated_embeddings = generator(noise_vector)

    # Convert embeddings to text
    generated_text = []
    for embedding in generated_embeddings:
        # Find closest word to each embedding
        closest_word = model.wv.similar_by_vector(embedding.cpu().detach().numpy(), topn=1)[0][0]
        generated_text.append(closest_word)

    # Join words to form a sentence
    generated_sentence = ' '.join(generated_text)
    
    # Append generated sentence to the list
    generated_sentences.append(generated_sentence)

# Print generated sentences
for i, sentence in enumerate(generated_sentences, 1):
    print(f"Sentence {i}: {sentence}")


Sentence 1: ground,
Sentence 2: terminal?
Sentence 3: terminal?
Sentence 4: terminal?
Sentence 5: #unitedhatesusall
Sentence 6: #unitedhatesusall
Sentence 7: ground,
Sentence 8: terminal?
Sentence 9: http://t.co/7z3gqebfk2
Sentence 10: 695
