# Dependencies

In [1]:
!pip install numpy
!pip install pandas
!pip install torch
!pip install transformers
!pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
^C
[31mERROR: Operation cancelled by user[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49

In [None]:
# Shell out to `nvidia-smi` to check which GPU (if any) this runtime exposes.
!nvidia-smi

# Imports

In [1]:
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, get_scheduler, GPT2Tokenizer
from torch.utils.data import DataLoader, Dataset

# Constants

In [None]:
from google.colab import drive

# Location where Google Drive is mounted inside the Colab VM. Defined once so
# the mount call and the derived base directory cannot drift apart.
GOOGLE_DRIVE_MOUNT_POINT = "/content/drive"
# The "MyDrive" folder under the mount point; dataset paths are built from it.
GOOGLE_DRIVE_BASE_DIR = f"{GOOGLE_DRIVE_MOUNT_POINT}/MyDrive"

drive.mount(GOOGLE_DRIVE_MOUNT_POINT)

# Trump tweets generation

In [None]:
# Text file with the Trump tweets, located in the mounted Drive folder.
# It is read with `splitlines()` in the next cell, so each line of the file
# becomes one sample (presumably one tweet per line — verify against the data).
trump_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/trump_tweets.txt"

In [None]:
# biden = open('dataset/biden_tweets.txt').read().splitlines()
# Read the corpus; every line of the file is treated as one sample.
# The context manager closes the file handle as soon as the text is in memory.
with open(trump_file_path, encoding="utf-8") as trump_file:
    trump = trump_file.read().splitlines()

seed = 40
# Seed torch's global RNG too, so DataLoader shuffling, dropout and sampling
# during generation are repeatable between runs (as far as the hardware allows).
torch.manual_seed(seed)

# 70 % of the samples go to training; the remaining 30 % is then halved into
# a test and a validation split (15 % each).
train_data, test_data = train_test_split(trump, test_size=0.3, random_state=seed)
test_data, val_data = train_test_split(test_data, test_size=0.5, random_state=seed)

# Initialize the GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# NOTE: the dict below is evaluated before `add_special_tokens` runs, so
# `tokenizer.eos_token` still refers to the pretrained tokenizer's original
# end-of-sequence token; padding therefore reuses that token.
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token":  tokenizer.eos_token,
  "mask_token": "<mask>"
})

# Any token string above that was not already in the vocabulary receives a new
# id beyond the pretrained embedding matrix. Grow the embeddings to match the
# tokenizer so such an id can never index out of range (no-op if sizes agree).
model.resize_token_embeddings(len(tokenizer))

# Every split is tokenized identically: padded/truncated to a fixed length so
# samples can be stacked into batches.
tokenize_kwargs = dict(padding="max_length", truncation=True, max_length=128)
train_tokenized = tokenizer(train_data, **tokenize_kwargs)
val_tokenized = tokenizer(val_data, **tokenize_kwargs)
test_tokenized = tokenizer(test_data, **tokenize_kwargs)

class TweetDataset(Dataset):
    """Map-style dataset over pre-tokenized samples.

    Args:
        encodings: Mapping from field name (it must contain ``"input_ids"``)
            to a sequence with one entry per sample. Anything that supports
            ``.items()`` and ``["input_ids"]`` works, e.g. a tokenizer's
            output or a plain ``dict``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, index):
        """Return sample ``index`` as a dict with one tensor per field."""
        return {key: torch.tensor(val[index]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of samples, measured on the ``input_ids`` field."""
        # Key lookup instead of attribute access so plain dicts are accepted too.
        return len(self.encodings["input_ids"])


def _lm_labels(input_ids, attention_mask):
    """Build language-modelling labels that do not score padding.

    Args:
        input_ids: Integer tensor of token ids, one row per sample.
        attention_mask: Tensor of the same shape; 0 marks padded positions.

    Returns:
        A copy of ``input_ids`` in which padded positions are replaced by
        -100 (the value the model's loss ignores), except for the first padded
        position that directly follows a real token. That one is kept as a
        target: the pad token was configured to be the pretrained
        end-of-sequence token, so it is the "tweet ends here" signal.
    """
    labels = input_ids.clone()
    is_pad = attention_mask == 0
    ignore = is_pad.clone()
    # A padded position stays ignored only if its left neighbour is padded too.
    ignore[:, 1:] &= is_pad[:, :-1]
    labels[ignore] = -100
    return labels


def _mean_loss(model, loader, device):
    """Return the mean per-batch loss of ``model`` over ``loader``.

    Puts the model in eval mode and runs without gradient tracking; weights
    are not updated.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=_lm_labels(input_ids, attention_mask),
            )
            total_loss += outputs.loss.item()
    return total_loss / len(loader)


train_dataset = TweetDataset(train_tokenized)
val_dataset = TweetDataset(val_tokenized)
test_dataset = TweetDataset(test_tokenized)

# Only the training data is shuffled; evaluation order does not matter.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

max_epochs = 5
# Linear decay of the learning rate over all optimizer steps, without warm-up.
scheduler = get_scheduler("linear", optimizer, num_warmup_steps=0, num_training_steps=max_epochs * len(train_loader))

# Dataset sizes noted by the original author (not recomputed here):
# Filtered trump tweets: 879
# Filtered biden tweets: 439

device = f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"
model.to(device)

for epoch in range(max_epochs):
    model.train()
    train_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Padding is excluded from the loss; otherwise the mostly-padded
        # 128-token rows would dominate both the gradient and the reported loss.
        labels = _lm_labels(input_ids, attention_mask)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}: Average training loss = {avg_train_loss}")

    # Evaluate on the validation set
    avg_eval_loss = _mean_loss(model, val_loader, device)
    print(f"Epoch {epoch + 1}: Average validation loss = {avg_eval_loss}")

# Final, one-off evaluation on the held-out test split.
avg_test_loss = _mean_loss(model, test_loader, device)
print(f"Test loss: {avg_test_loss}")


In [None]:
prompt = "Trump is "
# Upper bound on the generated sequence length in *tokens* (prompt included),
# not in characters.
max_length = 140
model.eval()

# Calling the tokenizer (instead of `encode`) also yields the attention mask,
# which is handed to `generate` explicitly rather than left for it to infer.
prompt_inputs = tokenizer(prompt, return_tensors='pt').to(device)
input_ids = prompt_inputs["input_ids"]

output = model.generate(
    input_ids=input_ids,
    attention_mask=prompt_inputs["attention_mask"],
    # Sequences that finish early are filled up with the tokenizer's pad token.
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,
    num_return_sequences=4,
    temperature=0.9,
    max_length=max_length,
    top_k=50,
    top_p=0.95,
)

generated_tweets = [tokenizer.decode(tweet, skip_special_tokens=True) for tweet in output]

# Print generated tweets
for tweet in generated_tweets:
    print(tweet)

In [None]:
# gpt2.copy_checkpoint_to_gdrive(run_name='trump_tweets')

In [None]:
# generated_trump_tweets = gpt2.generate(sess, length=256, temperature=0.7, nsamples=5, batch_size=5,
#                                        return_as_list=True, run_name='trump_tweets',
#                                        prefix="Generate a list of tweets.\n")
#
# for tweet in generated_trump_tweets:
#     print(tweet + '\n\n')

## Run from checkpoint

In [None]:
# gpt2.copy_checkpoint_from_gdrive(run_name='trump_tweets')

In [None]:
# sess = gpt2.start_tf_sess()

In [None]:
# gpt2.load_gpt2(sess, run_name='trump_tweets')

In [None]:
# generated_trump_tweets = gpt2.generate(sess, length=256, temperature=0.7, nsamples=5, batch_size=5,
#                                        return_as_list=True, run_name='trump_tweets',
#                                        prefix="Generate a list of tweets.\n")
#
# for tweet in generated_trump_tweets:
#     print(tweet + '\n\n')

# Biden tweets generation

In [None]:
# biden_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/biden_tweets.txt"

In [None]:
# sess = gpt2.start_tf_sess()
#
# gpt2.finetune(sess, dataset=biden_file_path, model_name='124M', steps=1000, restore_from='fresh',
#               run_name='biden_tweets',
#               print_every=10, sample_every=200, save_every=500)

In [None]:
# gpt2.copy_checkpoint_to_gdrive(run_name='biden_tweets')

In [None]:
# generated_biden_tweets = gpt2.generate(sess, length=256, temperature=0.7, nsamples=5, batch_size=5,
#                                        return_as_list=True, run_name='biden_tweets',
#                                        prefix="Generate a list of tweets.\n")
#
# for tweet in generated_biden_tweets:
#     print(tweet + '\n\n')

## Run from checkpoint

In [None]:
# gpt2.copy_checkpoint_from_gdrive(run_name='biden_tweets')

In [None]:
# sess = gpt2.start_tf_sess()

In [None]:
# gpt2.load_gpt2(sess, run_name='biden_tweets')

In [None]:
# generated_biden_tweets = gpt2.generate(sess, length=256, temperature=0.7, nsamples=5, batch_size=5,
#                                        return_as_list=True, run_name='biden_tweets',
#                                        prefix="Generate a list of tweets.\n")
#
# for tweet in generated_biden_tweets:
#     print(tweet + '\n\n')

# Users generation

In [None]:
# users_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/users.txt"

In [None]:
# sess = gpt2.start_tf_sess()
#
# gpt2.finetune(sess, dataset=users_file_path, model_name='124M', steps=1000, restore_from='fresh', run_name='users',
#               print_every=10, sample_every=200, save_every=500)

In [17]:
# gpt2.copy_checkpoint_to_gdrive(run_name='users')

In [None]:
# users = gpt2.generate(sess, length=256, temperature=0.7, nsamples=5, batch_size=5,
#                       return_as_list=True, run_name='users',
#                       prefix="Generate a list of usernames and descriptions.\n")
#
# for user in users:
#     print(user + '\n\n')

## Run from checkpoint

In [3]:
# gpt2.copy_checkpoint_from_gdrive(run_name='users')

In [4]:
# sess = gpt2.start_tf_sess()

In [None]:
# gpt2.load_gpt2(sess, run_name='users')

In [None]:
# users = gpt2.generate(sess, length=256, temperature=0.7, nsamples=5, batch_size=5,
#                       return_as_list=True, run_name='users',
#                       prefix="Generate a list of usernames and descriptions.\n")
#
# for user in users:
#     print(user + '\n\n')