In [None]:

# Report the GPUs (if any) visible to this runtime.
!nvidia-smi

In [None]:
!pip install --upgrade  textblob gensim pytorch-nlp swifter

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import itertools
import sys
from textblob import TextBlob, Word
import numpy as np
import random
import re
import swifter
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import os
import pandas as pd
import gensim
import warnings
import nltk

def set_seeds_and_trace(seed=42):
  """Seed every random number generator this notebook relies on.

  Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
  so that sampling, shuffling, weight initialisation and text generation
  are repeatable across runs.

  Args:
    seed: Integer seed shared by all generators (defaults to 42).
  """
  # Exported for child processes; the hash seed of the interpreter that is
  # already running cannot be changed after start-up.
  os.environ['PYTHONHASHSEED'] = '0'
  np.random.seed(seed)
  random.seed(seed)
  # Without this, model initialisation and torch.multinomial sampling
  # would differ from run to run.
  torch.manual_seed(seed)

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
set_seeds_and_trace()
warnings.filterwarnings('ignore')
nltk.download('punkt')


def textblob_tokenizer(x):
  """Return the words of `x` as produced by TextBlob."""
  return TextBlob(x).words


In [None]:
%%writefile get_data.sh
# Download the Airbnb description corpora, skipping files that already exist.
# Abort on the first failing command so a broken download is not ignored.
set -euo pipefail

# fetch TARGET URL: download URL to TARGET unless TARGET is already present.
fetch() {
  local target="$1" url="$2"
  if [ ! -f "$target" ]; then
    # Download under a temporary name first: a failed or interrupted
    # transfer must not be mistaken for a finished file on the next run.
    wget -O "$target.part" "$url"
    mv "$target.part" "$target"
  fi
}

# The URLs are quoted so the shell never treats the "?" as a glob character.
fetch train_corpus_descriptions_airbnb.csv "https://www.dropbox.com/s/5rp7ibop99qyafo/train_corpus_descriptions_airbnb.csv?dl=0"
fetch test_corpus_descriptions_airbnb.csv "https://www.dropbox.com/s/a29bbkg8hi4q4f4/test_corpus_descriptions_airbnb.csv?dl=0"

In [None]:
# Run the download script written by the previous cell.
!bash get_data.sh

In [None]:
train_path = "./train_corpus_descriptions_airbnb.csv"
test_path = "./test_corpus_descriptions_airbnb.csv"
# Number of reviews sampled from the training file and from the test file.
corpus_size=25000
test_corpus_size = 2000
# Explicit seed: the samples no longer depend on how much of the global NumPy
# random state was consumed before this cell ran, so re-running the cell
# always yields the same reviews.
sample_seed = 42

# Each file holds one review per row and has no header line; empty rows are
# dropped before sampling.
airbnb_reviews = (
    pd.read_csv(train_path, header=None, names=["review"])
    .dropna()
    .sample(corpus_size, random_state=sample_seed)
    .reset_index(drop=True)
)
test_airbnb_reviews = (
    pd.read_csv(test_path, header=None, names=["review"])
    .dropna()
    .sample(test_corpus_size, random_state=sample_seed)
    .reset_index(drop=True)
)


In [None]:
# Display the sampled training reviews.
airbnb_reviews

In [None]:
# Show the raw text of the first sampled review.
print(airbnb_reviews.review.iloc[0])


In [None]:
import re
def preprocess_text(text, should_join=True):
    """Tokenise `text` with gensim's ``simple_preprocess``.

    Args:
        text: Raw review text.
        should_join: When True, return the tokens joined by single spaces;
            when False, return the token list itself.

    Returns:
        A space-separated string or a list of tokens, depending on
        `should_join`.
    """
    tokens = gensim.utils.simple_preprocess(text)
    return ' '.join(tokens) if should_join else tokens

In [None]:
# Normalise every review with preprocess_text; each entry is the review's
# tokens joined back into a single space-separated string.
X_preprocessed = airbnb_reviews.review.apply(preprocess_text)

In [None]:
def get_maximum_review_length(df):
    """Return the largest number of words found in any single review.

    Words are counted with ``preprocess_text(..., should_join=False)``, the
    same tokenisation applied before the reviews are padded and encoded, so
    the result is a valid padding length for those token lists.

    Args:
        df: Iterable of review strings (e.g. a pandas Series).

    Returns:
        The maximum token count, or 0 when `df` is empty.
    """
    return max(
        (len(preprocess_text(review, should_join=False)) for review in df),
        default=0,
    )


# Length of the longest training review; used further down as the padding
# target (default of get_tensor and `maxlen` in get_ids_tensor).
maximum = get_maximum_review_length(airbnb_reviews.review)


In [None]:
import itertools
from torchnlp.encoders import LabelEncoder


# Every token of every training review, flattened into a single list.
list_of_words = [
    word
    for sentence in airbnb_reviews.review
    for word in preprocess_text(sentence, should_join=False)
]
# Our tokenizer: maps words to integer ids and back. Index 0 is reserved for
# 'UNK' and is also the index returned for unknown words.
ids_from_words = LabelEncoder(
    list_of_words,
    reserved_labels=['UNK'],
    unknown_index=0,
    min_occurrences=3,
)

In [None]:

def ids_from_text(text):
  """Encode a sequence of tokens into ids with the `ids_from_words` encoder."""
  encoded = ids_from_words.batch_encode(text)
  return encoded

def text_from_ids(ids):
  """Decode a sequence of ids back into tokens with the `ids_from_words` encoder."""
  decoded = ids_from_words.batch_decode(ids)
  return decoded


In [None]:
# Sanity check: encode a lower-cased, whitespace-split example sentence.
ids_from_text('Only you can prevent forest fires'.lower().split())

In [None]:
# Cool solution
def pad_sequence_of_tokens(x, maxlen, unk_token='[UNK]'):
  """Right-pad a token list with `unk_token` until it has `maxlen` entries.

  Args:
    x: Sequence of tokens.
    maxlen: Desired minimum length of the result.
    unk_token: Filler token appended to short sequences.

  Returns:
    A new list. Sequences that already have `maxlen` or more tokens are
    returned unchanged (they are never truncated). The input is not mutated.
  """
  tokens = list(x)
  missing = maxlen - len(tokens)
  if missing > 0:
    tokens.extend([unk_token] * missing)
  return tokens

def get_tensor(x, maximum=maximum):
  """Encode the tokens `x` and right-pad the ids with zeros up to `maximum`.

  Args:
    x: Sequence of tokens accepted by ``ids_from_text``.
    maximum: Target length of the last dimension. The default is the value
      the global `maximum` had when this function was defined.

  Returns:
    The padded ids as a ``torch.long`` tensor with singleton dimensions
    squeezed out.
  """
  # Encode once and reuse the result for both the length and the padding.
  ids = ids_from_text(x)
  padding = (0, maximum - ids.shape[-1])
  # `F` (torch.nn.functional) is resolved when the function is called; the
  # notebook imports it in the cell that follows this one.
  return torch.squeeze(F.pad(ids, padding, "constant", 0).to(torch.long))


In [None]:
import torch.nn.functional as F
def get_ids_tensor(srs):
  """Turn a Series of reviews into one stacked tensor of token ids.

  Each review is tokenised with ``preprocess_text``, padded to `maximum`
  tokens with ``pad_sequence_of_tokens`` and converted with ``get_tensor``;
  the per-review tensors are then stacked along a new first dimension.

  Args:
    srs: pandas Series of review strings (the ``.swifter`` accessor is used).
  """
  def tokenize_and_pad(review):
    tokens = preprocess_text(review, should_join=False)
    return pad_sequence_of_tokens(tokens, maxlen=maximum)

  padded_reviews = srs.swifter.apply(tokenize_and_pad)
  id_tensors = padded_reviews.swifter.apply(get_tensor)
  return torch.stack(id_tensors.to_list())

In [None]:
# Encode the preprocessed reviews and stack them into one tensor
# (see get_ids_tensor), then display it.
all_ids = get_ids_tensor(srs=X_preprocessed.reset_index(drop=True))
all_ids

In [None]:
class LSTMDataset(torch.utils.data.Dataset):
    """Next-token prediction dataset over sequences of token ids.

    Item ``i`` is the pair ``(sequence[:-1], sequence[1:])`` built from
    ``all_ids[i]``: the target is the input shifted one position to the left.
    """

    def __init__(self, all_ids):
        """Store `all_ids`, an indexable collection of id sequences."""
        self.all_ids = all_ids

    def __len__(self):
        """Return the number of sequences."""
        return len(self.all_ids)

    def split_input_target(self, sequence):
        """Split `sequence` into (all but the last id, all but the first id)."""
        input_text = sequence[:-1]
        target_text = sequence[1:]
        return input_text, target_text

    @staticmethod
    def _as_new_tensor(sequence):
        """Return an independent tensor holding the values of `sequence`."""
        if isinstance(sequence, torch.Tensor):
            # Recommended way to copy a tensor; torch.tensor(tensor) emits a
            # UserWarning about copy construction.
            return sequence.clone().detach()
        return torch.tensor(sequence)

    def __getitem__(self, index):
        """Return the (input, target) tensors for the sequence at `index`."""
        input_seq, target_seq = self.split_input_target(self.all_ids[index])
        return self._as_new_tensor(input_seq), self._as_new_tensor(target_seq)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded reviews for evaluation. The split is seeded so
# it is identical on every run. The tensor is handed over as a NumPy array,
# which scikit-learn indexes natively.
X_train, X_test = train_test_split(all_ids.numpy(), test_size=0.2, random_state=42)
# torch.as_tensor keeps the integer dtype; the legacy torch.Tensor(...)
# constructor builds a float tensor first.
X_train = torch.as_tensor(X_train, dtype=torch.long)
X_test = torch.as_tensor(X_test, dtype=torch.long)



In [None]:
# Wrap both splits so that each item is an (input, target) pair of id sequences.
train_ds = LSTMDataset(X_train)
test_ds = LSTMDataset(X_test)

In [None]:
# `batch_size` is needed here, before the hyperparameters cell further down
# assigns it; keep both values identical.
batch_size = 256
# drop_last=True keeps every batch at exactly `batch_size` rows, which the
# training loop relies on because it carries a hidden state of that size
# from one batch to the next.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, drop_last=True)

In [None]:
# Decode the first training pair back into words to inspect it; the target
# is the input shifted by one position (see LSTMDataset.split_input_target).
input_example, target_example = train_ds[0]
print("Input :", ' '.join(text_from_ids(input_example)))
print("Target:", ' '.join(text_from_ids(target_example)))

In [None]:
class LSTMModel(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear layer.

    The model maps a batch of token ids to one score per vocabulary entry at
    every position of the sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        """Build the layers.

        Args:
            vocab_size: Number of distinct token ids.
            embedding_dim: Size of each token embedding.
            hidden_dim: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # One learned vector of size embedding_dim per token id.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)
        # Projects every LSTM output vector onto the vocabulary.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden):
        """Run the model on a batch of ids.

        Args:
            x: Long tensor of token ids, shape (batch, sequence).
            hidden: Tuple ``(h, c)`` of LSTM states, each of shape
                (num_layers, batch, hidden_dim).

        Returns:
            ``(out, hidden)`` where `out` has shape
            (batch, sequence, vocab_size) and `hidden` is the updated state.
        """
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        out = self.fc(lstm_out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(h, c)`` LSTM states for `batch_size` rows.

        The states are created with the dtype and on the device of the
        model's own parameters, so they always match the model regardless of
        any module-level device variable.
        """
        reference = next(self.parameters())
        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        return (reference.new_zeros(state_shape), reference.new_zeros(state_shape))


In [None]:
# Hyperparameters
# NOTE(review): `sequence_length` is not referenced by any other visible
# cell - confirm whether it is still needed.
sequence_length = 100  # length of sequence for a training example
embedding_dim = 256  # embedding dimension
hidden_dim = 512  # LSTM hidden dimensions
num_layers = 2  # number of LSTM layers
batch_size = 256  # batch size
num_epochs = 20  # number of epochs to train
lr = 0.001  # learning rate
# Number of entries in the encoder's token-to-index mapping.
vocab_size = len(ids_from_words.token_to_index)

In [None]:
# Build the language model and move its parameters to the selected device.
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)

In [None]:
# Next-word prediction is a multi-class problem over the vocabulary.
# CrossEntropyLoss takes unnormalised scores (logits) and integer class ids.
# NOTE(review): index 0 doubles as padding/unknown; consider ignore_index=0
# if padded positions should not contribute to the loss.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


In [None]:
for epoch in range(num_epochs):
    model.train()
    # Fresh zero state at the start of every epoch; it is then carried from
    # one batch to the next (all batches hold exactly `batch_size` rows).
    hidden = model.init_hidden(batch_size)
    for batch, (x, y) in enumerate(train_dl):
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        logits, hidden = model(x, hidden)
        # Flatten (batch, sequence, vocab) scores and (batch, sequence)
        # targets so every position counts as one classification example.
        loss = criterion(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
        loss.backward()
        optimizer.step()

        # Cut the graph so the next backward pass does not reach into the
        # batches that were already processed.
        hidden = tuple(hidden_state.detach() for hidden_state in hidden)

    # Reports the loss of the last batch of the epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

In [None]:
# Function to generate text from the model
def generate_text(model, start_string, generation_length=1000):
    """Sample `generation_length` words from `model`, continuing `start_string`.

    The whole prompt is fed to the model once; afterwards only the newly
    sampled word is fed, together with the hidden state carried over from
    the previous step. Feeding the full context again while also reusing the
    hidden state would make the model see the same words repeatedly.

    Args:
        model: Object providing ``eval()``, ``init_hidden(batch_size)`` and
            ``model(x, hidden) -> (scores, hidden)``.
        start_string: Whitespace-separated prompt. Words missing from the
            vocabulary (also after lower-casing) are fed as index 0.
        generation_length: Number of words to append.

    Returns:
        The prompt words followed by the generated words, joined by spaces.
    """
    model.eval()  # evaluation mode

    token_to_index = ids_from_words.token_to_index
    index_to_token = ids_from_words.index_to_token

    def lookup(word):
        # Exact match first; then the lower-cased form, since the vocabulary
        # is built from lower-cased text; otherwise the unknown index.
        if word in token_to_index:
            return token_to_index[word]
        lowered = word.lower()
        if lowered in token_to_index:
            return token_to_index[lowered]
        return 0

    words = start_string.split()
    hidden = model.init_hidden(1)
    # Build the inputs wherever the model's state lives instead of relying on
    # a module-level device variable.
    target_device = hidden[0].device

    # Ids still to be fed; an empty prompt is seeded with the unknown index
    # so the model always receives at least one token.
    pending = [lookup(word) for word in words] or [0]

    # No gradients are needed for sampling; this also keeps memory flat over
    # long generations.
    with torch.no_grad():
        for _ in range(generation_length):
            x = torch.tensor([pending], dtype=torch.long, device=target_device)
            out, hidden = model(x, hidden)

            # Softmax is the numerically stable way to turn the scores of the
            # last position into sampling probabilities.
            probabilities = torch.softmax(out[:, -1], dim=-1)
            next_index = torch.multinomial(probabilities, 1).item()

            words.append(index_to_token[next_index])
            pending = [next_index]

    return ' '.join(words)

In [None]:
import time
# Time one call to generate_text with the default generation length.
start = time.time()
start_string = 'I was walking down the pathway'

result = generate_text(model, start_string)

end = time.time()
# Print the generated text followed by a separator line and the elapsed seconds.
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)

