## Imports and model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import numpy as np

import itertools

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from collections import Counter


# All files are expected to be in same folder
def parse_data(folder_path='anecdots', files_cnt=1):
    parsed_values: list = []
    cnt = 1
    for each in os.listdir(folder_path):
        with open(folder_path + '/' + each, 'r') as f:
            buf = pd.read_csv(folder_path + '/' + each, sep=',')
            parsed_values += buf['content'].tolist()
        if cnt >= files_cnt:
            break
        else:
            cnt += 1
    return parsed_values


class AE(nn.Module):
    def __init__(self, input_shape: int):
        super().__init__()
        # Encoder
        self.line1 = nn.Linear(in_features=input_shape, out_features=input_shape * 3)
        self.line2 = nn.Linear(in_features=input_shape * 3, out_features=input_shape * 9)
        self.line3 = nn.Linear(in_features=input_shape * 9, out_features=input_shape)
        self.line4 = nn.Linear(in_features=input_shape, out_features=input_shape // 4)

        # Decoder
        self.line5 = nn.Linear(in_features=input_shape // 4, out_features=input_shape // 2)
        self.line6 = nn.Linear(in_features=input_shape // 2, out_features=input_shape)
        
        # Weight init
        self.line1.weight.data.normal_(0.0,1/np.sqrt(input_shape))
        self.line2.weight.data.normal_(0.0,1/np.sqrt(input_shape))
        self.line3.weight.data.normal_(0.0,1/np.sqrt(input_shape))
        self.line4.weight.data.normal_(0.0,1/np.sqrt(input_shape))
        self.line5.weight.data.normal_(0.0,1/np.sqrt(input_shape))
        self.line6.weight.data.normal_(0.0,1/np.sqrt(input_shape))

    def forward(self, data: torch.Tensor):
        z = self.encode(data)
        z = self.decode(z)
        return z

    def encode(self, data: torch.Tensor):
        z = F.leaky_relu(self.line1(data))
        z = F.leaky_relu(self.line2(z))
        z = F.leaky_relu(self.line3(z))
        z = F.leaky_relu(self.line4(z))
        return z

    def decode(self, features: torch.Tensor):
        z = F.relu(self.line5(features))
        return self.line6(z)


# data: list of words in 2d
def idx_data(data: list):
    lookup = sorted(list(set(itertools.chain.from_iterable([sentence_data for sentence_data in data]))))
    lookup = {value: index for index, value in enumerate(lookup, 1)}
    return lookup, {index: value for index, value in enumerate(lookup, 1)}


def coalesce(*inputs):
    for i in range(len(inputs)):
        if inputs[i] is not None:
            return inputs[i]
    return 0


# 1D list of sentences
def preprocess(text: list) -> tuple:
    # Tokenize all sentences to words. Format is 2D: <sentence, word>
    tokenized_dataset = list()
    for joke in text:
        tokenized_dataset.append(nltk.tokenize.word_tokenize(joke, language='russian'))

    # Postprocess tokens
    for joke in range(len(tokenized_dataset)):
        for i in range(len(tokenized_dataset[joke])):
            tokenized_dataset[joke][i] = str(tokenized_dataset[joke][i]).lower().replace("'",'').replace('*', '').strip("'.-")

    # Filter all inputs, leave only those who met more than once
    buffer = list()
    for f in tokenized_dataset:
        buffer += f
    counter = Counter(buffer)
    buffer = list()
    for val,cnt in counter.most_common():
        if cnt > 1:
            buffer.append(val)
        else:
            break
    buffer = {v: k for k,v in enumerate(sorted(buffer))}
    print(buffer)
    
    result = list()
    for file_data in tokenized_dataset:
        file_result = list()
        for x in file_data:
            if buffer.get(x) is not None:
                file_result.append(x)
        result.append(file_result)
    print(max([len(x) for x in result]))
    tokenized_dataset = result

    # Drop tail (optional)
    res_len = [len(tokenized_dataset[i]) for i in range(len(tokenized_dataset))]
    tokenized_dataset = tokenized_dataset[:len(tokenized_dataset) - len(tokenized_dataset) % batch_size]

    # Convert tokens to vectors using positional-like
    word_to_idx, ids_to_word = idx_data(tokenized_dataset)
    indexes = []
    for sentence in tokenized_dataset:
        indexes.append([coalesce(word_to_idx.get(word)) for word in sentence])

    # Pad to 2D matrix
    max_line_len = len(max(tokenized_dataset, key=len))
    tensor = torch.zeros(size=(len(text), max_line_len))
    for i in range(len(indexes)):
        for j in range(len(indexes[i])):
            tensor[i, j] = indexes[i][j]

    return tensor, word_to_idx, ids_to_word, res_len

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
from tqdm import tqdm

# Get some data for model. We have Russian jokes.
batch_size = 64
rus_data = parse_data(files_cnt=5)

# <cnt of lines, cnt of words>
dataset, direct_lookup, reverse_lookup, batch_lens = preprocess(rus_data)

print('Data shape:', len(dataset), len(direct_lookup))
dataset /= len(direct_lookup)

2433
Data shape: 50128 75070


In [3]:
# Device
device = torch.device('cpu') if not torch.cuda.is_available() else torch.device('cuda')

# Our model
model = AE(input_shape=dataset.shape[1])
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=.001)
loss = nn.CrossEntropyLoss()
loss.to(device)


# Train
model.train()
prev_sum = .0
for epoch in range(10):
    loss_sum = .0
    for batch in tqdm(range(len(dataset) // batch_size)):
        optimizer.zero_grad()
        batch_data = dataset[batch_size*batch:(batch+1)*batch_size, :].to(device)
        output = model(batch_data)
        # Generate labels {as True for 0, False for 1}
        labels = torch.sign(torch.abs(torch.round(torch.sum(output - batch_data, dim=1))))

        loss_res = loss(output, labels.long())
        loss_sum += loss_res
        loss_res.backward()
        optimizer.step()
    print(epoch, loss_sum.item())
    if loss_sum.item() < 1e-6:
        break


100%|██████████| 783/783 [00:43<00:00, 17.96it/s]
  0%|          | 3/783 [00:00<00:30, 25.98it/s]

0 8.036158561706543


100%|██████████| 783/783 [00:43<00:00, 17.83it/s]

1 0.0





In [123]:
print(f'Length of direct lookup: {len(direct_lookup)}')
model.eval()

# Use data
for i in range(10):
    testing: torch.Tensor = model(dataset[i, :].to(device))
    # Yes, we had to use normalization in the end.
    testing -= testing.min()
    values: np.ndarray = np.round(testing.cpu().detach().numpy())
    print('> ', rus_data[i], '\n', ' '.join([reverse_lookup.get(idx) for idx in values.tolist()[:batch_lens[i]] if idx in reverse_lookup.keys()]).lower(), end='\n\n')

Length of direct lookup: 75070
>  Поздравляем всех с наступающим новым годомОГНЕННОЙ СОБАКИ БАСКЕРВИЛЛЕЙ!Новый год надо встречать в ОШЕЙНИКАХ, НАМОРДНИКАХ,в одежде из собачей шерсти, на столе должно быть МЯСОи КОСТОЧКИ!!!www.tao.nm.ru 
 беспокойная пристрелили англичане xx бродить iid блдь.и блестящее баскервиль баз алее беда арабами.россия 1612 браузер alex_spb алика березу беседую madonna вероятность was бедненький бандуре безрезультатно.пришедший ата бендукидзе арбузом бензоколонку www.liveastrology.org брякнулся

>  - Ну вот, и погода разгулялась!- Да! Не на шутку! 
 аза купальный professional fig антиглобалисты 80 акциями але абы www.swinga.net moi автозавод shits 10-00

>  - Чей это пепел в избирательной урне?- Этот пепел Надежды избирателей (на честные выборы)! 
 белые-белые превентивные аминь whiskas бочкарев huul бизнес-план билета банды ахметовым аккумуляторов барьер антона 15:00 борисовна ___б актив безупречная бело-голубого

>  Надо бы и россиянам проголосовать против евро-

In [121]:
from random import random

def generate(model: AE, output_size:int=20, model_in: int = dataset.shape[1]//4):
    # Some random noise that will be decoded
    noise = torch.Tensor([[random() for _ in range(model_in)]]).to(device)
    # From 0..1, match to [0 .. dictionary length]
    generated = model.decode(noise) * len(direct_lookup)
    generated -= generated.min()

    # Since we use constant-length vectors, we have to cut results
    values: np.ndarray = np.round(generated.cpu().detach().numpy())[0, :output_size]
    is_it_joke = ' '.join([reverse_lookup.get(idx) for idx in values.tolist() if idx in reverse_lookup.keys()])
    return is_it_joke

generate(model=model, output_size=30)

'подписи браузере приличная прекращаются'