<a href="https://colab.research.google.com/github/ldash/Minimalist-Google-Calendar/blob/master/WordCloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import numpy as np
import requests
import string

from io import BytesIO
from nltk import word_tokenize
from nltk.corpus import stopwords
from PIL import Image
from wordcloud import WordCloud

# Base directory for all file I/O in this notebook: the chat export is read
# from it and the generated SVGs are written to it. Paths are built by plain
# string concatenation (prefix + name), so the trailing slash is required.
# NOTE(review): the path looks like a mounted Google Drive in Colab - confirm
# the drive is mounted before running.
prefix = '/content/drive/MyDrive/Colab Notebooks/'

def remove_chars_from_text(text, chars):
    """Return ``text`` without any of the characters contained in ``chars``.

    Args:
        text: String to filter.
        chars: Container of characters to drop (membership is tested with
            ``in``).

    Returns:
        str: The remaining characters of ``text`` in their original order.
    """
    kept = []
    for symbol in text:
        if symbol in chars:
            continue
        kept.append(symbol)
    return ''.join(kept)


def process(input_text):
    """Tokenize chat text and drop tokens that carry no meaning for the cloud.

    Punctuation, digits and a few typographic characters are stripped, the
    rest is split with NLTK's ``word_tokenize`` and a token is discarded when
    it is an ignored word, starts with ``https`` or is a single character.

    Args:
        input_text: Text to tokenize. Matching against the ignored words is
            case-sensitive and the custom entries below are lower-case, so
            the text should already be lower-cased.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    # Strip both character classes in a single pass over the text; the result
    # is the same as removing them one after the other.
    input_text = remove_chars_from_text(
        input_text, string.punctuation + string.digits + '\xa0«»\t—…')
    text_tokens = word_tokenize(input_text)

    # A set gives constant-time membership tests; with a list every token was
    # compared against every stopword in turn.
    ignored_words = set(stopwords.words('russian'))
    # Chat-export boilerplate, participant names and other words that would
    # otherwise dominate the cloud.
    ignored_words.update([
        'alyona', 'bondarenko', 'lev', 'dashevskiy', 'media', 'omitted', 'deleted', 'message', 'check', 'add',
        'favorites', 'mean', 'list', 'keep', 'top', 'picks', 'track', 'address', 'missed', 'voice', 'call',
        'video', 'file', 'attached', 'live', 'location', 'payment', 'shared', 'hotel', 'home', 'wholesale',
        'wishlist', 'linkedin', 'verification', 'started', 'smart', 'null', 'entrance', 'station', 'code', 'power',
        'отличное', 'жилье', 'городе', 'улица', 'проспект',
    ])

    # Punctuation is already gone at this point, so a pasted link has
    # collapsed into one long token that still begins with "https".
    return [
        token for token in text_tokens
        if not (
            token in ignored_words or
            token.startswith('https') or
            len(token) == 1
        )
    ]


# def clean(text_tokens_input):
#     import pymorphy2
#     morph = pymorphy2.MorphAnalyzer()
#     text_tokens_norm = [morph.parse(r)[0].normal_form for r in text_tokens_input]
#     return text_tokens_norm


def form_matrix(img):
    """Build a mask array from the last channel of an image.

    Args:
        img: Image (or array-like) that converts to an array with at least
            three dimensions; the third axis is treated as the channel axis.

    Returns:
        numpy.ndarray: Bitwise inverse of the last channel.
    """
    pixels = np.array(img)
    # The last channel is the alpha channel when the image is RGBA
    # (presumably the case for the silhouettes used here - verify).
    last_channel = pixels[:, :, -1]
    return ~last_channel


def read_img_from_url(url, timeout=30):
    """Download an image and open it with Pillow.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The image object produced by ``Image.open`` from the response body.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of letting Pillow choke on an
    # HTML error page further down.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


def plot(key_input, tokens, bc, cw, cc, mask, invert, custom_ending=''):
    """Render a word cloud from ``tokens`` and save it as an SVG under ``prefix``.

    The file name is ``<key_input>_<bc>[_inv][_<custom_ending>].svg``.

    Args:
        key_input: Label used as the first part of the file name.
        tokens: Words to draw; they are joined with spaces and handed to
            ``WordCloud.generate``.
        bc: Background color; also becomes part of the file name.
        cw: Contour width passed to ``WordCloud``.
        cc: Contour color passed to ``WordCloud``.
        mask: Array passed to ``WordCloud`` as the shape mask.
        invert: When true, the mask is bitwise-inverted before use and
            ``_inv`` is appended to the file name.
        custom_ending: Optional extra suffix for the file name.
    """
    file_name = key_input + '_' + bc
    if invert:
        mask = np.invert(mask)
        file_name += '_inv'
    if custom_ending:
        file_name += '_' + custom_ending
    out_path = prefix + file_name + '.svg'

    svg = WordCloud(
        random_state=0,  # fixed seed so repeated runs give the same layout
        max_words=len(tokens),  # do not cap the word count below the input size
        mask=mask,
        background_color=bc,
        contour_width=cw,
        contour_color=cc
    ).generate(' '.join(tokens)).to_svg(embed_font=True)

    # Write as UTF-8 explicitly: the words are largely Cyrillic and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(svg)
    print('Finished writing to: ' + out_path)


def _size_for_height(size, target_height):
    """Scale a ``(width, height)`` pair proportionally to exactly ``target_height``."""
    width, height = size
    # Multiply before dividing and round the width; the height is taken
    # verbatim so float truncation can never leave it one pixel short.
    return max(1, round(width * target_height / height)), target_height


def adjust_sizes(img_url_l_input, img_url_a_input):
    """Download two images and shrink the taller one to the other's height.

    Aspect ratio is preserved. When both heights are already equal, the
    second image is resized to its own size, i.e. left unchanged in content.

    Args:
        img_url_l_input: URL of the first image.
        img_url_a_input: URL of the second image.

    Returns:
        tuple: ``(first_image, second_image)`` with identical heights.
    """
    img_l_input = read_img_from_url(img_url_l_input)
    img_a_input = read_img_from_url(img_url_a_input)
    height_l = img_l_input.size[1]
    height_a = img_a_input.size[1]
    if height_l > height_a:
        img_l_input = img_l_input.resize(_size_for_height(img_l_input.size, height_a))
    else:  # height_a >= height_l
        img_a_input = img_a_input.resize(_size_for_height(img_a_input.size, height_l))
    return img_l_input, img_a_input


# Resources needed by process(): the stopword lists and the tokenizer model.
nltk.download('stopwords')
nltk.download('punkt')

# Read the whole chat export, lower-cased so that sender markers and
# stopwords can be matched without regard to case.
try:
    # The context manager closes the file even if reading or decoding fails.
    with open(prefix + 'WhatsApp Chat with Alyona Bondarenko.txt', 'r', encoding='utf-8') as chat_file:
        text = chat_file.read().lower()
except Exception as e:
    # Show the short message first, then re-raise unchanged so the full
    # traceback is still available.
    print(e)
    raise

# Split the export into lines and pick out the lines that carry each
# participant's sender marker.
arr = text.split('\n')
l = list(filter(lambda line: " - lev dashevskiy: " in line, arr))
a = list(filter(lambda line: " - alyona bondarenko: " in line, arr))

# One text blob per word cloud: everything combined, and one per participant.
d = dict(comb=' '.join(arr), l=' '.join(l), a=' '.join(a))

l_tokens, a_tokens = process(d['l']), process(d['a'])

# Per-participant clouds shaped by head silhouettes, brought to a common
# height first.
img_url_l = 'http://clipart-library.com/images_k/silhouette-of-man-head/silhouette-of-man-head-2.png'
img_url_a = 'http://clipart-library.com/images_k/woman-silhouette-art/woman-silhouette-art-1.png'
img_l, img_a = adjust_sizes(img_url_l, img_url_a)
plot(
    key_input='l',
    tokens=l_tokens,
    bc='aliceblue',
    cw=3,
    cc='white',
    mask=form_matrix(img_l),
    invert=False,
    custom_ending='head',
)
plot(
    key_input='a',
    tokens=a_tokens,
    bc='mistyrose',
    cw=3,
    cc='white',
    mask=form_matrix(img_a),
    invert=False,
    custom_ending='head',
)

# img_url_l = 'http://clipart-library.com/images_k/silhouette-man-in-suit/silhouette-man-in-suit-20.png'
# img_url_a = 'http://clipart-library.com/images_k/african-american-woman-silhouette/african-american-woman-silhouette-21.png'
# img_l, img_a = adjust_sizes(img_url_l, img_url_a)
# plot('l', l_tokens, 'aliceblue', 3, 'white', form_matrix(img_l), False, 'height')
# plot('a', a_tokens, 'mistyrose', 3, 'white', form_matrix(img_a), False, 'height')
# plot('l', l_tokens, 'aliceblue', 3, 'white', form_matrix(img_l), True, 'height')
# plot('a', a_tokens, 'mistyrose', 3, 'white', form_matrix(img_a), True, 'height')


# Cloud built from the whole conversation, shaped by a heart image.
key = 'comb'
text_tokens_orig = process(d[key])

full = read_img_from_url(
    'http://clipart-library.com/images_k/black-heart-transparent-background/'
    'black-heart-transparent-background-1.png'
)
# broken = read_img_from_url('http://clipart-library.com/img/1087197.png')
# broken = broken.resize((int(broken.size[0]*0.5), int(broken.size[1]*0.5)))

# plot(key, text_tokens_orig, 'white', 0, '', form_matrix(full), False, 'full')
# plot(key, text_tokens_orig, 'white', 0, '', form_matrix(full), True, 'full')
plot(
    key_input=key,
    tokens=text_tokens_orig,
    bc='black',
    cw=0,
    cc='',
    mask=form_matrix(full),
    invert=False,
    custom_ending='full',
)
# plot(key, text_tokens_orig, 'black', 0, '', form_matrix(full), True, 'full')

# plot(key, text_tokens_orig, 'white', 0, '', form_matrix(broken), False, 'broken')
# plot(key, text_tokens_orig, 'black', 0, '', form_matrix(broken), False, 'broken')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Finished writing to: /content/drive/MyDrive/Colab Notebooks/l_aliceblue_head.svg
Finished writing to: /content/drive/MyDrive/Colab Notebooks/a_mistyrose_head.svg
Finished writing to: /content/drive/MyDrive/Colab Notebooks/comb_black_full.svg
