In [5]:
import json
from collections import Counter
from pathlib import Path

import arabic_reshaper
import matplotlib.pyplot as plt
import numpy as np
from bidi.algorithm import get_display
from hazm import word_tokenize, sent_tokenize, Normalizer
from PIL import Image
from wordcloud import WordCloud

### Load JSON data

In [6]:
# Read as UTF-8 explicitly: open() otherwise falls back to the platform's
# locale encoding, which can fail on the non-ASCII text in this chat export.
with open('src/data/CS-Stack.json', encoding='utf-8') as f:
    data = json.load(f)

In [7]:
# Iterator over the exported messages, so single records can be pulled with next().
message = iter(data['messages'])

In [8]:
# Pull the next message record to inspect its fields.
next(message)

{'id': 1,
 'type': 'service',
 'date': '2021-07-06T00:16:27',
 'actor': 'CS Stackoverflow [IRAN]',
 'actor_id': 'channel1564092519',
 'action': 'migrate_from_group',
 'title': 'CS Stackoverflow [IRAN]',
 'text': ''}

### Load and normalize stopwords

In [9]:
def load_stopwords(file_path: str) -> list:
    """
    Load stop words from a plain-text file (one word per line) and normalize them.

    Parameters
    ----------
    file_path : str
        Path to the stop-words ``.txt`` file.

    Returns
    -------
    list of str
        Stop words in file order, stripped of surrounding whitespace and
        passed through hazm's ``Normalizer.normalize``. Blank lines are skipped.
    """
    # Read as UTF-8 explicitly and close the handle deterministically instead
    # of relying on the locale encoding and the garbage collector.
    with open(file_path, encoding='utf-8') as f:
        lines = [line.strip() for line in f]

    # Normalize the stop words the same way the chat text is normalized;
    # an empty line can never match a token, so it is dropped.
    normalizer = Normalizer()
    return [normalizer.normalize(word) for word in lines if word]

In [10]:
# Sanity check: show the first ten normalized stop words.
load_stopwords('src/data/stopwords.txt')[:10]

['و', 'در', 'به', 'از', 'که', 'می', 'این', 'است', 'را', 'با']

### Stopwords detection

In [11]:
def stopwords_detection(text: str, stopwords_file_path: str) -> list:
    """
    Tokenize ``text`` and drop every token that is a stop word.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    stopwords_file_path : str
        Path to the stop-words file, loaded with ``load_stopwords``.

    Returns
    -------
    list of str
        The tokens of ``text``, in their original order, excluding stop words.
    """
    # A set makes each membership test O(1); the original scanned the whole
    # stop-word list once per token.
    stopwords = set(load_stopwords(stopwords_file_path))
    return [token for token in word_tokenize(text) if token not in stopwords]

In [12]:
# Example: filter the stop words out of a short sample sentence.
stopwords_detection(
    text='سلام من امیر محمد یمینی هستم. اما اگر بخواهید من را بدنام کنید، مجبورم فایل نصبی را تغییر دهم.',
    stopwords_file_path='src/data/stopwords.txt'
)

['سلام',
 'امیر',
 'محمد',
 'یمینی',
 'هستم',
 'بخواهید',
 'بدنام',
 'مجبورم',
 'فایل',
 'نصبی',
 'تغییر',
 'دهم']

### Parse text from json file

In [13]:
def parse_json(json_file_path: str, stopwords_file_path: str) -> str:
    """
    Extract the message text of a Telegram chat export with stop words removed.

    Each message's ``text`` field is either a plain string or a list of
    fragments. String fragments of a list are processed; non-string entries
    in such a list are skipped.

    Parameters
    ----------
    json_file_path : str
        Path to the exported chat JSON (must contain a ``messages`` list).
    stopwords_file_path : str
        Path to the stop-words file, loaded with ``load_stopwords``.

    Returns
    -------
    str
        Space-separated tokens: first those of all plain-string messages,
        then those of all list-form fragments. The two groups are kept
        separate, so message order is not preserved across the two forms.
    """
    with open(json_file_path, encoding='utf-8') as f:
        data = json.load(f)

    # Load the stop words once up front; going through stopwords_detection()
    # would re-read and re-normalize the whole file for every message.
    stopwords = set(load_stopwords(stopwords_file_path))

    def _without_stopwords(raw: str) -> str:
        """Tokenize ``raw`` and re-join the tokens that are not stop words."""
        return ' '.join(tok for tok in word_tokenize(raw) if tok not in stopwords)

    # Collect pieces in lists and join once, instead of growing strings with +=.
    plain_parts = []
    fragment_parts = []

    for msg in data['messages']:
        content = msg['text']

        # string text messages
        if isinstance(content, str):
            plain_parts.append(_without_stopwords(content))

        # list text messages: only the plain-string fragments are kept
        elif isinstance(content, list):
            fragment_parts.extend(
                _without_stopwords(item) for item in content if isinstance(item, str)
            )

    # Every piece is prefixed with a single space, matching the original output format.
    plain_text = ''.join(f' {part}' for part in plain_parts)
    fragment_text = ''.join(f' {part}' for part in fragment_parts)
    return plain_text + ' ' + fragment_text

In [None]:
# Run the parser on the chat export; the cell displays the returned string.
# NOTE(review): this shows the entire parsed text and the next code cell parses
# the same file again — consider previewing a slice instead.
parse_json(
    json_file_path='src/data/CS-Stack.json', 
    stopwords_file_path='src/data/stopwords.txt'
)

### Make a word cloud

In [None]:
text_content = parse_json(
    json_file_path='src/data/CS-Stack.json',
    stopwords_file_path='src/data/stopwords.txt'
)

# `normalizer` only exists as a local variable inside load_stopwords(), so it
# must be created here; otherwise this cell raises NameError on a fresh kernel.
normalizer = Normalizer()
text_norm = normalizer.normalize(text_content)

# Reuse the already-normalized text rather than normalizing a second time,
# and show the ten most frequent tokens.
Counter(word_tokenize(text_norm)).most_common(10)

In [None]:
# Reshape only the first 500000 characters of the normalized text, then run
# the bidi algorithm so the result is ordered for display.
reshaped_text = arabic_reshaper.reshape(text_norm[:500000])
text = get_display(reshaped_text)

In [None]:
# def reshaper(text: str):
#     """
#     This function tries to reshape the text and prepare it for display in up to three iterations.
#     If "arabic_reshaper" can't handle the volume of text, each iteration drops 30% of the text.
#     """
#     try:
#         text = arabic_reshaper.reshape(text)
#         text = get_display(text)
#         print(len(text))
#         return text
#     except:
#         # try:
#         # len_text = len(text)
#         # decrease_amount = len_text*(0.3)

#         new_len = int(len(text) - (0.3*len(text)))
#         text = text[:new_len]
#         text = arabic_reshaper.reshape(text)
#         text = get_display(text)
#         print(len(text))
#         return text
# #         except:
# #             try:
# #                 len_text = len(text)
# #                 decrease_amount = len_text*(6/10)

# #                 new_len = int(len_text - decrease_amount)
# #                 text = text[:new_len]
# #                 text = arabic_reshaper.reshape(text)
# #                 text = get_display(text)
# #                 print(len(text))
# #                 return text
# #             except:
# #                 try:
# #                     len_text = len(text)
# #                     decrease_amount = len_text*(8/10)

# #                     new_len = int(len_text - decrease_amount)
# #                     text = text[:new_len]
# #                     text = arabic_reshaper.reshape(text)
# #                     text = get_display(text)
# #                     print(len(text))
# #                     return text
# #                 except Exception as e:
# #                     return e

In [None]:
# for i in range(5):
#     len_text = len(text_norm)
#     decrease_amount = len_text*(i/10)

#     new_len = int(len_text - decrease_amount)
#     text = text_norm[:new_len]

#     print(f"len_text: {len_text}")
#     print(f"decreade amount: {decrease_amount}")
#     print(f"new_len: {len(text)}")
#     print("-"*40)

In [None]:
# Load the logo image and convert it to a NumPy array.
# NOTE(review): the WordCloud(...) call in this notebook passes no `mask`
# argument, so this array looks unused — confirm whether it should be wired in.
logo_image = Image.open('src/data/python_logo.png')
alice_mask = np.array(logo_image)

In [None]:
# Build the word cloud from the reshaped, display-ordered text.
wordcloud = WordCloud(
    font_path='src/data/Mitra_Bold.ttf',
    width=800,
    height=400,
    background_color='white'
).generate(text)

# Draw on an explicit Axes and finish with plt.show() so the cell renders only
# the figure, not the tuple returned by axis("off").
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Read the text file line by line, as UTF-8 explicitly: open() otherwise falls
# back to the platform's locale encoding.
with open('src/data/text.txt', encoding='utf-8') as f:
    text = f.readlines()

In [None]:
# Concatenate all lines into one string; str.join builds it in a single pass
# instead of re-copying the accumulated string on every loop iteration.
text_c = ''.join(text)

In [None]:
# Ten most frequent whitespace-separated tokens in text_c.
Counter(text_c.split()).most_common(10)

In [2]:
# test for task

In [3]:
# test