# COVID 19 Analysis
a topic-modeling approach

Kiarash Kiani

kiani@kiarash.info

In [1]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, LdaMulticore
from hazm import word_tokenize, stopwords_list
from nltk.corpus import PlaintextCorpusReader
import pandas as pd
import emoji
from multiprocessing import Pool
import tqdm
import time

# 1. Loading dataset

In [2]:
# Read the CSV and keep only the text of the 'Content' column for analysis.
df = pd.read_csv('data/Labeled-Data-v1.csv')
# pandas loads empty cells as float NaN, which the string-only cleaning and
# tokenizing steps cannot handle. Replacing them with '' (instead of dropping
# the rows) keeps `docs` aligned one-to-one with the rows of `df`.
docs = df['Content'].fillna('').astype(str).values

## 1.1 Removing Emojis

In [3]:
def remove_emoji(text):
    """Return ``text`` with all emoji characters removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without emoji.
    """
    # emoji 2.0 removed get_emoji_regexp(); replace_emoji() is the supported
    # replacement. The regexp path is kept so older installs still work.
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(u'', text)

# Strip emoji from every document, keeping the original order.
docs = list(map(remove_emoji, docs))

## 1.2 Removing URLs

In [4]:
import re

def remove_url(text):
    """Return ``text`` with every http/https URL removed.

    Only the URL token itself (the scheme plus the following run of
    non-whitespace characters) is deleted. Words that come after a URL on the
    same line, and the line breaks themselves, are preserved.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without URLs.
    """
    # The previous pattern used a greedy `.*[\r\n]*`, which also deleted all
    # text following the URL up to the end of the line and the newline after it.
    return re.sub(r'https?://\S*', '', text)

# Strip URLs from every document, keeping the original order.
docs = list(map(remove_url, docs))

# 2. Tokenizing Documents

In [5]:
# Split each cleaned document into a list of tokens with hazm's tokenizer.
tokenized_docs = list(map(word_tokenize, docs))

# 3. Removing Stop Words

## 3.1 Comparing hazm stopwords with our collection

In [6]:
# Stop words shipped with hazm.
HAZM_STOPWORDS = stopwords_list()

# Project stop-word file: one entry per line. Surrounding whitespace is
# stripped and blank lines are skipped, so a padded entry still matches its
# token and the trailing newline does not add an empty "word".
with open('data/stopwords.txt', encoding='utf-8') as words_file:
    MY_STOPWORDS = [line.strip() for line in words_file if line.strip()]

# Extra tokens to drop: the terms "ویروس", "کرونا" and "#کرونا" (added here on
# top of the file's entries), the "RT" marker, and punctuation tokens.
MY_STOPWORDS.extend([
    "ویروس",
    "کرونا",
    "#کرونا",
    "??",
    "???",
    "?",
    "!!!",
    "!!",
    "!",
    "RT",
    ".",
    "،",
    ":",
    "؟",
    "»",
    "«"
])

print(f'hazm stopwords: {len(HAZM_STOPWORDS)}, my stopwords: {len(MY_STOPWORDS)}')

hazm stopwords: 389, my stopwords: 1332


In [29]:
# Merge both stop-word collections into one set so each token needs a single
# O(1) membership test instead of two linear scans over lists. Dropping words
# found in either collection gives the same result as the two sequential
# filtering passes did.
stopword_set = set(MY_STOPWORDS) | set(HAZM_STOPWORDS)
cleaned_data = [
    [word for word in doc if word not in stopword_set]
    for doc in tokenized_docs
]

In [None]:
# Preview only the first few cleaned documents; printing the whole corpus
# floods the notebook output and bloats the saved file.
cleaned_data[:5]

# 4. Create a corpus from a list of texts

In [30]:
# Build the token <-> id mapping from the cleaned documents.
common_dictionary = Dictionary(cleaned_data)
# Convert every document to its bag-of-words representation.
common_corpus = list(map(common_dictionary.doc2bow, cleaned_data))

# 5. Creating Model

In [31]:
# Train the LDA topic model on the bag-of-words corpus.
# - num_topics=100 is gensim's documented default, written out so the topic
#   count is visible here; it does not change the model.
# - random_state fixes the seed so a re-run gives comparable topics.
#   NOTE(review): with several workers, exact run-to-run equality may still
#   not be guaranteed -- confirm against the gensim documentation.
lda = LdaMulticore(
    common_corpus,
    id2word=common_dictionary,
    num_topics=100,
    workers=3,
    passes=10,
    random_state=42,
)

# 6. Results

In [34]:
# Fetch every topic (-1) once, each with its 30 top words as
# (word, weight) pairs, and reuse the result for the count and the report.
all_topics = lda.show_topics(-1, 30, formatted=False)
print(f'number of topics: {len(all_topics)} \n')

# One line per topic: "Topic: <id>, Words: [(word weight), ...]".
report = "".join(
    'Topic: {}, Words: [{}]'.format(
        idx, ', '.join(f"({word} {weight})" for word, weight in topic)
    ) + "\n"
    for idx, topic in all_topics
)

# The report contains Persian text, so the encoding is set explicitly instead
# of relying on the platform default, which may not be able to encode it.
with open('report.txt', 'w', encoding='utf-8') as report_file:
    report_file.write(report)

36597594153136015), (کارکنان 0.003513590432703495), (خانگی 0.0033201680053025484), (یکبار 0.0032928932923823595), (لوازم 0.0032476093620061874), (دستکش 0.0031157126650214195), (سوختگیری 0.0030582156032323837), (هزینه 0.0029574416112154722), (پرداخت 0.0028756828978657722), (ترامپ 0.002827220130711794), (ثانوی 0.0027075756806880236), (کار 0.002616898389533162), (دولت 0.0025682495906949043), (ضدعفونی 0.0024624636862426996), (خبر 0.0023861045483499765), (خرید 0.0022948109544813633), (جهان 0.0022885131184011698), (فرآیند 0.0022852204274386168), (اطلاعیه 0.0022766971960663795), (مواد 0.002259295666590333), (کارت 0.0022212115582078695), (مقابله 0.0022211677860468626)]
Topic: 79, Words: [(بیماری 0.014309586025774479), (نفر 0.00873642135411501), (کودکان 0.007221418432891369), (گرمخانه 0.007185801398009062), (بهداشت 0.0061977519653737545), (تهران 0.006101210601627827), (سال 0.00592300109565258), (علمی 0.005854554008692503), (کار 0.004980584140866995), (قرنطینه 0.004801157396286726), (شیوع 0.0047

In [None]:
# Count the topics returned by show_topics() when asking for 20 words each.
# NOTE(review): num_topics is not passed, so this presumably reflects the
# library's default limit rather than the model's full topic count -- confirm.
shown_topics = lda.show_topics(num_words=20)
print(len(shown_topics))

In [None]:
# Count the topics returned by print_topics() with its default arguments.
printed_topics = lda.print_topics()
print(len(printed_topics))