In [1]:
import nltk
import numpy as np
import pandas as pd
import re
from operator import itemgetter
nltk.download('stopwords')
import json
import os

[nltk_data] Downloading package stopwords to /home/chirag/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from tqdm import tqdm

import gensim
import unicodedata
import spacy

In [3]:
import preprocessor as p
# Select the option set used by later p.clean() calls: URLs, emojis, reserved words and smileys.
# NOTE(review): presumably `preprocessor` is the tweet-preprocessor package and clean() strips
# exactly the selected token types — confirm against the installed version.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.RESERVED, p.OPT.SMILEY)

In [4]:
## STATIC VARIABLES

# Directory that holds the hourly tweet CSV files (plus metadata.csv) read by this notebook.
DATA_DIR = "COVID-19-Twitter-Indian-Data"
# Relative path to a MALLET 2.0.8 launcher. Not referenced by any cell visible here —
# presumably intended for a MALLET-backed LDA model; TODO confirm it is still needed.
MALLET_PATH = 'mallet-2.0.8/bin/mallet'

In [5]:
# List the hourly files, leaving out the metadata file. Filtering (instead of
# list.remove) means a missing metadata.csv no longer raises ValueError.
file_names_hourly = [name for name in os.listdir(DATA_DIR) if name != "metadata.csv"]
#Mapping Files From Hourly to Daily Basis
# Drop the trailing "-<hour>.csv" part by splitting on the last dash; unlike a
# fixed-width slice this does not depend on the hour being exactly two digits.
file_names_daily = [file_name.rsplit('-', 1)[0] for file_name in file_names_hourly]
file_names_df = pd.DataFrame({'Hourly' : file_names_hourly, 'Daily': file_names_daily})

In [6]:
file_names_df.head()

Unnamed: 0,Hourly,Daily
0,coronavirus-tweet-id-2020-05-14-17.csv,coronavirus-tweet-id-2020-05-14
1,coronavirus-tweet-id-2020-04-22-17.csv,coronavirus-tweet-id-2020-04-22
2,coronavirus-tweet-id-2020-03-29-20.csv,coronavirus-tweet-id-2020-03-29
3,coronavirus-tweet-id-2020-05-05-13.csv,coronavirus-tweet-id-2020-05-05
4,coronavirus-tweet-id-2020-03-22-17.csv,coronavirus-tweet-id-2020-03-22


In [7]:
def corrupt_or_not(file_name):
    """Report whether a CSV file inside DATA_DIR fails to load with pandas.

    file_name : name of the file, relative to DATA_DIR.
    return : True if the file is corrupt (pd.read_csv raises), False if it opens."""
    try:
        pd.read_csv(os.path.join(DATA_DIR, file_name))
    except Exception:
        # Any parsing / decoding / IO failure marks the file as corrupt.
        # KeyboardInterrupt and SystemExit are not caught, so a long scan can still be aborted.
        return True
    return False

# Try to load every hourly file once and store the result in a boolean 'Corrupt' column
# (True means pd.read_csv raised for that file).
file_names_df['Corrupt'] = file_names_df['Hourly'].apply(corrupt_or_not)
# Displayed result: number of files per Corrupt value.
file_names_df.groupby('Corrupt').count()

Unnamed: 0_level_0,Hourly,Daily
Corrupt,Unnamed: 1_level_1,Unnamed: 2_level_1
False,3017,3017
True,85,85


In [8]:
# Keep only the files that loaded successfully, then build a plain dict
# mapping each day to the list of its hourly file names.
file_names_df = file_names_df.loc[file_names_df['Corrupt'].eq(False)]
file_daily_hourly_map = {
    day: hourly_names.tolist()
    for day, hourly_names in file_names_df.groupby('Daily')['Hourly']
}

In [9]:
# daily_full_tweets = {}

# for key,files in tqdm(file_daily_hourly_map.items()):
#     hourly_df = [pd.read_csv(os.path.join(*[DATA_DIR,file_name])) for file_name in files]
#     daily_df = pd.concat(hourly_df)
#     daily_df = daily_df[(daily_df['full_text'] != 'No Value Mentioned') | (daily_df['full_retweet_text'] != 'No Value Mentioned')]
#     daily_df.loc[daily_df['full_text'] == 'No Value Mentioned','full_text'] =  daily_df.loc[daily_df['full_text'] == 'No Value Mentioned','full_retweet_text']
#     #Forcefully type casting to str because some values were just float
#     daily_df['full_text'] = daily_df['full_text'].astype(str).apply(p.clean)
#     daily_full_tweets[key] = " ".join(daily_df['full_text'].astype(str).values)

In [10]:
from spacy.tokenizer import Tokenizer

In [11]:
def remove_accent_chars(text):
    """Return ``text`` with accents stripped: NFKD-decompose it, then drop every non-ASCII character."""
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    text : string to clean.
    remove_digits : when True, digits are deleted as well (default keeps them).
    return : the cleaned string.
    """
    # The range must be A-Z: the former A-z also admitted the ASCII characters that sit
    # between 'Z' and 'a' ([ \ ] ^ _ `). The digit handling previously ran the wrong way
    # round (digits were stripped only when remove_digits was False).
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

def cleaner(doc):
    """Return the lower-cased lemmas of ``doc`` joined by single spaces.

    Tokens flagged as stop word, whitespace, punctuation or URL-like are left out.
    """
    kept_lemmas = []
    for token in doc:
        if token.is_stop or token.is_space or token.is_punct or token.like_url:
            continue
        kept_lemmas.append(str(token.lemma_).lower())
    return " ".join(kept_lemmas)


def pipeline_2_tokenizer(daily_df):
    """Run every entry of ``daily_df.full_text`` through ``nlp`` and tokenize the results.

    Falsy pipeline outputs are discarded. Each remaining output is split with the
    module-level ``tokenizer``; a token is kept unless its text is exactly one character
    long or consists of a single repeated character.

    return : a list with one list of token strings per retained entry.
    """
    raw_texts = daily_df.full_text.values.tolist()
    processed = [item for item in nlp.pipe(raw_texts, disable=["tagger", "parser","ner"]) if item]
    return [
        [tok.text for tok in tokenizer(item) if len(tok.text) != 1 and len(set(tok.text)) != 1]
        for item in processed
    ]

# Load the small English spaCy model with max_length set to 2,000,000.
nlp = spacy.load("en_core_web_sm",max_length = 2000000)
# Register the three cleaning callables so they run in the order
# cleaner -> accent_char_removal -> remove_special_char, with cleaner placed first in the pipeline.
# NOTE(review): cleaner returns a str, not a Doc, so the two later components and the
# consumer of nlp.pipe() receive plain strings. This presumably relies on spaCy 2.x
# accepting bare callables in add_pipe — confirm before upgrading spaCy.
nlp.add_pipe(cleaner,name="cleaner",first=True)
nlp.add_pipe(remove_accent_chars,name='accent_char_removal',after='cleaner')
nlp.add_pipe(remove_special_characters,name='remove_special_char',after='accent_char_removal')
# Stand-alone tokenizer built on the model's vocab; pipeline_2_tokenizer applies it to the cleaned text.
tokenizer = Tokenizer(nlp.vocab)
    
def single_frame(file_names):
    """Concatenate all hourly dataframes of one day and return the frame with a fixed full_text column.

    file_names : names of the hourly CSV files (relative to DATA_DIR) belonging to one day.
    return : DataFrame of the day's tweets. Rows where both full_text and
        full_retweet_text are 'No Value Mentioned' are dropped, a placeholder full_text
        is replaced by full_retweet_text, and full_text is passed through p.clean.
    """
    placeholder = 'No Value Mentioned'
    hourly_frames = [pd.read_csv(os.path.join(DATA_DIR, file_name)) for file_name in file_names]
    # Every hourly file is read with its own default index, so renumber on concat to
    # avoid duplicate row labels in the combined frame.
    daily_df = pd.concat(hourly_frames, ignore_index=True)
    has_text = daily_df['full_text'] != placeholder
    has_retweet_text = daily_df['full_retweet_text'] != placeholder
    # Explicit copy: the assignments below must write to this frame, not to a view
    # of the unfiltered one (avoids SettingWithCopyWarning).
    daily_df = daily_df[has_text | has_retweet_text].copy()
    missing_text = daily_df['full_text'] == placeholder
    daily_df.loc[missing_text, 'full_text'] = daily_df.loc[missing_text, 'full_retweet_text']
    # Cast to str first because some values are not strings (e.g. missing values read as float).
    daily_df['full_text'] = daily_df['full_text'].astype(str).apply(p.clean)
    return daily_df

In [12]:
def generate_bow_corpus(text_tokens):
    """Turn tokenized documents into a gensim bag-of-words corpus.

    A Phrases model (min_count=5, threshold=50, '_' delimiter) is fitted on
    ``text_tokens`` and applied to every document, a Dictionary is built from the result
    and pruned with filter_extremes(no_below=2, no_above=0.9), and each document is
    converted with doc2bow.

    return : (bow_corpus, dictionary)
    """
    # A higher threshold yields fewer phrases.
    phrase_model = gensim.models.Phrases(text_tokens, min_count=5, threshold=50, delimiter=b'_')
    phraser = gensim.models.phrases.Phraser(phrase_model)

    phrased_docs = []
    for tokens in text_tokens:
        phrased_docs.append(phraser[tokens])

    vocabulary = gensim.corpora.Dictionary(phrased_docs)
    vocabulary.filter_extremes(no_below=2, no_above=0.9)

    return [vocabulary.doc2bow(tokens) for tokens in phrased_docs], vocabulary

def topic_generator(bow_corpus,dictionary,num_topics=2,chunksize=1740):
    """Fit a gensim LDA model on a bag-of-words corpus.

    bow_corpus : corpus in bag-of-words form.
    dictionary : id-to-word mapping passed as id2word.
    num_topics : number of topics to fit (default 2).
    chunksize : number of documents per training chunk (default 1740).
    return : the trained LdaModel (alpha and eta 'auto', random_state=42,
        500 iterations, 20 passes, eval_every=None).
    """
    # num_topics and chunksize are forwarded to the model; they used to be accepted
    # but silently replaced by the hard-coded values 2 and 1740.
    lda_model = gensim.models.LdaModel(corpus=bow_corpus, id2word=dictionary, chunksize=chunksize,
                                       alpha='auto', eta='auto', random_state=42,
                                       iterations=500, num_topics=num_topics, passes=20, eval_every=None)

    return lda_model

In [13]:
def topic_extraction(num_of_topics = 2, num_words_per_topic = 5):
    """Fit one LDA model per day in file_daily_hourly_map and collect its topic keywords.

    num_of_topics : number of topics to fit and report per day.
    num_words_per_topic : number of keywords reported for each topic.
    return : list of dicts, one per day whose model was fitted successfully, of the form
        {'Key': <day>, 'Topic 0': 'word word ...', 'Topic 1': ...}.
        Days whose model fitting fails are reported on stdout and left out.
    """
    # print_topics renders each topic as a string of weight*"word" terms; pull out the quoted words.
    keyword_pattern = re.compile(r'"\w+"')
    daily_topic_topics = []
    for key,file_names in tqdm(file_daily_hourly_map.items()):
        per_day = {'Key': key}
        daily_df = single_frame(file_names)
        text_tokens = pipeline_2_tokenizer(daily_df)
        bow_corpus,dictionary = generate_bow_corpus(text_tokens)
        try:
            # Forward the requested topic count so the fitted model matches what is printed below.
            lda_model = topic_generator(bow_corpus, dictionary, num_topics=num_of_topics)
            for topic_id, topic in lda_model.print_topics(num_topics=num_of_topics, num_words=num_words_per_topic):
                keywords = [quoted[1:-1] for quoted in keyword_pattern.findall(topic)]
                per_day[f"Topic {topic_id}"] = " ".join(keywords)
        except Exception as exc:
            # Best effort: skip the day, but say why it was skipped.
            print(f"{key}: {exc!r}")
            continue
        daily_topic_topics.append(per_day)
    return daily_topic_topics

In [None]:
# Long-running cell: one LDA model is fitted per day (the progress bar below shows 131 days
# at roughly 18-33 s per day). Days printed in the output are the ones whose fit raised.
daily_topic_topics = topic_extraction(num_of_topics = 2, num_words_per_topic = 5)

  0%|          | 0/131 [00:00<?, ?it/s]

coronavirus-tweet-id-2020-01-21


  5%|▍         | 6/131 [01:31<38:16, 18.37s/it]

coronavirus-tweet-id-2020-01-26


  6%|▌         | 8/131 [03:03<1:08:21, 33.35s/it]