In [None]:
import pandas as pd

# text preprocessing
import preprocessor as p
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import re

# visualization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans 

import umap.umap_ as umap
from yellowbrick.text import UMAPVisualizer

import matplotlib.pyplot as plt

In [None]:
# Stop-word set used by cleanText() and by the CountVectorizer cell:
# NLTK's English list plus a handful of corpus-specific pandemic terms.
stop_words = set(stopwords.words('english')).union(
    ['coronavirus', 'covid', 'covidー', 'coronavirusoutbreak', 'coronaviruspandemic']
)
# NOTE(review): cleanText() strips punctuation *before* it filters stop words,
# so NLTK entries that contain an apostrophe (e.g. "don't") will not match the
# cleaned token ("dont"). Adding apostrophe-free variants would close that gap,
# but it changes which words survive cleaning -- confirm before enabling.

# Lemmatizer instance; the lemmatization step in cleanText() is currently disabled.
lemmatizer = WordNetLemmatizer()

In [None]:
def cleanText(text):
    """Clean a collection of raw tweets for bag-of-words modelling.

    Each element goes through the following steps, in order:

    1. tweet-preprocessor removes URLs, @mentions, emojis and smileys.
    2. 'U.S.' is expanded to 'United States' so the acronym is not destroyed
       by the punctuation/single-letter removal that follows.
    3. ASCII punctuation (``string.punctuation``) is removed.
    4. Runs of digits are removed.
    5. The text is lower-cased.
    6. Stand-alone single ASCII letters are removed.
    7. Whitespace runs are collapsed to a single space.
    8. The text is tokenized with NLTK ``word_tokenize`` and tokens found in
       the module-level ``stop_words`` collection are dropped.
    9. The remaining tokens are joined back together with single spaces.

    Lemmatization is not applied.

    Parameters
    ----------
    text : object exposing ``.map`` (a pandas Series of strings is assumed)
        The raw tweet texts.

    Returns
    -------
    The result of ``text.map(...)``: one cleaned string per input element.
    Elements that are not ``str`` (for example NaN for a missing tweet) are
    mapped to the empty string instead of raising inside the cleaning steps.
    """
    # Configure tweet-preprocessor (module-wide setting) to strip only these
    # four kinds of tokens.
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)

    # Build the translation table and compile the patterns once, not per tweet.
    punctuation_table = str.maketrans('', '', string.punctuation)
    digit_run = re.compile(r'\d+')
    whitespace_run = re.compile(r'\s+')
    # Lookarounds instead of consuming the surrounding whitespace: this also
    # catches a lone letter at the very start/end of the tweet and every letter
    # in a run such as "a b c" (a consuming pattern skips every second one).
    lone_letter = re.compile(r'(?<!\S)[a-zA-Z](?!\S)')

    def clean_one(tweet):
        # Missing / non-text cells have nothing to clean.
        if not isinstance(tweet, str):
            return ''
        tweet = p.clean(tweet)
        tweet = tweet.replace('U.S.', 'United States')
        tweet = tweet.translate(punctuation_table)
        tweet = digit_run.sub('', tweet)
        tweet = tweet.lower()
        tweet = lone_letter.sub(' ', tweet)
        tweet = whitespace_run.sub(' ', tweet)
        tokens = word_tokenize(tweet)
        return ' '.join(word for word in tokens if word not in stop_words)

    # A single pass over the data instead of one pass per cleaning step.
    return text.map(clean_one)

In [None]:
# Replace the raw tweet text with its cleaned form (see cleanText above).
# NOTE(review): this overwrites the column in place, so the raw text is no
# longer available afterwards and re-running the cell cleans already-cleaned text.
new_df['full_text'] = cleanText(new_df['full_text'])

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating ``components_`` must yield one weight vector per topic, and
        each vector must support ``argsort()`` (e.g. a NumPy array).
    feature_names : indexable
        Maps a column position in the weight vector to its feature name.
    no_top_words : int
        Number of top-weighted features to print for each topic.
    topic_names : sequence or mapping, optional
        Human-readable topic labels looked up by topic position. A topic whose
        label is missing, falsy, or beyond the end of ``topic_names`` is
        printed under its numeric position instead.

    Returns
    -------
    None. The output is written with ``print``.
    """
    def label_for(position):
        # Tolerate label collections that do not cover every topic rather
        # than failing part-way through the printout.
        if not topic_names:
            return None
        try:
            return topic_names[position]
        except (IndexError, KeyError):
            return None

    for ix, topic in enumerate(model.components_):
        label = label_for(ix)
        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")
        # argsort is ascending; the reversed slice walks back from the end to
        # pick the `no_top_words` largest weights, largest first.
        top_positions = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_positions]))

In [None]:
# Row labels 'component_0' .. 'component_4', one per topic-model component.
index_names = ['component_{}'.format(number) for number in range(5)]

In [None]:
# Build the document-term matrix from the cleaned tweets.
#
# don't use TF-IDF for tweets
# use CountVectorizer (raw counts) because each tweet is a small document
# suggestion - binary = True ~ can throw in a lot of noise
cv = CountVectorizer(
    ngram_range=(1, 1),
    # scikit-learn expects a list here (newer releases reject a set);
    # sorting keeps the argument deterministic between runs.
    stop_words=sorted(stop_words),
    min_df=3,
    max_df=0.85,
)
term_counts = cv.fit_transform(new_df.full_text)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(cv, 'get_feature_names_out'):
    vocabulary_terms = cv.get_feature_names_out()
else:
    vocabulary_terms = cv.get_feature_names()

# Dense frame: one row per tweet (same index as new_df), one column per term.
data_dtm = pd.DataFrame(term_counts.toarray(), columns=vocabulary_terms)
data_dtm.index = new_df.index