# Exploratory Data Analysis

## Word2Vec Model Creation

In [None]:
#Libraries
import nltk
import gensim
from gensim import corpora, models, similarities
from gensim.models import word2vec
%matplotlib inline

In [None]:
# Word2Vec model: load the previously saved model from disk.
#
# The two commented-out lines below train a model on `tok_corp` and save it
# under the same file name that is loaded here; they are kept for reference.
#Covidtweet_selective_model = gensim.models.Word2Vec(tok_corp, size=100, window=10, min_count=500, workers=4)
#Covidtweet_selective_model.save('Covidtweet_model_7.20')

# `word2vec` is the gensim.models submodule imported in the libraries cell.
Covidtweet_model = word2vec.Word2Vec.load("Covidtweet_model_7.20")

In [None]:
# Word2Vec vocabulary: collect the words known to the model and report the count.

# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`.
# Prefer the new mapping and fall back to the old attribute so this cell runs
# on either major version. Both are dict-like and keyed by word, so `len()`
# and `list()` behave the same on each.
tweet_vocab = getattr(Covidtweet_model.wv, "key_to_index", None)
if tweet_vocab is None:
    tweet_vocab = Covidtweet_model.wv.vocab

words = len(tweet_vocab)
# Plain list of the vocabulary words; passed as the `word` option list to the
# interactive closest-word widget further down.
widget_input = list(tweet_vocab)
print(f'There are {words} words in this Word2Vec vocabulary.')

## Top 50 Tokens In Corpus

In [None]:
# Top tokens: bar chart of the most frequent tokens in `clean_tweet`.
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer

# Create a wide figure up front; the visualizer is not given an explicit `ax`.
# NOTE(review): presumably it draws on this current figure -- confirm.
fig= plt.figure(figsize=(20,5))
plt.xticks(rotation=25)

# Bag-of-words counts: one row per document, one column per token.
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(clean_tweet)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when it exists and keep the old call as a
# fallback. `.tolist()` keeps `features` a plain list of str as before.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(docs)
visualizer.show()

In [None]:
# Closest-word graph, driven by an ipywidgets control.
from tweet_functions import *
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# `fixed(...)` pins an argument so no control is generated for it; only `word`
# receives a widget, populated from the vocabulary list `widget_input`.
# `display_closestwords_tsnescatterplot` is expected to come from the
# `tweet_functions` star import above.
interact(
    display_closestwords_tsnescatterplot,
    model=fixed(Covidtweet_model),
    word=widget_input,
    size=fixed(100),
)



## Topic Modeling with Latent Dirichlet Allocation

In [None]:
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary

# Map every token in the tokenised corpus to an integer id, then convert each
# document into its bag-of-words (token id, count) representation.
common_dictionary = Dictionary(tok_corp)
common_corpus = [common_dictionary.doc2bow(text) for text in tok_corp]

# Fit a 20-topic LDA model with a learned asymmetric document-topic prior.
# - `id2word` lets the model render topics as words instead of raw token ids.
# - `random_state` fixes the seed so a re-run reproduces the same topics
#   (the value 42 is arbitrary).
lda = LdaModel(
    common_corpus,
    num_topics=20,
    id2word=common_dictionary,
    alpha='auto',
    eval_every=5,
    random_state=42,
)

In [None]:
from gensim import corpora, models

list_of_list_of_tokens = list(tok_corp)
dictionary_LDA = corpora.Dictionary(list_of_list_of_tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in list_of_list_of_tokens]

num_topics = 20
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

In [None]:
# Print 10 words for each of up to 15 topics of the fitted model, one topic
# per paragraph. With formatted=True every entry is a (topic id, string) pair.
# NOTE(review): this rebinds `num_topics`, which the training cell set to 20.
num_topics = 15
for i, topic in lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=True):
    # The trailing "\n" reproduces the blank line between topics.
    print(f"{i}: {topic}\n")

In [None]:
import pyLDAvis

# Newer pyLDAvis releases renamed the `pyLDAvis.gensim` submodule to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell works with either release.
try:
    import pyLDAvis.gensim_models as pyldavis_gensim
except ImportError:
    import pyLDAvis.gensim as pyldavis_gensim

# Build the interactive topic visualisation from the trained model, its
# bag-of-words corpus and the matching dictionary, then render it inline.
vis = pyldavis_gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

## Top N-Grams

In [None]:
import nltk, re, string, collections
from nltk.util import ngrams # function for making ngrams
#from nltk import bigrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter

In [None]:
# Getting trigrams: raw counts (X1) and TF-IDF weights (X2) over 3-word sequences.
vectorizer = CountVectorizer(ngram_range = (3,3))
X1 = vectorizer.fit_transform(clean_tweet)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when it exists and keep the old call as a
# fallback. `.tolist()` keeps `features` a plain list of str as before.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

# Same n-gram range and default tokenisation as the count vectoriser above.
# NOTE(review): the ranking cell pairs `features` (from the count vectoriser)
# with the columns of X2 -- this relies on both vectorisers producing the same
# column order; confirm or take the names from this vectoriser instead.
vectorizer = TfidfVectorizer(ngram_range = (3,3))
X2 = vectorizer.fit_transform(clean_tweet)

# NOTE(review): `toarray()` densifies the full documents x trigrams matrix and
# can exhaust memory on a large corpus; `scores` is not used in the visible
# cells -- drop this line if nothing else needs it.
scores = (X2.toarray())

In [None]:
# Rank trigrams by their total TF-IDF weight across all documents.
import pandas as pd  # not imported anywhere in the visible cells; harmless if already loaded

# Column-wise totals of the TF-IDF matrix: one value per trigram.
sums = X2.sum(axis = 0)

# `features` is rebound by more than one cell in this notebook; make sure the
# names currently in scope really describe the columns of X2, otherwise terms
# would be silently paired with the wrong scores.
assert X2.shape[1] == len(features), (
    "features does not match the columns of X2 -- re-run the trigram cell"
)

# One (term, total weight) record per trigram, built in a single pass.
data1 = [(term, sums[0, col]) for col, term in enumerate(features)]
ranking = pd.DataFrame(data1, columns = ['term','rank'])

# Highest total weight first; show the top 20.
words = ranking.sort_values('rank', ascending = False)
print ("\n\nWords head : \n", words.head(20))