In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
data = pd.read_excel("Hurricane_Harvey.xlsx")
print(data[["Likes","Retweets"]].describe())

# extracted_data = data[(data["Likes"] >= 3) & (data["Retweets"] >= 2)]
# text_corpus = extracted_data["Tweet"]
text_corpus = data["Tweet"]

               Likes       Retweets
count  398867.000000  398867.000000
mean        3.660571       2.171333
std        28.014071      18.606119
min         0.000000       0.000000
25%         0.000000       0.000000
50%         0.000000       0.000000
75%         1.000000       0.000000
max       993.000000     991.000000


In [3]:
new_corpus = []

for text in text_corpus:

    no_url = re.sub(r"http\S+", "", str(text))
    new_corpus.append(no_url)

new_corpus = np.array(new_corpus)

In [4]:
tokenizer = RegexpTokenizer(r'\w+')

tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range = (1,1),
                        tokenizer = tokenizer.tokenize,
                        max_features=  500,
                        encoding='utf-8')

train_data = tfidf.fit_transform(new_corpus.astype('U'))




In [5]:
# Define the number of topics or components
num_components=20

# Create LDA object
model=LatentDirichletAllocation(n_components=num_components)

# Fit and Transform SVD model on data
lda_matrix = model.fit_transform(train_data)

# Get Components 
lda_components=model.components_

In [6]:
# Print the topics with their terms
terms = tfidf.get_feature_names_out()

for index, component in enumerate(lda_components):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['safe', 'stay', 'path', 'praying', 'hurricaneharvey', 'texas', 'hope']
Topic 1:  ['hurricaneharvey', 'harvey2017', 'prayfortexas', 'louisiana', 'harvey', 'watching', 'texas']
Topic 2:  ['oil', 'gas', 'prices', 'harvey', 'hurricane', 'god', 'slams']
Topic 3:  ['dog', 'jeff_piotrowski', 'periscope', 'harvey', 'food', 'hurricane', 'house']
Topic 4:  ['disaster', 'trump', 'hurricane', 'harvey', 'updates', 'major', 'texas']
Topic 5:  ['prayers', 'affected', 'thoughts', 'harvey', 'hurricane', 'way', 'hurricaneharvey']
Topic 6:  ['mph', 'winds', 'hurricane', 'katrina', 'harvey', '130', 'climate']
Topic 7:  ['news', 'hurricane', 'category', 'breaking', 'harvey', 'texas', '4']
Topic 8:  ['powerful', 'u', 'know', 'hurricane', 'harvey', 'need', 'emergency']
Topic 9:  ['space', 'seen', 'station', 'nasa', 'international', 'cupola', 'com']
Topic 10:  ['good', 'got', 'hurricane', 'harvey', 'check', 't', 'evacuate']
Topic 11:  ['com', 'twitter', 'pic', 'hurricane', 'harvey', 'hurricaneharve