# Part b3 -- Topic Modeling

Identifying topics that exist in our Twitter data space.

**Load the shared library code**

In [34]:
# Change the working directory before importing the project helpers; the relative
# data path used later in this notebook is resolved against this directory.
# NOTE(review): this is a machine-specific absolute path -- the notebook will not
# run elsewhere without editing it.
from os import chdir
chdir('/home/jovyan/work/Analyzing_Unstructured_Data_for_Finance/Analyzing_Unstructured_Data_for_Finance/')

# Star-import of the project's helper package.
# NOTE(review): names used later but never imported explicitly in this notebook
# (e.g. `pd`, `joblib`) presumably come from this import -- confirm against lib.
from lib import *
# suppress_warnings()

In [72]:
# Load the object persisted at data_cleaned/b1.X.pickle (path is relative to the
# current working directory).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artifacts from a trusted source.
X = joblib.load('../Analyzing_Unstructured_Data_for_Finance/data_cleaned/b1.X.pickle')

In [76]:
# Keep only the 'cleaned_text' entry of the loaded object; from here on X is the
# text that gets vectorized.
# NOTE(review): this rebinds X, so the cell is not idempotent -- running it a
# second time indexes the already-extracted object instead of the loaded one.
# Re-run the load cell first if this cell needs to be executed again.
X = X['cleaned_text']

**Rebuild the TF-IDF vectorizer just for topic modeling, because we may actually want to keep some noisy words here to help us interpret the topics**

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.feature_extraction import text

In [98]:
# Start from scikit-learn's built-in English stop words and extend the set with
# additional terms that should be excluded from the topic-model vocabulary.
stopwords = set(ENGLISH_STOP_WORDS)
stopwords.update((
    # first batch
    'rt', 'just', 'amp', 'says', 're', 'via', 'will',
    'say', 'one', 'may', 'time', 'make', 'right',
    # second batch
    'don', 'new', 'today', 'year', 'like', 'day', 'know', 'need',
))

In [99]:
# Build a unigram + bigram TF-IDF representation, dropping terms that occur in
# fewer than 20 documents and everything in the custom stop-word set.
# `stop_words` is passed as a list: recent scikit-learn releases validate this
# parameter and reject a `set` (only 'english', a list, or None are accepted).
# Sorting makes the list order deterministic; older releases accept a list too.
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=20, stop_words=sorted(stopwords))
X_tfidf = tfidf.fit_transform(X)

In [100]:
from sklearn.decomposition import TruncatedSVD

In [101]:
def perform_latent_semantic_analysis(n_components, vectorizer, random_state=None):
    """Run latent semantic analysis (truncated SVD) on the notebook's TF-IDF matrix.

    The matrix that is decomposed and the row labels of the result are read from
    the notebook-level variables ``X_tfidf`` and ``X``; they are not parameters.
    ``vectorizer`` is only used to label the vocabulary axis, so it must be the
    fitted vectorizer that produced ``X_tfidf``.

    Parameters
    ----------
    n_components : int
        Number of SVD components (topics) to extract.
    vectorizer : fitted vectorizer
        Object exposing ``get_feature_names_out()`` (or the older
        ``get_feature_names()``) for the columns of ``X_tfidf``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``TruncatedSVD``. The default keeps the previous unseeded
        behaviour; pass an int to make the decomposition reproducible.

    Returns
    -------
    latent_semantic_analysis : pandas.DataFrame
        Component scores, indexed like ``X`` with one column per component.
    vocabulary_expression : pandas.DataFrame
        Component loadings, one row per component and one column per term.
    svd : TruncatedSVD
        The fitted decomposition object.
    """
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    component_names = ["component_"+str(i+1) for i in range(n_components)]

    latent_semantic_analysis = pd.DataFrame(svd.fit_transform(X_tfidf),
                                            index = X.index,
                                            columns = component_names)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    vocabulary_expression = pd.DataFrame(svd.components_,
                                         index = component_names,
                                         columns = feature_names)
    return latent_semantic_analysis, vocabulary_expression, svd

In [102]:
# Fit a five-component LSA and keep the per-document scores, the per-term
# loadings and the fitted SVD object for the inspection cells below.
(latent_semantic_analysis,
 vocabulary_expression,
 svd) = perform_latent_semantic_analysis(n_components=5, vectorizer=tfidf)

In [103]:
# Vocabulary terms in column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# use get_feature_names_out() when it exists; list() keeps the result a plain
# list as before.
tfidf_word_index = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)

In [109]:
# Pair every vocabulary term with its loading on the first SVD component, rank
# the pairs by absolute loading (sign kept), and show the 20 strongest terms.
comp_1 = sorted(
    zip(tfidf_word_index, svd.components_[0]),
    key=lambda term_weight: abs(term_weight[1]),
    reverse=True,
)
comp_1[:20]

[('trump', 0.65527752852095789),
 ('president', 0.26503754892247217),
 ('donald', 0.20414177783501389),
 ('donald trump', 0.19947939446665114),
 ('obama', 0.14929761261486607),
 ('president obama', 0.12884986134715995),
 ('market', 0.1248465384794098),
 ('stock', 0.10654107896268575),
 ('stocks', 0.10629463186021025),
 ('people', 0.088451756040355281),
 ('president trump', 0.085389752646524278),
 ('america', 0.0845361947189429),
 ('great', 0.08054329886014687),
 ('hillary', 0.07271545191667475),
 ('comey', 0.072468408835816123),
 ('cnnpolitics', 0.067087617105536471),
 ('watch', 0.066896817482028206),
 ('tax', 0.063947689708080874),
 ('good', 0.06286356921105099),
 ('china', 0.06144462112762529)]

In [105]:
# Pair every vocabulary term with its loading on the second SVD component, rank
# the pairs by absolute loading (sign kept), and show the 20 strongest terms.
comp_2 = sorted(
    zip(tfidf_word_index, svd.components_[1]),
    key=lambda term_weight: abs(term_weight[1]),
    reverse=True,
)
comp_2[:20]

[('trump', -0.40476957173121331),
 ('market', 0.40146207205688056),
 ('stock', 0.37513016977207486),
 ('stocks', 0.23183874389248504),
 ('stock market', 0.17859000268665384),
 ('donald', -0.16906782209491469),
 ('donald trump', -0.16636769808413204),
 ('apple', 0.13594605201311727),
 ('freddiethekat', 0.12573489220933382),
 ('buy', 0.095676656265232279),
 ('investors', 0.088797629291874722),
 ('oil', 0.083345062966595149),
 ('best', 0.082360104133754261),
 ('president', -0.081543337366573085),
 ('earnings', 0.07995316597515735),
 ('aapl', 0.07958483767585095),
 ('big', 0.079448741492871858),
 ('good', 0.071125678435279255),
 ('reutersmoney', 0.069122609211972427),
 ('people', 0.068762749353918934)]

In [106]:
# Pair every vocabulary term with its loading on the third SVD component, rank
# the pairs by absolute loading (sign kept), and show the 20 strongest terms.
comp_3 = sorted(
    zip(tfidf_word_index, svd.components_[2]),
    key=lambda term_weight: abs(term_weight[1]),
    reverse=True,
)
comp_3[:20]

[('president', 0.53702768864876316),
 ('obama', 0.47133569548776894),
 ('president obama', 0.44968137435260053),
 ('trump', -0.33285824611683201),
 ('donald', -0.13826879113005544),
 ('donald trump', -0.13570917512346786),
 ('market', -0.11428141366205727),
 ('stock', -0.11013115251687997),
 ('live', 0.09188217542387063),
 ('watch', 0.075672593524523421),
 ('people', 0.074577224677821394),
 ('live president', 0.071394738492822407),
 ('stocks', -0.056913009782821336),
 ('stock market', -0.056419917367247642),
 ('america', 0.05535721038809395),
 ('sotu', 0.05024550365684622),
 ('speaking', 0.0478772574458508),
 ('change', 0.044129283500909063),
 ('obama speaking', 0.043970813529582244),
 ('watch president', 0.043476710369392907)]

In [107]:
# Pair every vocabulary term with its loading on the fourth SVD component, rank
# the pairs by absolute loading (sign kept), and show the 20 strongest terms.
comp_4 = sorted(
    zip(tfidf_word_index, svd.components_[3]),
    key=lambda term_weight: abs(term_weight[1]),
    reverse=True,
)
comp_4[:20]

[('market', 0.51257894458174946),
 ('stock', 0.42598919008672548),
 ('stocks', -0.31355718450200992),
 ('stock market', 0.26983719583420751),
 ('freddiethekat', -0.23189141984496631),
 ('apple', -0.1534450699316616),
 ('reutersmoney', -0.12798758465837462),
 ('president', 0.1238787039922416),
 ('people', -0.11150355310687396),
 ('street', -0.099056103046263902),
 ('wall', -0.097658239068006775),
 ('reutersbiz', -0.097188817486524234),
 ('wall street', -0.092838177437844852),
 ('obama', 0.087495070824675711),
 ('watch', -0.084883593220301173),
 ('president obama', 0.084223836011930384),
 ('trump', 0.081081917581068697),
 ('world', -0.080336553953993584),
 ('aapl', -0.077702823386958941),
 ('great', -0.070813698802350389)]

In [108]:
# Pair every vocabulary term with its loading on the fifth SVD component, rank
# the pairs by absolute loading (sign kept), and show the 20 strongest terms.
comp_5 = sorted(
    zip(tfidf_word_index, svd.components_[4]),
    key=lambda term_weight: abs(term_weight[1]),
    reverse=True,
)
comp_5[:20]

[('stocks', 0.55251254968930907),
 ('great', -0.32336290439986376),
 ('people', -0.31346228987832986),
 ('freddiethekat', 0.21928496569413311),
 ('good', -0.17753500631479649),
 ('president', 0.1467851538215281),
 ('reutersmoney', 0.13626052812424586),
 ('world', -0.11995990475819079),
 ('thank', -0.11870743477319928),
 ('work', -0.1121384315016606),
 ('wall', 0.10974775598096527),
 ('street', 0.10836611528442459),
 ('obama', 0.10670318119441033),
 ('thanks', -0.10661697087432681),
 ('wall street', 0.10506372125196427),
 ('president obama', 0.1026877615376042),
 ('america', -0.095800139677664417),
 ('hillary', -0.094377253880984585),
 ('want', -0.093793907605281959),
 ('reutersbiz', 0.093536406912372638)]