In [1]:
import os
os.chdir("C:\\PyBox\\")

from datetime import datetime, date
from collections import Counter

import pybox as pb
from pybox.GLOBALS import *

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import numpy as np
import plotly.express as px

In [2]:
data_path = "C:\\PyBox\\boxes\\scrapers\\data\\2020Q3"

data = pb.flow.table_from_parquet("CleanedReuters(2020_06_29_02_01,2020_10_05_01_56)", data_path)

In [3]:
labels_dict = Counter(data["Label"])
labels = pb.DataTable({"Label": list(labels_dict.keys()), "Count": list(labels_dict.values())})
labels.sort(["Count"], reverse_order=True)
labels.display(20, "head")

Index,Count DataType:int,Label DataType:str
0,7187,HEALTHCARE
1,5709,APAC
2,5575,HEALTHCARE & PHARMA
3,5511,EMERGING MARKETS
4,4296,BUSINESS NEWS
5,2800,SPORTS NEWS
6,2764,FINANCIALS
7,1879,U.S. MARKETS
8,1737,ENVIRONMENT
9,1732,BANKS


In [4]:
def words_removal(row):
    to_remove = ["said", "july", "august", "september", "reported", "week", "data"]
    new_row = row.lower()
    for word in to_remove:
        new_row = new_row.replace(word, "")
    return new_row

def filter_function(row):
    if (
        (datetime(2020,7,1) <= row["LastModificationDate"] < datetime(2020,10,1)) 
        and row["Label"] in ["BUSINESS NEWS", "FINANCIALS", "BANKS", "BONDS NEWS"]
    ):
        return True
    else:
        return False
        
data.filter(filter_function)

data.apply("Body", words_removal)

In [5]:
data.display(5)

Index,LastModificationDate DataType:datetime,Label DataType:str,Headline DataType:str,Body DataType:str
0,2020-07-01 00:25:00,FINANCIALS,Dun & Bradstreet raises $1.7 billion in upsized IPO -source,new york june (reuters) - us business analytics firm dun & bradstreet raised $17 billion in its initial public offering (ipo) after it sold more stock than expected above its indicated price range a person familiar with the matter on tuesday the company priced million shares at $22 per share the source it was previously seeking to sell million shares for between $19 and $21 per share the source requested anonymity ahead of an official announcement dun & bradstreet did not immediately respond to a request for comment (reporting by echo wang in new york editing by chris reese)
1,2020-07-01 00:33:35,BANKS,Bid for Cirque du Soleil dismissed as 'pure fiction' by lenders,montreal (reuters) - a stalking horse bid for cirque du soleil entertainment group was dismissed as inadequate by lenders during a quebec court hearing into the company’s restructuring on tuesday canada’s once high-flying cirque received initial protection from its creditors after the covid-19 pandemic forced the famed circus operator to cancel shows and lay off artists montreal-based cirque which grew from a troupe of street-performers in the 1980s to a company with global reach has slashed about 95% of its workforce and suspended shows due to the pandemic the company filed for bankruptcy protection on monday the company has signed an agreement with its existing investors private equity fund tpg capital china's fosun international ltd 0656hk and canadian pension fund caisse de depot et placement du québec under which the consortium will take over cirque's liabilities and invest $300 million to support a restart as part of the investment government body investissement québec will provide $200 million in debt financing the agreement will serve as the “stalking horse” bid in a sale and investment solicitation process subject to court approval joe pasquariello with the goodmans law firm who acted for first lien and second lien lenders the bid is “not what we’re striving for” in a stalking horse “that agreement with all due respect is pure fiction” cirque has received six non-binding offers with a deal eyed by the court heard cirque chief executive daniel lamarre in an interview “we’re open to any solution that someone can bring to the table that will ensure the future of cirque du soleil” quebec superior court judge louis gouin agreed to give the company protection from its creditors for days cirque will seek its immediate provisional recognition in the united states under chapter in the united states bankruptcy court cirque generated about $1 billion in revenues and $157 million in profits last year before the pandemic he the company had almost $15 billion in liabilities as of december
2,2020-07-01 00:36:11,BANKS,Bid for Cirque du Soleil dismissed as 'pure fiction' by lenders,montreal (reuters) - a stalking horse bid for cirque du soleil entertainment group was dismissed as inadequate by lenders during a quebec court hearing into the company’s restructuring on tuesday canada’s once high-flying cirque received initial protection from its creditors after the covid-19 pandemic forced the famed circus operator to cancel shows and lay off artists montreal-based cirque which grew from a troupe of street-performers in the 1980s to a company with global reach has slashed about 95% of its workforce and suspended shows due to the pandemic the company filed for bankruptcy protection on monday the company has signed an agreement with its existing investors private equity fund tpg capital china's fosun international ltd 0656hk and canadian pension fund caisse de depot et placement du québec under which the consortium will take over cirque's liabilities and invest $300 million to support a restart as part of the investment government body investissement québec will provide $200 million in debt financing the agreement will serve as the “stalking horse” bid in a sale and investment solicitation process subject to court approval joe pasquariello with the goodmans law firm who acted for first lien and second lien lenders the bid is “not what we’re striving for” in a stalking horse “that agreement with all due respect is pure fiction” cirque has received six non-binding offers with a deal eyed by the court heard cirque chief executive daniel lamarre in an interview “we’re open to any solution that someone can bring to the table that will ensure the future of cirque du soleil” quebec superior court judge louis gouin agreed to give the company protection from its creditors for days cirque will seek its immediate provisional recognition in the united states under chapter in the united states bankruptcy court cirque generated about $1 billion in revenues and $157 million in profits last year before the pandemic he the company had almost $15 billion in liabilities as of december
...,...,...,...,...
9579,2020-09-30 23:38:52,BANKS,Palantir valued at $20 billion in choppy stock exchange debut,(reuters) - palantir technologies inc pltrn the us analytics firm known for its work with the central intelligence agency and other government agencies was valued at $206 billion in a choppy new york stock exchange debut on wednesday the listing ended years of speculation about when the company co-founded by billionaire peter thiel in would go public and how much it would be worth palantir’s shares closed at $950 apiece below its $10 opening price though topping the weighted average price of $917 in the private markets in as well as a reference price of $725 set by the new york stock exchange on tuesday palantir decided in the middle of to go public with an original timetable to complete the listing in the second half of but the covid-19 pandemic accelerated its plans according to chief operating officer shyam sankar “broadly speaking covid has been a tailwind for our business” sankar in an interview “we started new engagements with customers in the first three s of covid without getting on a plane” sankar added the listing pegs palantir’s valuation at roughly the same level as in a private fundraising round there has been considerable debate about whether investors will view it as a lucrative software provider or a less-glamorous consulting business denver-based palantir went public at a time of strong investor demand for new stocks particularly technology companies that promise rapid growth the company led by ceo alex karp has seen strong demand for its services with revenue rising almost 50% to $4812 million in the first six months of from the comparable period a year earlier however palantir has yet to turn a profit in its years of existence posting a net loss of $1647 million in the same period compared with a loss of $2805 million a year earlier palantir opted to go public through a direct listing rather than a traditional initial public offering meaning it did not raise any money but allowed its investors to sell more shares only two major companies - workplace messaging platform slack technologies inc workn in and music-streaming service spotify technology sa spotn in - have taken the direct listing route workplace software maker asana inc asann also went public on wednesday through a direct listing and its shares opened up 29% “today further advanced the direct listing as an option for companies looking to access the public market” nyse vice chairman and chief commercial officer john tuttle nevertheless palantir closing below its opening price is a sign the first new investors will have lost money which would be a setback for the growth of direct listings according kathleen smith founding principal at renaissance capital a research firm and manager of ipo-focused exchange-traded funds “with direct listings the appropriate starting price would be the opening trade” smith “for the direct listing to gain traction as a new structure investors will have to make money” she added palantir analyzes large amounts of for us government defense and intelligence agencies global banks and energy companies as a public company palantir is expected to face intense scrutiny from investors and the media about its operations after years of being viewed as one of the most reclusive us tech companies morgan stanley credit suisse group ag and goldman sachs & co were the lead banks advising palantir on its listing
9580,2020-09-30 23:52:54,BANKS,Goldman Sachs to go ahead with 'modest' job cuts after coronavirus pause,"(reuters) - goldman sachs group inc gsn plans to move forward with ""a modest number of layoffs"" a company spokesperson on wednesday months after the wall street bank paused job cuts due to the covid-19 pandemic bloomberg news which first about the layoffs the bank was looking to cut about jobs or roughly 1% of its workforce citing people familiar with the matter “at the outbreak of the pandemic the firm announced that it would suspend any job reductions the firm has made a decision to move forward with a modest number of layoffs” a goldman sachs spokesperson many of the cuts in the current round are tied to back-office roles that had been folded into bigger money-making divisions as part of an earlier reorganization according to the bloomberg report goldman sachs’ annual cull has long set it apart from wall street rivals which tend to make mass layoffs periodically in january goldman it was aiming for a 60% efficiency ratio over the next three years compared with 68% in a lower efficiency ratio means a bank is better at managing costs relative to revenue separately the us federal reserve will curb big bank capital distributions through the end of the year meaning the likes of jpmorgan jpmn citi cn wells fargo wfcn and bank of america bacn will be barred from share buybacks and will have to cap dividends into the new year shares of big banks fell between 05% and 1% in extended trade following the news"


In [6]:
daily_frequency_data = pb.DataTable({"Day": sorted(set([d.date() for d in data["LastModificationDate"]]))})
for row in daily_frequency_data:
    row["SubTable"] = pb.DataTable(names=["LastModificationDate", "Body"], dtypes=[datetime, str])

In [7]:
for row in data:
    for sub_row in daily_frequency_data:
        if row["LastModificationDate"].date() == sub_row["Day"]:
            sub_row["SubTable"].insert_row([row["LastModificationDate"], row["Body"]])

In [8]:
tf_vectorizer = CountVectorizer(max_df=0.9, stop_words="english")
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, stop_words="english")

In [9]:
for row in daily_frequency_data:
    tf_fitted = tf_vectorizer.fit_transform(row["SubTable"]["Body"])
    row["FittedTf"] = pb.DataTable({
        "Word": tf_vectorizer.get_feature_names(),
        "Value": tf_fitted.toarray().sum(axis=0)})
    row["FittedTf"].sort(["Value"], reverse_order=True)

    tfidf_fitted = tfidf_vectorizer.fit_transform(row["SubTable"]["Body"])
    row["FittedTfiDf"] = pb.DataTable({
        "Word": tfidf_vectorizer.get_feature_names(),
        "Value": tfidf_fitted.toarray().sum(axis=0)})
    row["FittedTfiDf"].sort(["Value"], reverse_order=True)

In [10]:
tf_fitted = tf_vectorizer.fit_transform(data["Body"])
fitted_tf = pb.DataTable({
    "Word": tf_vectorizer.get_feature_names(),
    "Value": tf_fitted.toarray().sum(axis=0)})
fitted_tf.sort(["Value"], reverse_order=True)

tfidf_fitted = tfidf_vectorizer.fit_transform(data["Body"])
fitted_tfidf = pb.DataTable({
    "Word": tfidf_vectorizer.get_feature_names(),
    "Value": tfidf_fitted.toarray().sum(axis=0)})
fitted_tfidf.sort(["Value"], reverse_order=True)

In [11]:
daily_frequency_data.display()

Index,Day DataType:date,SubTable DataType:DataTable,FittedTf DataType:DataTable,FittedTfiDf DataType:DataTable
0,2020-07-01,"DataTable(shape=2x255,bytesize=932575) LastModificationDate: datetime Body: str","DataTable(shape=2x6453,bytesize=1091155) Value: int64 Word: str","DataTable(shape=2x6453,bytesize=1091155) Value: float64 Word: str"
1,2020-07-02,"DataTable(shape=2x191,bytesize=749184) LastModificationDate: datetime Body: str","DataTable(shape=2x6039,bytesize=1020466) Value: int64 Word: str","DataTable(shape=2x6039,bytesize=1020466) Value: float64 Word: str"
2,2020-07-03,"DataTable(shape=2x140,bytesize=488524) LastModificationDate: datetime Body: str","DataTable(shape=2x4427,bytesize=748210) Value: int64 Word: str","DataTable(shape=2x4427,bytesize=748210) Value: float64 Word: str"
3,2020-07-04,"DataTable(shape=2x16,bytesize=54950) LastModificationDate: datetime Body: str","DataTable(shape=2x847,bytesize=144312) Value: int64 Word: str","DataTable(shape=2x847,bytesize=144312) Value: float64 Word: str"
4,2020-07-05,"DataTable(shape=2x23,bytesize=66009) LastModificationDate: datetime Body: str","DataTable(shape=2x1070,bytesize=181716) Value: int64 Word: str","DataTable(shape=2x1070,bytesize=181716) Value: float64 Word: str"
...,...,...,...,...
87,2020-09-26,"DataTable(shape=2x19,bytesize=94450) LastModificationDate: datetime Body: str","DataTable(shape=2x1385,bytesize=235023) Value: int64 Word: str","DataTable(shape=2x1385,bytesize=235023) Value: float64 Word: str"
88,2020-09-27,"DataTable(shape=2x17,bytesize=70087) LastModificationDate: datetime Body: str","DataTable(shape=2x1015,bytesize=172582) Value: int64 Word: str","DataTable(shape=2x1015,bytesize=172582) Value: float64 Word: str"
89,2020-09-28,"DataTable(shape=2x97,bytesize=342257) LastModificationDate: datetime Body: str","DataTable(shape=2x3703,bytesize=625997) Value: int64 Word: str","DataTable(shape=2x3703,bytesize=625997) Value: float64 Word: str"
90,2020-09-29,"DataTable(shape=2x118,bytesize=391248) LastModificationDate: datetime Body: str","DataTable(shape=2x4031,bytesize=681356) Value: int64 Word: str","DataTable(shape=2x4031,bytesize=681356) Value: float64 Word: str"


In [12]:
daily_frequency_data["FittedTfiDf",0].display(10,"head")

Index,Value DataType:float64,Word DataType:str
0,7.267630645785151,bank
1,7.097822840546106,billion
2,6.565059553202424,year
3,6.046381193068194,wednesday
4,5.7117161372423535,million
5,5.499896554578329,june
6,5.101671216951377,rose
7,5.060089719227646,coronavirus
8,4.942697381777197,new
9,4.684746212845944,pandemic


In [13]:
weekly_time_series = pb.DataTable({"Date": daily_frequency_data["Day"]})


for i, row in enumerate(daily_frequency_data):
    for word in fitted_tfidf["Word"][:20]:
        for sub_row in row["FittedTfiDf"]:
            if word == sub_row["Word"]:
                weekly_time_series[word, i] = sub_row["Value"]
                continue

weekly_time_series.display()

Index,Date DataType:date,year DataType:float64,bank DataType:float64,billion DataType:float64,new DataType:float64,index DataType:float64,million DataType:float64,china DataType:float64,market DataType:float64,coronavirus DataType:float64,dollar DataType:float64,economy DataType:float64,pandemic DataType:float64,government DataType:float64,company DataType:float64,rose DataType:float64,economic DataType:float64,shares DataType:float64,quarter DataType:float64,month DataType:float64,fell DataType:float64
0,2020-07-01,6.565059553202424,7.267630645785151,7.097822840546106,4.942697381777197,4.492038369684406,5.7117161372423535,4.286784425693934,4.369482871750026,5.060089719227646,3.3650054238304192,4.613444569914605,4.684746212845944,4.065370190700369,4.049407654716334,5.101671216951377,4.087995904707773,2.6281189054459504,3.200789458898972,3.7500656654573445,2.834424634893013
1,2020-07-02,4.564303246480108,5.885888424962554,4.340488690302834,4.634867865025779,3.50604215701332,3.720956116115145,3.516503763967199,3.318408687955999,4.260738830899942,3.7673696503010947,3.0567782775332857,2.971233608534222,3.095450437953187,3.1326772174689204,3.830847124055496,2.7116558759456693,2.2058853585170652,1.3566748971140161,3.094365902229602,1.682419288580681
2,2020-07-03,4.677165193070012,5.008494081206841,3.4124343628597678,3.525539429354398,3.1982480529559583,2.355319110359834,3.604477687310158,2.774070990947968,3.004245241082908,1.2318211050122057,1.9553788349317776,1.4869194223963023,3.8134373155005665,1.5556993505088519,2.8635296300682755,2.4160242536438234,2.3794357314824968,1.504177180494695,2.4665628515046634,1.7596371107738058
3,2020-07-04,0.27118040569965535,0.5813813680998333,0.6068491786062358,,0.21234133389723864,0.33365553465297243,0.4441446118747987,0.23162940907538784,0.3208922777141603,,0.16682776732648621,0.16959301582255656,0.5041718021878321,0.6845499416796376,,0.145124190764369,0.21234133389723864,0.10714337108653119,0.10202578193676003,0.19251643620393902
4,2020-07-05,0.4489996131292052,0.4123386733217932,0.8219745651499654,0.576934723948024,0.3096312500267081,0.4228348688626685,0.1445907312750566,0.0767485721589652,0.8073430644125579,,1.7752720529078638,0.7520153506325175,1.005730484947789,0.45169597216388746,0.0767485721589652,0.11754935406289423,0.3885608765719884,,0.2618037905851479,0.12015418155514554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,2020-09-26,0.6434835606365574,1.3069731364397779,0.10486518494083645,0.7559231096442357,0.04612602576653888,0.3548449613110113,0.44159003689673715,0.13837807729961663,0.2676799324814557,0.04612602576653888,0.29270068738727484,0.29139438023763053,0.560975154068255,0.310536153488986,0.10576749651144257,0.2094097556653296,0.1393487193794039,0.2393532028214906,0.245955177542256,
88,2020-09-27,0.4486161896660076,0.9061347941209077,0.5189884604781325,0.35571613250114864,0.36682184279516283,0.07058033110272631,0.21112533265493155,0.4623527427453009,0.36233959435719504,,0.1736028278655534,0.314092850047354,0.927290334160603,0.33357708442804096,0.2728456613242443,0.6944113114622136,0.3679934611847082,,0.04291048472639459,0.11980105466145342
89,2020-09-28,2.104195028429481,3.1069902592164444,1.9168717124234282,2.1205696574186246,2.841713999395929,2.179326903116233,3.2200666552420385,1.834265033163343,1.3309533137923255,2.100327991014319,1.1961427316126583,1.2596655243125168,1.6923151683929294,1.567374940025984,1.3420567598437954,1.5036802006575272,1.857006131313865,0.4245319913193524,1.7510095567404005,1.3296130324494118
90,2020-09-29,3.662876759002441,2.4126428877773436,3.580865756660267,2.616161645584936,2.1580143637282054,2.628729678913778,2.238407994826687,2.7075243961750557,2.261754029159889,1.5264805271375401,1.9874031451666607,2.0261790269661435,1.8136673365664608,1.659654355023993,0.9820214791539066,1.5611978527947763,2.054465752887439,0.9187268865978381,1.2567112851538684,1.4389359963103594


In [14]:
normalized_daily_tfidf = dict()
daily_tfidf_words = [col for col in weekly_time_series.columns if col != "Date"]

for row in weekly_time_series:
    tfidf_mean = np.mean([val for val in row.content.values() if val is not None and not isinstance(val, date)])
    tfidf_std = np.std([val for val in row.content.values() if val is not None and not isinstance(val, date)])

    for column in weekly_time_series.columns:
        if column == "Date":
            normalized_daily_tfidf[column] = weekly_time_series[column]
        else:
            normalized_daily_tfidf[column] = [
                (tfidf - tfidf_mean) / tfidf_std if tfidf is not None
                else 0 for tfidf in weekly_time_series[column]
            ]


In [15]:
pb.DataTable(normalized_daily_tfidf).display()

Index,Date DataType:date,year DataType:float64,bank DataType:float64,billion DataType:float64,new DataType:float64,index DataType:float64,million DataType:float64,china DataType:float64,market DataType:float64,coronavirus DataType:float64,dollar DataType:float64,economy DataType:float64,pandemic DataType:float64,government DataType:float64,company DataType:float64,rose DataType:float64,economic DataType:float64,shares DataType:float64,quarter DataType:float64,month DataType:float64,fell DataType:float64
0,2020-07-01,4.1804705150330514,5.087001492251145,4.867897636075216,2.0871285312955106,1.5056409652147673,3.079397258618336,1.240800779804434,1.3475069963127924,2.238600448529766,0.0514247463177493,1.6622919895693715,1.7542928542878293,0.9551088904885089,0.9345123508681682,2.292253261979741,0.984302961872785,-0.8993835955885937,-0.160463930480589,0.5482698924035014,-0.6331862861692406
1,2020-07-02,1.5988846953821017,3.304132628260252,1.3100956596972508,1.6899345738678104,0.23340514503574486,0.5107096960188843,0.24690380855221283,-0.008699253877202603,1.2071940192216333,0.5705973101634697,-0.34628227449165494,-0.4566609866280189,-0.29638339344359915,-0.24834949456936323,0.6525026104546817,-0.7915954268175706,-1.4441936425044268,-2.5399341270859015,-0.2977827752851084,-2.119624517656849
2,2020-07-03,1.744511026742774,2.1720263400174895,0.11262254238509879,0.25856257256876497,-0.1637431192165771,-1.2513785288204833,0.3604169996580815,-0.7110608933111963,-0.41406591286272426,-2.7010335963899443,-1.7674234565727278,-2.371878967931236,0.63003864783353,-2.283131884393184,-0.5956319660488867,-1.173050380332194,-1.2202607325656483,-2.3496111967708573,-1.1078401759988514,-2.0199899773940873
3,2020-07-04,-3.940553071496638,-3.5402992259464607,-3.507437983305374,0,-4.016473418665448,-3.859941101690596,-3.717376495484522,-3.991585919272858,-3.8764095956371563,0,-4.07519980011731,-4.071631786221124,-3.6399231130886003,-3.4071803020213576,0,-4.103204032966767,-4.016473418665448,-4.152210873614768,-4.158814124363228,-4.04205358282092
4,2020-07-05,-3.7111120632042627,-3.758415856286859,-3.2298606264417424,-3.5460367530249837,-3.8909397557046,-3.744872562944454,-4.103892358400674,-4.191429433935636,-3.2487397246592478,0,-1.9998161337486955,-3.3201293490969443,-2.992759448635575,-3.7076329377229458,-4.191429433935636,-4.138783981993312,-3.789096465767181,0,-3.9526517646431274,-4.135422960116714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,2020-09-26,-3.460168457854727,-2.6040645562327165,-4.15515042853632,-3.3150871481965303,-4.230941857943836,-3.8326002790211215,-3.7206726762919704,-4.11190857674346,-3.9450697495107514,-4.230941857943836,-3.912785344935942,-3.914470879552177,-3.5666294654662285,-3.8897721542638246,-4.153986171426268,-4.0202560486475525,-4.110656152437353,-3.9816198697635534,-3.9731013089255556,0
88,2020-09-27,-3.711606796406995,-3.121268264163554,-3.620805105159131,-3.831476202498963,-3.8171464492372342,-4.19938834606775,-4.018042431120023,-3.693882453606607,-3.822929916703378,0,-4.066457905786567,-3.885182930994087,-3.09397116556637,-3.860042326406245,-3.938404383876817,-3.394456127514199,-3.8156347041984553,0,-4.235090886550593,-4.135878601505038
89,2020-09-28,-1.5754051814948011,-0.28149350463783246,-1.81710938838217,-1.554276916219121,-0.6237807828744275,-1.4784621496805666,-0.13559046810304692,-1.9236971977369492,-2.573122814053496,-1.580394839116521,-2.747069579146572,-2.6651058035944994,-2.10685581468666,-2.2680668131337702,-2.5587959823551905,-2.3502524523280552,-1.89435424561858,-3.7426827573925907,-2.0311221532998074,-2.574852185740384
90,2020-09-29,0.4357696124950541,-1.1774133735376267,0.32995040803193815,-0.9148121074944968,-1.5059618253182356,-0.898595511543443,-1.4022295230199657,-0.7969262957265701,-1.3721060187345218,-2.320833077601385,-1.7261023297025022,-1.6760696164421864,-1.9502745081636728,-2.1489982247184147,-3.0233512968031215,-2.2760371395294703,-1.639571113212475,-3.1050206245739864,-2.668917673083152,-2.4337922214072036


In [16]:
fig = px.line(normalized_daily_tfidf, x="Date", y=daily_tfidf_words,
              hover_data={"Date": "|%B %d, %Y"},
              title="TFiDF in 2020 Q3", width=1400, height=600)
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")
fig.show()

In [17]:
for row in daily_frequency_data:
    tf_fitted = tf_vectorizer.fit_transform(row["SubTable"]["Body"])
    lda = LatentDirichletAllocation(
        n_components=1, 
        evaluate_every=5, 
        n_jobs=-1, 
        learning_offset=50., 
        random_state=0, 
        max_iter=20
    ).fit(tf_fitted)

    normalized_components = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]

    row["FittedLda"] = pb.DataTable({
        "Word": tf_vectorizer.get_feature_names(),
        "Value": normalized_components.tolist()[0]})
    row["FittedLda"].sort(["Value"], reverse_order=True)

In [18]:
tf_fitted = tf_vectorizer.fit_transform(data["Body"])
lda = LatentDirichletAllocation(
    n_components=1, 
    evaluate_every=5, 
    n_jobs=-1, 
    learning_offset=50., 
    random_state=0, 
    max_iter=20
).fit(tf_fitted)
normalized_components = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]

fitted_lda = pb.DataTable({
    "Word": tf_vectorizer.get_feature_names(),
    "Value": normalized_components.tolist()[0]})
fitted_lda.sort(["Value"], reverse_order=True)

In [19]:
lda_time_series = pb.DataTable({"Date": daily_frequency_data["Day"]})

for i, row in enumerate(daily_frequency_data):
    for word in fitted_lda["Word"][:20]:
        for sub_row in row["FittedLda"]:
            if word == sub_row["Word"]:
                lda_time_series[word, i] = sub_row["Value"]
                continue

lda_time_series.display()

Index,Date DataType:date,year DataType:float,new DataType:float,bank DataType:float,billion DataType:float,market DataType:float,coronavirus DataType:float,million DataType:float,pandemic DataType:float,government DataType:float,china DataType:float,index DataType:float,economy DataType:float,economic DataType:float,month DataType:float,dollar DataType:float,company DataType:float,markets DataType:float,investors DataType:float,rose DataType:float,covid DataType:float
0,2020-07-01,0.005937339496668249,0.003862352336932898,0.0047252182845456195,0.004622496147925058,0.0034925526450988783,0.004211607601442812,0.003923985618905237,0.0036569080636917766,0.0027529532614308347,0.002465331278893261,0.0025885978428379346,0.0031227529532648563,0.003122752953264856,0.0026091422701620473,0.0018900873138181155,0.0024858757062173734,0.0023215202876244747,0.0016024653312805431,0.0028145865434031714,0.0024242424242450363
1,2020-07-02,0.0048446498751811345,0.00501767308500904,0.00425142744148546,0.0032874409867299855,0.0033615937909419434,0.004449168252717351,0.0031144177769020795,0.0028178065600542425,0.0024717601403984302,0.0024223249375904575,0.0023234545319745113,0.002644783350226336,0.002496477741802417,0.002496477741802417,0.002768371357246268,0.00247176014039843,0.0022245841263585655,0.002372889734782484,0.0030155473712861333,0.0031144177769020795
2,2020-07-03,0.006410977392874038,0.004423949312041665,0.005548682187984518,0.003449180819557861,0.0035991452030169073,0.004461440407906428,0.0025493945188035786,0.0015371349304550124,0.003899073969935001,0.004161511640988334,0.0032617253402340515,0.0020245191766969143,0.0026993589022626264,0.002961796573315958,0.0011997150676721577,0.0012746972594016807,0.0027368499981273876,0.0023619390394797705,0.0029243054774511956,0.0026618678063978643
3,2020-07-04,0.0022742040285901274,,0.004223521767381832,0.004873294346979069,0.0019493177387915116,0.0019493177387915116,0.0016244314489928975,0.0009746588693956802,0.00454840805718045,0.002923976608187361,0.0016244314489928975,0.0009746588693956802,0.0009746588693956802,0.0009746588693956802,,0.006822612085770781,,0.0009746588693956802,,0.0022742040285901274
4,2020-07-05,0.0030651340996172083,0.003320561941251993,0.0025542784163476405,0.003831417624521562,0.0005108556832694247,0.005363984674330269,0.0030651340996172083,0.004086845466156347,0.00689655172413898,0.0010217113665389505,0.0015325670498085088,0.009195402298852042,0.0007662835249041787,0.0015325670498085088,,0.0025542784163476405,0.0010217113665389507,,0.0005108556832694247,0.0012771392081737283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,2020-09-26,0.005017022039062095,0.004479483963448277,0.005375380756137975,0.0007167174341515694,0.0007167174341515694,0.0012542555097653735,0.002508511019530943,0.0014334348683033111,0.003762766529296518,0.002329331660993004,0.00035835871707573405,0.0014334348683033111,0.0010750761512274372,0.0012542555097653735,0.00035835871707573405,0.0012542555097653737,0.0005375380756136428,0.002329331660993004,0.0005375380756136428,0.0016126142268412492
88,2020-09-27,0.002610346464167325,0.0018984337921216375,0.004508780256289165,0.0021357380161368665,0.0030849549121977848,0.0030849549121977843,0.00047460844803031576,0.0016611295681064095,0.005220692928334856,0.001423825344091182,0.0016611295681064095,0.0009492168960607327,0.0030849549121977848,0.00047460844803031576,,0.0018984337921216377,0.0016611295681064095,0.004034171808258704,0.0009492168960607327,0.0016611295681064095
89,2020-09-28,0.003953786906292476,0.004159178433892097,0.004724005134791052,0.0029268292682943734,0.0031835686777938986,0.0020025673940960815,0.0033889602053935194,0.0017458279845965563,0.002310654685495512,0.004775353016690959,0.0039024390243925706,0.0017458279845965563,0.002516046213095132,0.0029268292682943734,0.002875481386394468,0.0020025673940960815,0.002310654685495512,0.003234916559693804,0.0016944801026966508,0.0025673940949950376
90,2020-09-29,0.007632577428753165,0.004323656578137536,0.003176564016590784,0.005911938586433037,0.00419129974411291,0.0035736345186646603,0.0038824671313887863,0.003308920850615409,0.0029118503485415343,0.001852995676344533,0.002338304067768159,0.0027794935145169083,0.0016324009529701587,0.0020294714550440334,0.001985352510369158,0.0016765198976450332,0.0022941851230932836,0.0030000882378912845,0.0009706167828470347,0.0020735903997189087


In [21]:
lda_daily = dict()
daily_lda_words = list()
for col in lda_time_series.columns:
    lda_daily[col] = lda_time_series[col]
    if col != "Date":
        daily_lda_words.append(col)

fig = px.line(lda_daily, x="Date", y=daily_lda_words,
              hover_data={"Date": "|%B %d, %Y"},
              title="LDA in 2020 Q3", width=1400, height=600)
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")
fig.show()