# Build Embeddings. 

In [1]:
# Python module. 
import re, os 
import pandas as pd 
from gensim.models import FastText 

# Move up two levels to the project root directory unless the kernel is already there. 
# The check compares the last path component instead of matching a "/"-separated 
# regex, so it also holds when the OS uses another path separator (e.g. Windows) 
# or when the project folder sits directly under the filesystem root. 
if os.path.basename(os.getcwd()) != "MADS-M2-estimating-news-impact-on-financial-market": 
	os.chdir("../..") 

# For clearing safe warnings. Not important. 
from IPython.display import clear_output

# Custom config. 
from source.config_py.config import DIR_DATASET, DIR_MLTOPIC, PARAM_SEED 

# Custom modules. 
from source.modules.manage_files import ManageFiles 
from source.modules.processor_topic import get_token_variation, replace_references 

# Preview the working directory so the reader can confirm it is the project root. 
# NOTE(review): the printed value is an absolute, machine-specific path; consider 
# clearing this cell's output before sharing the notebook. 
print(os.getcwd()) 

/Users/lioneltay/Dropbox/Courses/michigan_mads/SIADS_694_695_milestone_2_Eric_Gilbert/submission/MADS-M2-estimating-news-impact-on-financial-market


## Configurations (general). 

In [2]:
# Pandas display limits: at most 50 rows and 50 columns, cell text cut at 200 characters. 
pd.set_option("display.max_rows", 50) 
pd.set_option("display.max_columns", 50) 
pd.set_option("display.max_colwidth", 200) 

# Helper object used by the cells below to read/write the project's data files. 
manage_files = ManageFiles() 

# Wipe anything the statements above printed. Not important. 
clear_output()

## Load & consolidate dataset. 

In [3]:
# Columns kept for the rest of the notebook. 
usecols = [
	"date", "category", "section", "headline", 
	"second_headline", "description", "article_text", "entities", 
] 

# Load the dataset, then keep only the selected columns. 
df_business = manage_files.read_from_parquet(filename="cnn_news_business.parquet") 
df_business = df_business[usecols] 

# Preview. 
df_business 

Read from (cnn_news_business.parquet)


Unnamed: 0_level_0,date,category,section,headline,second_headline,description,article_text,entities
headline_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,2022-03-18,business,success,"Two years later, remote work has changed millions of careers - CNN","Two years later, remote work has changed millions of careers",Here's a look at how the pandemic reshaped people's careers in ways they never expected.,"The pandemic thrust the working world into a new reality in March 2020 as offices closed and millions of people were forced to learn how to do their jobs from home.Two years later, employers and w...","{'characters': [[[ 0 11], [ 4 11], [922 933], [926 933], [1242 1253], [1246 1253], [1644 1655], [1648 1655], [1739 1750], [1743 1750], [2802 2813], [2806 2813], [2929 2940], [2933 2940], [6377 638..."
4,2022-03-19,business,investing,Why March is so volatile for stocks - CNN,Why March is so volatile for stocks,"March Madness isn't just for college basketball fans. That phrase is also an apt description of the volatility on Wall Street, and this March is no different.","New York (CNN Business)March Madness isn't just for college basketball fans. That phrase is also an apt description of the volatility on Wall Street, and this March is no different.Just look at wh...","{'characters': [[[0 7]], [[10 12]], [[10 21]], [[14 21]], [[52 58], [52 69]], [[60 69]], [[137 147]], [[199 204], [968 973], [1138 1143], [1379 1384], [1707 1712], [1837 1841], [2038 2043], [2527 ..."
5,2022-03-20,business,investing,Stocks week ahead: Big Oil rakes in billions as prices soar. Lawmakers want them to pay us back - CNN,Big Oil rakes in billions as prices soar. Lawmakers want them to pay us back,"As crude prices surge, oil companies are raking in money -- enormous profits gained from practically no extra investment.","A version of this story first appeared in CNN Business' Before the Bell newsletter. Not a subscriber? You can sign up right here. New York (CNN Business)As crude prices surge, oil companies are ra...","{'characters': [[[42 44], [140 142], [6077 6079], [6077 6080], [6386 6390]], [[42 53], [140 151]], [[46 53], [144 151]], [[118 122]], [[130 137]], [[156 160], [176 178], [361 363], [658 660], [143..."
6,2022-03-18,business,energy,"Oil 'emergency': Work from home and drive slower, IEA says - CNN","Oil 'emergency': Work from home and drive slower, IEA says","Governments around the world must consider drastic steps to slash oil demand in the face of an emerging global energy crisis caused by Russia's invasion of Ukraine, the International Energy Agency...","New York (CNN Business)Governments around the world must consider drastic steps to slash oil demand in the face of an emerging global energy crisis caused by Russia's invasion of Ukraine, the Inte...","{'characters': [[[0 7]], [[10 12]], [[10 21]], [[14 21], [790 797], [2255 2258]], [[23 33]], [[89 91], [611 613], [611 624], [982 984], [1059 1061], [1560 1562], [1797 1799], [1841 1843], [1948 19..."
7,2022-03-20,business,perspectives,Opinion: Technology is transforming the nature of money. Here's how it will affect our lives - CNN,Technology is transforming the nature of money. Here's how it will affect our lives,The convenience of digital payments to both consumers and businesses makes it highly unlikely that cash will survive much longer.,This interview has been edited from its original version. It was originally published in its entirety in the International Monetary Fund's Spring 2022 issue of Finance & Development magazine. Eswa...,"{'characters': [[[105 137], [109 137], [728 758], [732 758], [830 832]], [[123 130], [425 429], [588 596], [746 753], [2114 2122], [5820 5828], [6959 6967], [7166 7174]], [[160 166], [490 496], [5..."
...,...,...,...,...,...,...,...,...
10407,2022-01-13,business,business,Nation's top trade groups hail SCOTUS ruling after filing emergency appeals against Biden's vaccine mandate - CNN,Nation's top trade groups hail SCOTUS ruling after filing emergency appeals against Biden's vaccine mandate,The nation's largest industry trade groups are calling the US Supreme Court's decision to block US President Joe Biden's vaccine or testing requirement for businesses a victory for employers.,(CNN Business)The nation's largest industry trade groups are calling the US Supreme Court's decision to block US President Joe Biden's vaccine or testing requirement for businesses a victory for ...,"{'characters': [[[2 4]], [[ 2 13]], [[ 6 13], [171 180], [418 427], [1859 1868], [2382 2389], [2691 2700], [2714 2723], [2778 2785], [2860 2867], [3059 3066], [3449 3456], [3634 3642], [3696 3704]..."
10408,2022-01-04,business,business,Port of Los Angeles traffic sets record - CNN,Port of Los Angeles traffic sets record in 2021,"A record-setting 10.7 million 20-foot containers passed through the Port of Los Angeles in 2021, up 13% from the previous record set in 2018.","(CNN)A record-setting 10.7 million 20-foot containers passed through the Port of Los Angeles in 2021, up 13% from the previous record set in 2018.And that pace is set to continue, according to Ge...","{'characters': [[[2 4]], [[74 77], [233 236], [393 396], [603 606], [730 734], [800 804], [901 904], [966 969], [1133 1137], [1232 1235], [1284 1289], [1291 1294], [1392 1395], [2144 2148], [2416 ..."
10409,2021-12-06,business,business,"Ford delays return to office for 30,000 workers because of Covid concerns - CNN","Ford delays return to office for 30,000 workers because of Covid concerns","Ford Motor Co. is pushing its return-to-office date to March over concerns over new Covid-19 concerns, the company said Monday.","New York (CNN Business)Ford Motor Co. is pushing its return-to-office date to March over concerns over new Covid-19 concerns, the company said Monday. Assembly line workers returned to work in May...","{'characters': [[[0 7]], [[10 12]], [[10 21]], [[14 21]], [[23 26], [23 32], [23 35], [23 36], [874 877]], [[107 114], [414 421], [423 427]], [[253 264], [257 264]], [[130 136]], [[151 163]], [[21..."
10410,2021-11-18,business,economy,"Thanksgiving dinner will cost Americans 14% more this year, survey finds - CNN","Thanksgiving dinner will cost Americans 14% more this year, survey finds","Thanksgiving dinner will cost 14% more this year, according to new survey by the American Farm Bureau Federation. Dinner for a family of 10 will cost on average $53.31 — up $6.41 from last year's ...","New York (CNN Business)Thanksgiving dinner will cost 14% more this year, according to new survey by the American Farm Bureau Federation. Dinner for a family of 10 will cost on average $53.31 — up ...","{'characters': [[[0 7]], [[10 12], [2112 2114]], [[10 21]], [[14 21]], [[23 34], [1520 1531], [2321 2332], [2447 2458], [2889 2900]], [[104 111], [184 184], [196 196], [230 230], [358 366], [464 4..."


## Replace text with entity titles. 

In [4]:
# # Uncomment to run this. It may take a while to complete. 
# # The result (df_business_ref) is what the "Save the processed data" cell caches 
# # and what the corpus cell can optionally use instead of df_business. 

# df_business_ref = df_business.copy() 

# # Replace specific text with entities title. 
# # axis="columns" makes pandas call each helper once per row (one article at a time). 
# df_business_ref["token_variation"] = df_business_ref.apply(get_token_variation, axis="columns") 
# df_business_ref["article_text_ref"] = df_business_ref.apply(replace_references, axis="columns") 

# # Preview. 
# df_business_ref 

### Save the processed data. 

In [5]:
# # Cache the processed dataframe. 
# # Requires df_business_ref, which only exists after the (disabled) replacement 
# # cell above has been uncommented and run. 
# manage_files.write_to_parquet(df_business_ref, filename="cnn_news_business_ref.parquet", index=True) 

## Create embeddings. 

### Build and save the cnn_corpus. 

In [6]:
## Uncomment and run this part if you want to recreate the corpus. 
## Output: one sentence per line in {DIR_DATASET}/cnn_corpus.cor, which is the 
## file the "Train FastText CBOW" cell reads. 

# # Pick ONE source frame together with its matching text column. 
# # The (_ref) variant holds the text whose specific mentions were replaced by the 
# # entities title for each article; that text lives in "article_text_ref" and the 
# # frame only exists after the "Replace text with entities title" cell has run. 
# # The alternative stays double-commented so that uncommenting the whole cell 
# # cannot silently overwrite the first choice. 
# df_cnn_corpus, text_column = df_business.copy(), "article_text" 
# # df_cnn_corpus, text_column = df_business_ref.copy(), "article_text_ref" 

# # Split the body text into separate sentences (naive split on every "."). 
# df_cnn_corpus["article_sent"] = df_cnn_corpus[text_column].str.split(r"\.") 
# df_cnn_corpus = df_cnn_corpus.explode(["article_sent"]) 
# df_cnn_corpus["article_sent"] = df_cnn_corpus["article_sent"].str.strip() 

# # Drop missing values and empty fragments (such as the piece after an article's 
# # final period): a missing value would break the join below, and an empty 
# # fragment would become a line holding only ".". 
# has_text = df_cnn_corpus["article_sent"].fillna("").ne("") 
# df_cnn_corpus = df_cnn_corpus[has_text].copy() 
# df_cnn_corpus["article_sent"] = df_cnn_corpus["article_sent"] + "." 

# # Save it as a corpus. The encoding is explicit so the file does not depend on 
# # the platform default (the articles contain non-ASCII characters). 
# cnn_corpus = "\n".join(df_cnn_corpus["article_sent"].to_list()) 
# with open(f"{DIR_DATASET}/cnn_corpus.cor", "w", encoding="utf-8") as f: 
# 	f.write(cnn_corpus) 

# # Preview. 
# print(cnn_corpus[:300], " ...") 

### Train FastText CBOW. 

In [7]:
# # Uncomment this part if you want to build the embeddings. 

# corpus_file = f"{DIR_DATASET}/cnn_corpus.cor" 

# # Create embeddings (sg=0 selects CBOW; hs=0 with negative=10 uses negative sampling). 
# # NOTE(review): gensim documents that a seed alone does not give a fully 
# # reproducible run when several worker threads are used; set workers=1 if exact 
# # reproducibility matters. 
# cnn_embeddings = FastText(
# 	vector_size=300, window=5, min_count=20, alpha=0.025, sg=0, hs=0, sample=0.001, 
# 	negative=10, ns_exponent=0.5, cbow_mean=1, min_n=3, max_n=6, word_ngrams=1, 
# 	shrink_windows=False, batch_words=10000, workers=3, seed=PARAM_SEED, 
# ) 

# # Build the vocabulary. 
# cnn_embeddings.build_vocab(corpus_file=corpus_file) 

# # The training takes roughly 8 minutes with these parameters. 
# # The corpus statistics are read from cnn_embeddings, the model whose vocabulary 
# # was just built; the name `model` is not defined by any visible cell. 
# cnn_embeddings.train(
# 	corpus_file=corpus_file, epochs=100, 
# 	total_examples=cnn_embeddings.corpus_count, 
# 	total_words=cnn_embeddings.corpus_total_words, 
# ) 

### Save the embeddings. 

In [8]:
# # Save the embeddings. 
# # Writes to the same path that the "Load pre-built embeddings" cell reads, so 
# # running this replaces the pre-built model. 
# cnn_embeddings.save(f"{DIR_MLTOPIC}/cnn_embeddings.model") 

### Load pre-built embeddings. 

In [9]:
# Location of the pre-built model under DIR_MLTOPIC. 
embeddings_path = f"{DIR_MLTOPIC}/cnn_embeddings.model" 
cnn_embeddings = FastText.load(embeddings_path) 

# Preview. 
cnn_embeddings

<gensim.models.fasttext.FastText at 0x7ff687a34f40>

### Evaluation. 

In [10]:
# Nearest neighbours of each query word, shown together as one tuple. 
(
	cnn_embeddings.wv.most_similar(positive="pandemic"), 
	cnn_embeddings.wv.most_similar(positive="covid"), 
)

([('pandemic.', 0.8037382364273071),
  ('pandemic,', 0.7571540474891663),
  ('pre-pandemic', 0.6355668902397156),
  ('crisis.', 0.2911927103996277),
  ('problems', 0.2882746458053589),
  ('crisis', 0.2880103886127472),
  ('problem', 0.2813224494457245),
  ('Great', 0.2773291766643524),
  ('outbreak', 0.2510329782962799),
  ('rising', 0.24843332171440125)],
 [('Covid', 0.712553083896637),
  ('cover', 0.4673186242580414),
  ('provide', 0.40242406725883484),
  ('providers', 0.3854585587978363),
  ('providing', 0.3788436949253082),
  ('provided', 0.365965873003006),
  ('covering', 0.3486703634262085),
  ('provides', 0.346785306930542),
  ('covered', 0.3427681028842926),
  ('Covid-19', 0.3344852924346924)])

In [11]:
# Cosine similarity of the two words, followed by a multiplicative-combination query. 
# NOTE(review): the second positional argument of most_similar_cosmul is `negative`, 
# so this ranks words close to "pandemic" and far from "covid". Use 
# positive=["pandemic", "covid"] if both were meant to be positive examples. 
(
	cnn_embeddings.wv.similarity("pandemic", "covid"), 
	cnn_embeddings.wv.most_similar_cosmul(positive="pandemic", negative="covid"), 
)

(0.04626437,
 [('pre-pandemic', 1.7302595376968384),
  ('pandemic.', 1.7153069972991943),
  ('pandemic,', 1.6258063316345215),
  ('February.', 1.342038869857788),
  ('half', 1.2852768898010254),
  ('March,', 1.2811317443847656),
  ('1.', 1.270282506942749),
  ('baseball', 1.2701747417449951),
  ('Back', 1.2684721946716309),
  ('13', 1.2661775350570679)])

In [12]:
# Cosine similarity of the two keys, followed by a multiplicative-combination query. 
# NOTE(review): the second positional argument of most_similar_cosmul is `negative`, 
# so "interest rate" is used as a negative example here — confirm this is intended. 
# NOTE(review): "interest rate" contains a space; if the corpus was tokenised on 
# whitespace this key is presumably not in the vocabulary, and FastText would 
# build its vector from character n-grams only — confirm. 
(
	cnn_embeddings.wv.similarity("economic", "interest rate"), 
	cnn_embeddings.wv.most_similar_cosmul(positive="economic", negative="interest rate"), 
)

(0.19502483,
 [('economics', 1.563862919807434),
  ('Economic', 1.550460696220398),
  ('economist', 1.4416495561599731),
  ('economists', 1.3728910684585571),
  ('MoreIn', 1.3392984867095947),
  ('Republican', 1.3369587659835815),
  ('economies', 1.3204340934753418),
  ('economy.', 1.3162963390350342),
  ('host', 1.2889987230300903),
  ('economy', 1.2784146070480347)])

In [13]:
# Cosine similarity of the two keys, followed by an analogy-style query: 
# close to "crude oil" and "natural gas", away from "energy". 
# NOTE(review): both positive keys contain a space; if the corpus was tokenised on 
# whitespace they are presumably not in the vocabulary, and FastText would 
# build their vectors from character n-grams only — confirm. 
(
	cnn_embeddings.wv.similarity("crude oil", "natural gas"), 
	cnn_embeddings.wv.most_similar(positive=["crude oil", "natural gas"], negative=["energy"]), 
)

(0.21848081,
 [('crude', 0.6069318056106567),
  ('natural', 0.5777915120124817),
  ('oil', 0.3565499484539032),
  ('nature', 0.343952476978302),
  ('oil.', 0.28038379549980164),
  ('gas.', 0.2508723735809326),
  ('Oil', 0.23920536041259766),
  ('exports', 0.23526595532894135),
  ('shares', 0.22840961813926697),
  ('producers', 0.22448354959487915)])