In [1]:
import re  
import pandas as pd 
from collections import defaultdict  # For word frequency
import logging  # Setting up the loggings to monitor gensim
# Route gensim's progress messages to the notebook output at INFO level,
# prefixed with the level name and a wall-clock timestamp.
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

In [2]:
# NOTE(review): this is an absolute, machine-specific path and it ends in
# '.csv.' (trailing dot) — confirm both are intended before sharing the notebook.
csv_path = r'C:\Users\Joyce Huang\Downloads\financial-news-sentiments.csv.'
df = pd.read_csv(csv_path, index_col=None)
# Preview the first three rows.
df.head(3)

Unnamed: 0,sentiments,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...


In [3]:
# Keep only the 'text' column; the other columns shown in the preview
# ('Unnamed: 0', 'sentiments') are not used below.
df = df.loc[:, ['text']]

In [4]:
# Matches any run of characters that are neither ASCII letters nor apostrophes.
_NON_LETTER_RUN = re.compile("[^A-Za-z']+")


def brief_cleaning(x):
    """Replace each run of non-letter, non-apostrophe characters with one space and lower-case the result.

    The argument is converted with ``str()`` first, so non-string values are
    cleaned as their string representation.
    """
    return _NON_LETTER_RUN.sub(' ', str(x)).lower()

In [5]:
# Add a 'clean' column: letters/apostrophes only, lower-cased.
df["clean"] = df.text.apply(brief_cleaning)
from nltk.corpus import stopwords
import nltk 
# Make sure the NLTK stop-word corpus is available locally before loading it.
nltk.download('stopwords')
# English stop words as a set, used for membership tests in cleaning().
stop_words = set(stopwords.words('english')) 
def cleaning(doc):
    """Remove stop words from ``doc`` and return the remaining words re-joined.

    ``doc`` is split on whitespace and every word found in the module-level
    ``stop_words`` set is dropped. The surviving words are returned joined by
    single spaces, or ``None`` when two or fewer words remain (too little
    context to be useful).
    """
    kept = [token for token in doc.split() if token not in stop_words]
    # Discard documents that are too short once stop words are gone.
    if len(kept) <= 2:
        return None
    return ' '.join(kept)

[nltk_data] Downloading package stopwords to C:\Users\Joyce
[nltk_data]     Huang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Clean every document; cleaning() returns None for documents that are too short.
txt = list(map(cleaning, df["clean"].values))
# Rebuild a frame from the cleaned text, then drop the None rows and exact duplicates.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

INFO - 13:30:13: NumExpr defaulting to 8 threads.


(4738, 1)

In [7]:
from gensim.models.phrases import Phrases, Phraser
# Phrases takes a list of token lists, so split each cleaned line on whitespace.
sent = list(map(str.split, df_clean['clean']))
# Collect phrase statistics over the tokenised corpus.
phrases = Phrases(sent, min_count=10, progress_per=10000)
# Wrap the statistics in a Phraser and apply it to the corpus.
bigram = Phraser(phrases)
sentences = bigram[sent]

INFO - 13:30:13: 'pattern' package not found; tag filters are not available for English
INFO - 13:30:13: collecting all words and their counts
INFO - 13:30:13: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 13:30:13: collected 47878 word types from a corpus of 60074 words (unigram + bigrams) and 4738 sentences
INFO - 13:30:13: using 47878 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10.0, max_vocab_size=40000000>
INFO - 13:30:13: source_vocab length 47878
INFO - 13:30:14: Phraser built with 141 phrasegrams


In [8]:
# Sanity-check the phrase detection: print the first sentence containing
# "stock" before and after the bigram transformation was applied.
for before, after in zip(sent, sentences):
    if "stock" in after:
        print("before bigram extracting")
        print(before)
        print("after bigram extracting")
        print(after)
        break

before bigram extracting
['increase', 'capital', 'stock', 'registered', 'finnish', 'trade', 'register', 'november']
after bigram extracting
['increase', 'capital', 'stock', 'registered', 'finnish', 'trade_register', 'november']


In [9]:
# Count how often each token (single word or merged bigram) occurs in the corpus.
# The loop variables are deliberately not called `sent`: that name holds the
# tokenised pre-bigram corpus built in an earlier cell, and rebinding it here
# would break any later (or re-run) cell that indexes into it.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
# Number of distinct tokens.
len(word_freq)

9084

In [10]:
# Ten most frequent tokens, highest count first (ties keep first-seen order).
ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
[token for token, _ in ranked[:10]]

["'s",
 'company',
 'eur',
 'finnish',
 'said',
 'eur_mn',
 'finland',
 'group',
 'new',
 'business']

In [11]:
# gensim supports multi-core training; record the number of CPU cores so the
# worker count can be derived from it when the model is configured.
import multiprocessing
cores = multiprocessing.cpu_count()
from gensim.models import Word2Vec

In [12]:
# Configure (but do not yet train) the Word2Vec model.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm the installed version before upgrading.
w2v_model = Word2Vec(min_count=10,
                     window=3,
                     size=55,
                     sample=6e-5,
                     alpha=0.05,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never go below one worker:
                     # on a single-core machine `cores - 1` would be 0.
                     workers=max(1, cores - 1))



In [13]:
# Scan the corpus to build the model vocabulary, logging progress every
# 10000 sentences.
w2v_model.build_vocab(sentences, progress_per=10000)

INFO - 13:30:14: collecting all words and their counts
INFO - 13:30:14: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 13:30:14: collected 9084 word types from a corpus of 56134 raw words and 4738 sentences
INFO - 13:30:14: Loading a fresh vocabulary
INFO - 13:30:14: effective_min_count=10 retains 1263 unique words (13% of original 9084, drops 7821)
INFO - 13:30:14: effective_min_count=10 leaves 39399 word corpus (70% of original 56134, drops 16735)
INFO - 13:30:14: deleting the raw counts dictionary of 9084 items
INFO - 13:30:14: sample=6e-05 downsamples 1263 most-common words
INFO - 13:30:14: downsampling leaves estimated 12736 word corpus (32.3% of prior 39399)
INFO - 13:30:14: estimated required memory for 1263 words and 55 dimensions: 1187220 bytes
INFO - 13:30:14: resetting layer weights


In [14]:
# Train for 40 epochs over the corpus; total_examples reuses the sentence
# count recorded by build_vocab. The returned tuple is shown as the cell output.
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=40,
    report_delay=1,
)

INFO - 13:30:14: training model with 7 workers on 1263 vocabulary and 55 features, using sg=0 hs=0 sample=6e-05 negative=20 window=3
INFO - 13:30:14: worker thread finished; awaiting finish of 6 more threads
INFO - 13:30:17: EPOCH 1 - PROGRESS: at 11.12% examples, 425 words/s, in_qsize 5, out_qsize 1
INFO - 13:30:17: worker thread finished; awaiting finish of 5 more threads
INFO - 13:30:19: EPOCH 1 - PROGRESS: at 28.01% examples, 676 words/s, in_qsize 4, out_qsize 1
INFO - 13:30:19: worker thread finished; awaiting finish of 4 more threads
INFO - 13:30:19: worker thread finished; awaiting finish of 3 more threads
INFO - 13:30:20: worker thread finished; awaiting finish of 2 more threads
INFO - 13:30:20: worker thread finished; awaiting finish of 1 more threads
INFO - 13:30:20: worker thread finished; awaiting finish of 0 more threads
INFO - 13:30:20: EPOCH - 1 : training on 56134 raw words (12657 effective words) took 5.6s, 2253 effective words/s
INFO - 13:30:20: worker thread finished

(510354, 2245360)

In [17]:
# Persist the trained model to disk so it can be loaded again later.
w2v_model.save('tmp_model')
# To load it back (Word2Vec is imported above; `gensim` itself is not):
# new_model = Word2Vec.load('tmp_model')

INFO - 13:35:01: saving Word2Vec object under tmp_model, separately None
INFO - 13:35:01: not storing attribute vectors_norm
INFO - 13:35:01: not storing attribute cum_table
INFO - 13:35:01: saved tmp_model


In [18]:
# Precompute the L2-normalised word vectors used by similarity queries.
# NOTE(review): per the gensim docs, replace=True keeps only the normalised
# vectors, so the model cannot be trained further afterwards — confirm no
# additional training is intended after this cell.
w2v_model.init_sims(replace=True)

INFO - 13:35:26: precomputing L2-norms of word weight vectors


In [19]:
# Query the nearest neighbours of "fund" in the learned vector space.
# NOTE(review): in the recorded output every neighbour scores ~0.98, which may
# mean the vectors are poorly separated on this small corpus — interpret with care.
w2v_model.wv.most_similar(positive=["fund"])

[('funds', 0.9890653491020203),
 ('property', 0.9888561964035034),
 ('investment', 0.9834142327308655),
 ('equity', 0.9824492335319519),
 ('private', 0.981529176235199),
 ('sold', 0.979146420955658),
 ('stockholm', 0.9786463379859924),
 ('partners', 0.9778483510017395),
 ('taken', 0.9772882461547852),
 ('owns', 0.9764710664749146)]