In [27]:
import pandas as pd
import numpy as np
from lib.utility import get_text, ProcessPipeline
from lib.nlp import nlpPipeline,featurePipeline
import pickle

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor


# Part 1: data engineering

In [3]:
### read pickle file
with open('outputs/step2_news_raw.pickle', 'rb') as handle:
    texts = pickle.load(handle)

In [78]:
len(texts)

2988

In [14]:
### read pickle file
with open('outputs/step3_LDA_textsProcessed.pickle', 'rb') as handle:
    textsProcessed = pickle.load(handle)

# Part 2: Topic Modeling

In [5]:
from gensim import corpora, models
import gensim
from gensim.models import CoherenceModel


In [6]:
dictionary = corpora.Dictionary(textsProcessed)

In [7]:
dictionary.token2id

{'C': 0,
 'CO': 1,
 'acidif': 2,
 'activ': 3,
 'address': 4,
 'also': 5,
 'among': 6,
 'anthropogen': 7,
 'articl': 8,
 'avail': 9,
 'beli': 10,
 'caus': 11,
 'chang': 12,
 'climat': 13,
 'concern': 14,
 'consensu': 15,
 'consumpt': 16,
 'continu': 17,
 'contribut': 18,
 'cyclon': 19,
 'degre': 20,
 'develop': 21,
 'drought': 22,
 'earth’': 23,
 'econom': 24,
 'electr': 25,
 'emiss': 26,
 'energi': 27,
 'etc': 28,
 'event': 29,
 'expect': 30,
 'extrem': 31,
 'fire': 32,
 'forest': 33,
 'forum': 34,
 'fuell': 35,
 'global': 36,
 'grid': 37,
 'grow': 38,
 'ice': 39,
 'includ': 40,
 'increas': 41,
 'indian': 42,
 'india’': 43,
 'issu': 44,
 'known': 45,
 'level': 46,
 'life': 47,
 'lifetim': 48,
 'link': 49,
 'live': 50,
 'loss': 51,
 'mean': 52,
 'million': 53,
 'one': 54,
 'particularli': 55,
 'polar': 56,
 'potenti': 57,
 'press': 58,
 'problem': 59,
 'public': 60,
 'read': 61,
 'research': 62,
 'rise': 63,
 'sea': 64,
 'solar': 65,
 'strongli': 66,
 'surfac': 67,
 'temperatur': 68,
 '

In [9]:
corpus = [dictionary.doc2bow(singleDocList) for singleDocList  in textsProcessed]

In [10]:
### This list of tuples represents our first document, doc_a. The tuples are (term ID, term frequency) pairs, so if  print(dictionary.token2id) says brocolli’s id is 0, then the first tuple indicates that brocolli appeared twice in doc_a. doc2bow() only includes terms that actually occur: terms that do not occur in a document will not appear in that document’s vector.
print(corpus[0])

[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 2), (12, 2), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 4), (27, 4), (28, 1), (29, 2), (30, 1), (31, 2), (32, 1), (33, 1), (34, 2), (35, 1), (36, 5), (37, 1), (38, 1), (39, 1), (40, 1), (41, 4), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 2), (49, 2), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 3), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 2), (76, 1), (77, 1)]


In [11]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)

In [12]:
ldamodel.print_topics(num_topics=3, num_words=10)

[(0,
  '0.031*"hous" + 0.026*"new" + 0.022*"trump" + 0.022*"democrat" + 0.017*"speaker" + 0.017*"pelosi" + 0.017*"govern" + 0.013*"congress" + 0.012*"—" + 0.012*"leader"'),
 (1,
  '0.014*"energi" + 0.011*"new" + 0.009*"state" + 0.009*"said" + 0.007*"climat" + 0.006*"year" + 0.006*"solar" + 0.005*"chang" + 0.005*"would" + 0.005*"renew"'),
 (2,
  '0.012*"energi" + 0.011*"share" + 0.010*"compani" + 0.009*"market" + 0.007*"rate" + 0.007*"power" + 0.006*"report" + 0.006*"gener" + 0.005*"electr" + 0.005*"stock"')]

In [13]:
# Visualize the topics
from pyLDAvis import gensim as pyLDAvis_gensim
import pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis_gensim.prepare(ldamodel, corpus, dictionary)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [23]:
ldamodel[corpus[0]]

[(1, 0.58676755), (2, 0.4099226)]

In [24]:
### Check each document and its topics
topicDic = {0:[],1:[],2:[]}
for i in range(len(corpus)):
    allTopics = ldamodel[corpus[i]] ### eg: [(1, 0.58676654), (2, 0.4099236)]
    for topic in allTopics:
        topicID = topic[0]
        prob = topic[1]
        if prob > 0.6:
            topicDic[topicID] += [i]
            break


In [26]:
topicDic.keys()

dict_keys([0, 1, 2])

# Part 3: Sentiment analysis

In [33]:
### read pickle file
with open('outputs/step4_sent_features.pickle', 'rb') as handle:
    sentDF = pickle.load(handle)
print(sentDF.shape)
sentDF.head()

(2988, 6)


Unnamed: 0,compound,neg,neu,plarity,pos,subjectivity
0,0.9178,0.045,0.828,0.875,0.127,0.6
1,0.0,0.0,0.0,0.875,0.0,0.6
2,0.0,0.0,0.0,0.875,0.0,0.6
3,0.9468,0.0,0.778,0.875,0.222,0.6
4,0.9584,0.011,0.818,0.875,0.172,0.6


In [38]:
posFeatures = sentDF['pos'].values
negFeatures = sentDF['neg'].values

In [39]:
featureDic = {}
for topicID in topicDic:
    newsIDs = [topicDic[topicID]]
    
    tmpDic = {'pos':posFeatures[newsIDs],'neg':negFeatures[newsIDs]}
    featureDic[topicID] = tmpDic

  """


In [40]:
featureDic

{0: {'pos': array([0.049, 0.012, 0.012, 0.012, 0.012, 0.049, 0.049, 0.012, 0.049,
         0.049, 0.049, 0.049, 0.032, 0.077, 0.049, 0.049, 0.049, 0.049,
         0.049, 0.049, 0.049, 0.012, 0.049, 0.012, 0.049, 0.049, 0.077,
         0.012, 0.012, 0.271, 0.   , 0.049, 0.012, 0.012, 0.012, 0.049,
         0.049, 0.049, 0.049, 0.077, 0.012, 0.049, 0.049, 0.065, 0.012,
         0.012, 0.065, 0.049, 0.07 , 0.053, 0.057, 0.012, 0.049, 0.012,
         0.053, 0.053, 0.053, 0.053, 0.049, 0.057, 0.012, 0.053, 0.053,
         0.053, 0.07 , 0.049, 0.07 , 0.053, 0.07 , 0.049, 0.049, 0.053,
         0.049, 0.07 , 0.053, 0.07 , 0.   , 0.049, 0.012, 0.049, 0.049,
         0.053, 0.053, 0.049, 0.049, 0.049, 0.049, 0.053, 0.012, 0.053,
         0.049, 0.049, 0.049, 0.07 , 0.053, 0.053, 0.049, 0.049, 0.049,
         0.053, 0.   , 0.049, 0.049, 0.053, 0.049, 0.089, 0.049, 0.065,
         0.049, 0.049, 0.049, 0.   , 0.057, 0.058, 0.012, 0.041, 0.057,
         0.   , 0.013, 0.034, 0.049, 0.058, 0.049, 0.0

In [68]:
top3Dic = {}
for topicID in featureDic:
    tmpPos = featureDic[topicID]['pos']
    tmpNeg = featureDic[topicID]['neg']
    
    top3pos = np.argpartition(tmpPos, -3)[-3:]
    top3neg =  np.argpartition(tmpNeg, -3)[-3:]
    
    tmpDic = {}
    tmpDic['pos'] = top3pos
    tmpDic['neg'] = top3neg
    
    top3Dic.update({topicID:tmpDic})

In [69]:
top3Dic

{0: {'pos': array([279, 290, 185]), 'neg': array([279, 100, 486])},
 1: {'pos': array([227,  70, 193]), 'neg': array([558, 729, 316])},
 2: {'pos': array([489, 153, 415]), 'neg': array([346, 381, 454])}}

In [70]:
for topicID in top3Dic:
    print(f"For topic {topicID}: three most positive articles are : {top3Dic[topicID]['pos']}, three most negative articles are : {top3Dic[topicID]['neg']}")

For topic 0: three most positive articles are : [279 290 185], three most negative articles are : [279 100 486]
For topic 1: three most positive articles are : [227  70 193], three most negative articles are : [558 729 316]
For topic 2: three most positive articles are : [489 153 415], three most negative articles are : [346 381 454]


# Part 4: Keyword extraction

In [72]:
### read pickle file
with open('outputs/step4_sent_textsSummarized.pickle', 'rb') as handle:
    textsSummarized = pickle.load(handle)

In [74]:
entityDic = {}

In [None]:
for topicID in topicDic:
    includedNewsIndex = topicDic[topicID]
    includedNews = np.array(textsSummarized)[includedNewsIndex]
    print("Start to process of topic:",topicID)
    pipeline = nlpPipeline(texts=includedNews,targetPOSs=[]) ### only interested in entity
    entities = pipeline.run()
    entityDic[topicID] = entities
    print("Finish process of topic:",topicID)

In [None]:
entityDic