In [60]:
import pandas as pd
import numpy as np
from lib.utility import get_text, ProcessPipeline
import pickle
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [61]:
# Load the pickled news texts (stored as full strings rather than word lists).
# NOTE: pickle.load can execute arbitrary code - only use it on trusted files.
with open('./data/news.pickle', 'rb') as news_file:
    texts = pickle.load(news_file)

# Run the 'langdetection' and 'summarization' steps; return_str=True
# presumably yields one string per document - TODO confirm in lib.utility.
pipeline = ProcessPipeline(
    texts,
    steps=['langdetection', 'summarization'],
)
textsSummarized = pipeline.run(return_str=True)

In [4]:
# Cache the summarized texts so the sentiment steps can reuse them.
with open('./data/text_for_sentiment.pickle', 'wb') as summary_file:
    pickle.dump(
        textsSummarized,
        summary_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

### vaderSentiment

In [68]:
# Score the texts with the pretrained VADER sentiment model.
analyzer = SentimentIntensityAnalyzer()

# Single-document example: the first summarized text.
eg = textsSummarized[0]
senti_eg = analyzer.polarity_scores(eg)
print(senti_eg)

# Every document: one polarity-score dict per summarized text.
senti_list = [analyzer.polarity_scores(doc) for doc in textsSummarized]
print(senti_list[:5])


{'neg': 0.044, 'neu': 0.824, 'pos': 0.133, 'compound': 0.9306}
[{'neg': 0.044, 'neu': 0.824, 'pos': 0.133, 'compound': 0.9306}, {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 0.778, 'pos': 0.222, 'compound': 0.9468}, {'neg': 0.011, 'neu': 0.821, 'pos': 0.168, 'compound': 0.9584}]


In [22]:
# The compound value is used as the overall sentiment score of a document.
senti_list_compound = [scores['compound'] for scores in senti_list]
print(senti_list_compound[:5])

[0.9306, 0.0, 0.0, 0.9468, 0.9584]


In [69]:
# Turn each compound score into a label: above zero is Positive,
# below zero is Negative, and anything else is Neutral.
senti_class = [
    'Positive' if score > 0 else 'Negative' if score < 0 else 'Neutral'
    for score in senti_list_compound
]
senti_class[:5]

['Positive', 'Neutral', 'Neutral', 'Positive', 'Positive']

In [70]:
# Add the score and label columns to the rows read from bigquery_raw.csv,
# show a preview, and write the combined table to a new CSV.
raw_0 = pd.read_csv('./data/bigquery_raw.csv', index_col=0).assign(
    Sentiment_score=senti_list_compound,
    Sentiment=senti_class,
)
print(raw_0.head(5))
raw_0.to_csv('./data/news_with_senti.csv')

             DATE                                             THEMES  \
0  20190101060000  EDUCATION;SOC_POINTSOFINTEREST;SOC_POINTSOFINT...   
1  20190101061500  TAX_FNCACT;TAX_FNCACT_MAN;ARREST;SOC_GENERALCR...   
2  20190101063000  TAX_FNCACT;TAX_FNCACT_LEADER;ENV_NUCLEARPOWER;...   
3  20190101061500  ENV_GREEN;WB_507_ENERGY_AND_EXTRACTIVES;WB_525...   
4  20190101061500  ENV_GREEN;WB_507_ENERGY_AND_EXTRACTIVES;WB_525...   

                                  DocumentIdentifier  Sentiment_score  \
0  https://www.daijiworld.com/chan/exclusiveDispl...           0.9306   
1             https://caymannewsservice.com/2018/12/           0.0000   
2  https://www.vesti.bg/tehnologii/bil-gejts-sash...           0.0000   
3  https://www.ajc.com/business/economy/georgia-p...           0.9468   
4  https://pv-magazine-usa.com/2018/12/18/breakin...           0.9584   

  Sentiment  
0  Positive  
1   Neutral  
2   Neutral  
3  Positive  
4  Positive  


In [34]:
# To figure out most negative or positive news
def most_senti_news(df,Pos = True,top = 10):
    """
    Rank the rows of ``df`` by ``Sentiment_score`` and keep the first ``top``.

    With ``Pos`` truthy (the default) the highest scores come first; otherwise
    the lowest scores come first.

    Returns a pair of pandas Series, ``(THEMES, DocumentIdentifier)``, both
    re-indexed from 0 in ranking order.
    """
    # A truthy Pos means "most positive first", i.e. a descending sort.
    ranked = df.sort_values(by=['Sentiment_score'], ascending=not Pos)
    ranked = ranked.reset_index(drop=True)
    themes = ranked['THEMES'][:top]
    documents = ranked['DocumentIdentifier'][:top]
    return themes, documents


### TextBlob

In [38]:
# TextBlob sentiment for the same example document used above.
eg_blob = TextBlob(eg)
blob_eg = eg_blob.sentiment
print("blob_eg:", blob_eg)

blob_eg: Sentiment(polarity=0.07952651515151514, subjectivity=0.40354166666666674)


In [48]:
# A supervised classifier could be trained for this project.
# The toy data below follows the example in the TextBlob documentation;
# TextBlob's classifier is used because it is easy to explain to the finance team.
from textblob.classifiers import NaiveBayesClassifier

train = [
    ('I love this sandwich.', 'pos'),
    ('this is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('this is my best work.', 'pos'),
    ("what an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('he is my sworn enemy!', 'neg'),
    ('my boss is horrible.', 'neg'),
]
test = [
    ('the beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg'),
]

cl = NaiveBayesClassifier(train)
# Predict a label for each test sentence; the expected labels in `test`
# are not used in this cell.
blob = [cl.classify(sentence) for sentence, _expected in test]
blob

['pos', 'neg', 'neg', 'pos', 'neg', 'neg']

### Word embedding


In [78]:
# Train a model on our own documents.
# Labels would have to come from the finance team, so for now this is only
# a toy example adapted from the TF-IDF documentation.

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Named `toy_texts` so the news `texts` loaded at the top of the notebook
# is not overwritten by this demo.
toy_texts = [
    "The cat sat on the hat",
    "The dog ate the cat and the hat",
    "Dog and cat are good friends"
]
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(toy_texts)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when present and fall back to the old name on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    uniqueTf = list(tfidf.get_feature_names_out())
else:
    uniqueTf = tfidf.get_feature_names()
print("tf_idf:\n\n",pd.DataFrame(features.toarray(),columns=uniqueTf))

# svd
# The matrix is random placeholder data (values 0-2), not derived from the
# texts; a fixed seed keeps the printed example reproducible across runs.
rng = np.random.default_rng(0)
coMatrix = rng.integers(3, size=(5, len(uniqueTf)))
print('svd\n\n',pd.DataFrame(columns=uniqueTf,data=coMatrix))
u, s, vt  = np.linalg.svd(coMatrix,full_matrices=False)
print(f"shape of SVD component:\n\n {u.shape},{s.shape},{vt.shape}")
# Multiplying the factors back together reconstructs the input matrix.
print('score:\n\n',pd.DataFrame(u@np.diag(s)@vt).head(2))

tf_idf:

         and      are       ate       cat       dog  friends     good  \
0  0.000000  0.00000  0.000000  0.257992  0.000000  0.00000  0.00000   
1  0.264148  0.00000  0.347322  0.205134  0.264148  0.00000  0.00000   
2  0.358291  0.47111  0.000000  0.278245  0.358291  0.47111  0.47111   

        hat        on       sat       the  
0  0.332211  0.436818  0.436818  0.664422  
1  0.264148  0.000000  0.000000  0.792443  
2  0.000000  0.000000  0.000000  0.000000  
svd

    and  are  ate  cat  dog  friends  good  hat  on  sat  the
0    2    0    1    2    1        0     2    0   2    0    1
1    2    1    0    1    2        0     1    0   0    1    1
2    2    1    0    0    0        2     1    0   1    2    1
3    2    0    1    2    1        0     1    1   1    1    1
4    0    1    0    2    1        2     1    1   2    2    0
shape of SVD component:

 (5, 5),(5,),(5, 11)
score:

     0             1             2    3    4             5    6             7   \
0  2.0  2.432867e

### Modularize

In [None]:
class sentiment:
    """Score news texts with VADER and rank them by compound sentiment.

    Parameters
    ----------
    texts : sequence of str
        Raw documents; they are passed through ``ProcessPipeline`` before scoring.
    df : pandas.DataFrame, optional
        Frame with one row per text. The score columns are added to a copy of
        it; when omitted, a new frame holding only the score columns is built.
    Pos : bool, default True
        Default ranking direction for ``most_senti_news`` (True = most positive first).
    top : int, default 10
        Default number of rows returned by ``most_senti_news``.
    path : str, optional
        Destination CSV for the scored frame; nothing is written when None.
    """

    def __init__(self, texts, df = None,Pos = True,top = 10, path = None):
        # `path` needs a default: a parameter without one may not follow
        # parameters that have defaults.
        self.texts = texts
        self.df = df
        self.Pos = Pos
        self.top = top
        self.path = path

    def get_data(self,texts):
        """Run the 'langdetection' and 'summarization' pipeline steps on ``texts``.

        Returns whatever ``ProcessPipeline.run(return_str=True)`` produces,
        presumably one string per input document - TODO confirm in lib.utility.
        """
        pipeline = ProcessPipeline(texts,steps=['langdetection','summarization'])
        return pipeline.run(return_str=True)

    def most_senti_news(self,df=None,Pos=None,top=None):
        """Return the ``top`` most positive (or most negative) news items.

        Any argument left as None falls back to the value given to the constructor.

        Returns a pair of pandas Series, ``(THEMES, DocumentIdentifier)``,
        re-indexed from 0 in ranking order.

        Raises
        ------
        ValueError
            If no DataFrame is passed and none was given to the constructor.
        """
        df = self.df if df is None else df
        Pos = self.Pos if Pos is None else Pos
        top = self.top if top is None else top
        if df is None:
            raise ValueError("most_senti_news needs a DataFrame with a 'Sentiment_score' column")
        ranked = df.sort_values(by=['Sentiment_score'], ascending=not Pos).reset_index(drop=True)
        return ranked['THEMES'][:top], ranked['DocumentIdentifier'][:top]

    @staticmethod
    def _classify(score):
        """Map a compound score to 'Positive', 'Negative' or 'Neutral'."""
        if score > 0:
            return 'Positive'
        if score < 0:
            return 'Negative'
        return 'Neutral'

    @staticmethod
    def _score_chunk(chunk):
        """Return the VADER compound score of every text in ``chunk``.

        A static method with no instance state, so it can be sent to worker
        processes; one analyzer is built per chunk rather than per text.
        """
        analyzer = SentimentIntensityAnalyzer()
        return [analyzer.polarity_scores(text)['compound'] for text in chunk]

    def _build_frame(self, scores, df):
        """Attach scores and labels to a copy of ``df`` and optionally save it."""
        labels = [self._classify(score) for score in scores]
        base = pd.DataFrame() if df is None else df
        # assign() works on a copy, so the caller's frame is left untouched.
        scored = base.assign(Sentiment_score=scores, Sentiment=labels)
        if self.path is not None:
            # The index is written so the file can be read back with index_col=0.
            scored.to_csv(self.path)
        return scored

    def senti_vendar(self,texts=None,df=None):
        """Summarize ``texts``, score them serially with VADER and return the scored frame.

        ``texts`` and ``df`` default to the values given to the constructor.
        The result is also written to ``self.path`` when a path was provided.
        (The method name is kept as-is so existing references keep working.)
        """
        texts = self.texts if texts is None else texts
        df = self.df if df is None else df
        textsSummarized = self.get_data(texts)
        senti_list_compound = self._score_chunk(textsSummarized)
        return self._build_frame(senti_list_compound, df)

    def run(self,workers=6):
        """Score ``self.texts`` using up to ``workers`` processes; return the scored frame.

        The texts are summarized once, split into contiguous chunks, and each
        chunk is scored in its own worker. Order is preserved. With
        ``workers <= 1`` (or at most one text) scoring happens in-process.

        NOTE(review): worker processes must be able to import this class. When
        it is defined only inside a notebook this may fail on platforms that
        spawn instead of fork - use workers=1 there or move the class to a module.
        """
        textsSummarized = list(self.get_data(self.texts))
        if workers <= 1 or len(textsSummarized) <= 1:
            scores = self._score_chunk(textsSummarized)
        else:
            # Ceiling division so at most `workers` chunks are created.
            size = -(-len(textsSummarized) // workers)
            chunks = [textsSummarized[i:i + size]
                      for i in range(0, len(textsSummarized), size)]
            with ProcessPoolExecutor(max_workers=workers) as executor:
                parts = list(executor.map(self._score_chunk, chunks))
            scores = [score for part in parts for score in part]
        return self._build_frame(scores, self.df)
        
        