In [1]:
from DataProcessor import Data_Processor
from DataProcessor import clean
import numpy as np

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yunzehui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yunzehui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yunzehui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [66]:
# Build the processor for the 2019-01 .. 2020-02 window and run its
# read / language / clean steps in order. The steps are defined in the
# DataProcessor module, so their exact effects are not visible here.
DP = Data_Processor(
    start_month='2019-01',
    end_month='2020-02',
    template="Data/Wealth_Management",
)
DP.readdata()
DP.specifylang()
DP.clean()

In [76]:
# Sanity check: textdata() appears to hold one entry per month of the requested window.
len(DP.textdata()) # 14 months

14

In [10]:
# Inspect the first entry of textdata(): a list of cleaned tweet strings.
DP.textdata()[0] # Tweets of 2019-01

['Hey  swissborg do you like this article from coindesk a simple  fair ecosystem powered by technology ',
 'Global Insights Survey finding that  of highnetworth individuals worldwide expect their future relationships to be managed primarily or entirely online via luxedigitalmag ',
 'THE DIGITAL EVOLUTION OF WEALTH MANAGEMENT How emerging technologies can improve the user experience while cutting costs and boosting revenue by biuk',
 'The rise of roboadvisers in the UAE\n\nLowcost platforms targeting younger professionals make it easier for residents to save\n\nTheNationalUAE ',
 'You never get a second chance to brand a ',
 'This Simple Trick Could Be The Secret To Warren Buffetts Success ',
 'Doing it all today your Ironwood Group Player Of The Game goes to Grace Pyatt ',
 'Our CEO David Root was featured prominently in a Pittsburgh Post Gazette article this morning on inorganic growth and acquisitions by firms Read about his perspective here ',
 'Is Aphria TSXAPHA Stock the MostWatch

In [32]:
def preprocess(text_list, stem=False, processor=None):
    """Turn raw tweet strings into space-joined unigram strings.

    Every string is handed to ``processor.getngrams(data=..., num=1, lemma=True)``
    and the tokens it returns are joined with single spaces. In the recorded
    outputs of this notebook the result is lower-cased, lemmatised and free of
    stopwords; that work happens inside ``getngrams``, not here.

    Parameters
    ----------
    text_list : iterable of str
        Tweets to process.
    stem : bool, default False
        Currently unused; kept so existing calls keep working.
    processor : object with a ``getngrams`` method, optional
        Processor used for tokenising. When omitted, the notebook-level ``DP``
        is used, which matches the previous behaviour. Pass the processor
        explicitly (e.g. ``DP2``) to avoid depending on that global.

    Returns
    -------
    list of str
        One processed string per input string, in input order.
    """
    if processor is None:
        # Backward-compatible default: the processor created earlier in the notebook.
        processor = DP
    return [
        " ".join(str(tok) for tok in processor.getngrams(data=text, num=1, lemma=True))
        for text in text_list
    ]

In [33]:
# Preprocess the first month's tweets (index 0, annotated earlier as 2019-01).
text_list_processed = preprocess(DP.textdata()[0])

In [34]:
# Display the preprocessed strings for a visual check.
text_list_processed

['hey swissborg like article coindesk simple fair ecosystem powered technology',
 'global insight survey finding highnetworth individual worldwide expect future relationship managed primarily entirely online luxedigitalmag',
 'digital evolution wealth management emerging technology improve user experience cutting cost boosting revenue biuk',
 'rise roboadvisers uae lowcost platform targeting younger professional easier resident save thenationaluae',
 'never get second chance brand',
 'simple trick could secret warren buffetts success',
 'ironwood group player game go grace pyatt',
 'ceo david root featured prominently pittsburgh post gazette article morning inorganic growth acquisition firm read perspective',
 'aphria tsxapha stock mostwatched cannabis investment right',
 'k access allby law putnams reynolds kspecmag',
 'seems quovo compliment plaid fintech solution well bravo excellent acquisition growth win plaid',
 'hey swissborg like article coindesk simple fair ecosystem powered t

## Predict sentiments

In [2]:
import time
import pandas as pd
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
import pickle

# Restore the saved model artefacts from the working directory.
w2v_model = Word2Vec.load('model.w2v')
model = load_model('model.h5')

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
with open('encoder.pkl', 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)

# Passed as ``maxlen`` to pad_sequences when scoring text.
SEQUENCE_LENGTH = 300

Using TensorFlow backend.


In [56]:
# Label strings and cut-offs used by decode_sentiment. They are defined here
# because no other cell of this notebook defines them, so a fresh kernel would
# otherwise fail with NameError.
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# NOTE(review): cut-offs inferred from the recorded predictions
# (0.158679 -> NEGATIVE, 0.400478 and 0.674813 -> NEUTRAL, 0.780159 -> POSITIVE);
# confirm against the notebook that trained the model.
SENTIMENT_THRESHOLDS = (0.4, 0.7)


def decode_sentiment(score, include_neutral=True):
    """Map a model score to a sentiment label.

    Parameters
    ----------
    score : number
        Model output. ``predict`` passes the first row of ``model.predict``,
        presumably a one-element array; it is only compared against numbers.
    include_neutral : bool, default True
        When True, scores at or below ``SENTIMENT_THRESHOLDS[0]`` are NEGATIVE,
        scores at or above ``SENTIMENT_THRESHOLDS[1]`` are POSITIVE and
        everything in between is NEUTRAL. When False, the split is binary at 0.5.

    Returns
    -------
    str
        One of NEGATIVE, NEUTRAL or POSITIVE.
    """
    if not include_neutral:
        return NEGATIVE if score < 0.5 else POSITIVE
    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL
    
def predict(text_list_processed, include_neutral=True):
    """Score each text with the loaded model and return the results as a DataFrame.

    Parameters
    ----------
    text_list_processed : iterable of str
        Preprocessed texts, one model call per text.
    include_neutral : bool, default True
        Forwarded to ``decode_sentiment``.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``label``, ``score`` and ``elapsed_time``, with one
        row per input text and a 0..n-1 index. ``elapsed_time`` is the number
        of seconds since the start of this call, so it grows row by row.
    """
    start_at = time.time()
    columns = ['text', 'label', 'score', 'elapsed_time']
    # Collect plain dicts and build the frame once: DataFrame.append inside the
    # loop copied the whole frame per row and no longer exists in pandas >= 2.0.
    rows = []
    for text in text_list_processed:
        # Tokenize and pad to the fixed input length
        x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)
        # First (only) row of the batch prediction
        score = model.predict([x_test])[0]
        label = decode_sentiment(score, include_neutral=include_neutral)
        rows.append({
            'text': text,
            'label': label,
            'score': float(score),
            'elapsed_time': time.time() - start_at,
        })
    # Passing ``columns`` keeps the column order and yields an empty frame with
    # the same columns when there is nothing to score.
    return pd.DataFrame(rows, columns=columns)

In [59]:
# Score every preprocessed tweet; predict() returns a DataFrame with
# text, label, score and elapsed_time columns.
result = predict(text_list_processed)

In [60]:
# Display the prediction frame (long frames are truncated in the rendered output).
result # Sentiment Prediction of tweets in 2019-01

Unnamed: 0,text,label,score,elapsed_time
0,hey swissborg like article coindesk simple fai...,POSITIVE,0.780159,0.055755
1,global insight survey finding highnetworth ind...,POSITIVE,0.900442,0.074221
2,digital evolution wealth management emerging t...,NEUTRAL,0.520578,0.092578
3,rise roboadvisers uae lowcost platform targeti...,POSITIVE,0.872826,0.109462
4,never get second chance brand,NEUTRAL,0.400478,0.125467
...,...,...,...,...
11803,secure money advisor premier retirement planni...,NEUTRAL,0.674813,201.640029
11804,million woman could saving adequately retireme...,NEGATIVE,0.158679,201.657679
11805,million already investing w cbinsights mikequi...,NEUTRAL,0.569644,201.674834
11806,million already investing w cbinsights mikequi...,NEUTRAL,0.569644,201.691989


## Predict sentiment for tweets between 2017-01 and 2020-05

In [3]:
# Month window ('YYYY-MM') for the longer scoring run; datelist() treats both ends as inclusive.
start_month='2017-01'
end_month='2020-05'

In [7]:
# Second processor covering the full start_month .. end_month window.
# Same read / language / clean sequence as for the first processor; the
# steps themselves live in the DataProcessor module.
DP2 = Data_Processor(
    start_month,
    end_month,
    template="Data/Wealth_Management",
)
DP2.readdata()
DP2.specifylang()
DP2.clean()

In [4]:
def datelist(start_month, end_month):
    """List every month from ``start_month`` to ``end_month``, both inclusive.

    Parameters
    ----------
    start_month, end_month : str
        Months written as a four-digit year followed by the month number,
        e.g. ``'2019-01'``. A month without zero padding (``'2019-1'``) is
        accepted as well.

    Returns
    -------
    list of str
        ``'YYYY-MM'`` strings in chronological order. The list is empty when
        ``end_month`` lies before ``start_month``.

    Raises
    ------
    ValueError
        If the year or month part cannot be parsed as an integer.
    """
    def _split(year_month):
        # First four characters are the year; whatever follows the separator
        # is the month. Slicing the last two characters instead would turn
        # '2019-1' into month -1.
        text = str(year_month).strip()
        return int(text[:4]), int(text[4:].lstrip('-/'))

    first_year, first_month = _split(start_month)
    last_year, last_month = _split(end_month)

    # Number the months consecutively so the whole span is one range, however
    # many calendar years it crosses. A reversed span gives an empty range.
    first_index = first_year * 12 + (first_month - 1)
    last_index = last_year * 12 + (last_month - 1)
    return [
        "{year}-{month:02d}".format(year=index // 12, month=index % 12 + 1)
        for index in range(first_index, last_index + 1)
    ]

In [5]:
# All months of the scoring window as 'YYYY-MM' strings, in chronological order.
dl = datelist(start_month, end_month)

In [6]:
# One CSV path per month (e.g. 'result/2017-01.csv'), in the same order as dl.
filepath = ['result/' + month + '.csv' for month in dl]

In [109]:
# run 1 time
for i in range(len(dl)):
    text_list_processed2 = preprocess(DP2.textdata()[i])
    result2 = predict(text_list_processed2)
    result2.to_csv(filepath[i], index = True, header = True)

## Scores Distribution

In [7]:
import os
import pandas as pd
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import os
import re
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly as py
import plotly.graph_objs as go
import gensim
from gensim import corpora, models, similarities
import logging
import tempfile
import nltk
nltk.download('stopwords')
from string import punctuation
from collections import OrderedDict
import seaborn as sns
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import json
import random

init_notebook_mode(connected=True) #do not miss this line
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yunzehui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



In [8]:
# Combine the per-month prediction CSVs into one frame with a single concat.
# DataFrame.append in a loop re-copied the frame on every step and was removed
# in pandas 2.0. As before, each month keeps its own row index, so index
# values repeat across months.
monthly_frames = [pd.read_csv(path, index_col=0) for path in filepath]
if monthly_frames:
    tweets = pd.concat(monthly_frames)
else:
    # No months to load: keep the expected columns so later cells still work.
    tweets = pd.DataFrame(columns=['text', 'label', 'score', 'elapsed_time'])

In [9]:
# Count tweets per predicted label. The column names are set explicitly
# because the name reset_index() gives the former index depends on the pandas
# version ("index" before 2.0, the column name "label" from 2.0 on).
score_dist = (
    tweets["label"]
    .value_counts()
    .rename_axis("label")
    .reset_index(name="count")
    .sort_values(by="label", ascending=False)
)

layout = go.Layout(dict(title = "Scores distribution",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(title = "Predicted label",
                                     gridcolor = 'rgb(255, 255, 255)',
                                     gridwidth = 2),
                        yaxis = dict(title = "Number of tweets",
                                     gridcolor = 'rgb(255, 255, 255)',
                                     gridwidth = 2),
                        )
                  )

# One bar per label, ordered as in score_dist.
trace = go.Bar(x = score_dist["label"],
               y = score_dist["count"],
               marker = dict(line = dict(width = 1,color = "black"),
                             color = "red")
              )


fig = go.Figure(data = [trace],layout = layout)
py.offline.iplot(fig)

In [113]:
# Ten randomly chosen texts labelled POSITIVE (unseeded, so the sample differs per run).
random.sample(
    tweets.loc[tweets["label"] == 'POSITIVE', 'text'].tolist(),
    10,
)

['thrilled announce metropolitan magazine named andersen best investment company best financial advisor cambridge easton area',
 'lloyd banking group talk schroders set joint venture uk high street lender shareholder breathe sigh relief say cgathompson read view',
 'panel expert gathered event hosted codify seis regtech incubator explored various use case artificial intelligence could provide asset industry',
 'proud partner alzheimercanada ig walk alzheimers walk across canada walking raise vital fund living dementia well care',
 'see opportunity momentum let discus',
 'join msciowilson pm hear economic financial market outlook',
 'estate planning first step planning retirement get help expert price',
 'good luck shortlisted candidate brand management reputation award taking place tonight full list',
 'information visit website',
 'join growing team office southfield mi opening family wealth manager apply']

In [114]:
# Ten randomly chosen texts labelled NEUTRAL (unseeded, so the sample differs per run).
random.sample(
    tweets.loc[tweets["label"] == 'NEUTRAL', 'text'].tolist(),
    10,
)

['ftse stock yielding think could explode',
 'carroll financial seen kiplinger',
 'rule never lose money rule never forget rule warren buffett brainyquote',
 'whats thinkadvisor honourablehappy adaliabrand dadsonpowermode psychicdiandra gentrytrotter mattcameron justthemiguel',
 'swissborg decentralized platform nextgen crypto swissborg',
 'superrich',
 'private client paralegal private client paralegal market leading company seeking private client paralegal join team shrewsbury office private client paralegal looking different',
 'firm seeking way cut cost considering option read insight industry impact',
 'financial advisor spokane wa get stock broker job',
 'modera position union pacific corporation unp']

In [115]:
# Ten randomly chosen texts labelled NEGATIVE (unseeded, so the sample differs per run).
random.sample(
    tweets.loc[tweets["label"] == 'NEGATIVE', 'text'].tolist(),
    10,
)

['monday bitcoin rally',
 'morgan stanley profit beat estimate lower cost',
 'procter gamble co pg share sold altavista',
 'degree economicsfinance worked yearsi formulate several argument youre thinking vacuum forgetting human nature global competitiveness gross mismanagement tax gov',
 'venezuela national assembly say found million dollar alleges stolen nicolas maduroin fund owned russian originally kazakhstan',
 'judson brewer simple way break bad habit andrew rowan financial planning ballarat',
 'chinese population invested product online terrible result incredible loss control supposed repressed financial system',
 'disappointed investment return u sector rotation strategy logical invest',
 'ask explain evangelistic brexidiot relocating fund cofounded eurozone hm brexidiocy hasnt even left port jrms already vanished horizon lifeboat',
 'allow bank wealthmanagement fund invested stock']