In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import os
import alphalens as al
import nltk
from collections import defaultdict
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from collections import Counter
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Absolute path of the training corpus; each sub-folder is named after a ticker.
TrainingDataPath = os.path.abspath('./Data/train')
# Keep every directory entry except the macOS Finder metadata file.
TrainingTickerList = [entry for entry in os.listdir(TrainingDataPath)
                      if entry != '.DS_Store']

In [None]:
# Show the ticker folder names found under the training data directory.
TrainingTickerList

In [None]:
# Text-normalisation helpers shared by clean_text().
wnl = WordNetLemmatizer()
sno = SnowballStemmer('english')
# Tokens are maximal runs of word characters that are not digits.
word_tokenizer = RegexpTokenizer(r'[^\d\W]+')

# Start from NLTK's English stop words, then add corpus-specific noise words.
stop_words = set(stopwords.words('english'))
stop_words.update({
    # generic business vocabulary
    "may", "business", "company", "could", "service", "result", "product",
    "operation", "include", "law", "tax", "change", "financial", "require",
    "cost", "market", "also", "user", "plan", "actual", "cash", "other",
    "thereto", "thereof", "therefore",
    # website boilerplate (mostly words glued to "bloomberg")
    "bloomberg", "email", "photograph",
    "bloombergquint", "productsbloomberg", "loginbloomberg", "bloombergconnect",
    "customersbloomberg",
    "inclusioninnovationphilanthropysustainabilitybloomberg",
    "londonbloomberg",
    "distributionbloomberg", "lawbloomberg", "taxbloomberg",
    "governmentbloomberg", "environmentbloombergnef",
    "mediabloomberg", "marketsbloomberg", "technologybloomberg",
    "pursuitsbloomberg", "politicsbloomberg",
    "opinionbloomberg", "businessweekbloomberg", "conferencesbloomberg",
    "appsbloomberg",
    "radiobloomberg", "servicesbloomberg", "onbloomberg", "clipsbloomberg",
    "saysbloomberg", "bloombergolaf",
    "bloombergwhat", "loginsoftwar",
    # social-media link bars
    'followfacebooktwitterlinkedininstagram',
    'facebooktwitterinstagramlinkedin', 'twitterfacebook',
})

In [None]:
def clean_text(txt):
    """Tokenize an article body and reduce it to a list of cleaned word stems.

    Each token is lower-cased, lemmatized (as a noun, then as a verb) and
    finally stemmed. A token is dropped when it is not purely alphabetic,
    contains "bloomberg", is a stop word at any stage (raw word, lemma or
    stem), or when its lemma has two characters or fewer.

    Parameters
    ----------
    txt : str
        Text body of one article.

    Returns
    -------
    list of str
        Cleaned, stemmed tokens in their order of appearance.
    """
    cleaned = []
    # NOTE(review): the first token is always skipped, exactly as before;
    # presumably it is a header artifact of the article files -- confirm.
    for token in word_tokenizer.tokenize(txt)[1:]:
        # Lower-case BEFORE filtering so capitalised stop words are matched
        # against the (lower-case) stop list instead of slipping through.
        word = token.lower()
        if not word.isalpha() or "bloomberg" in word or word in stop_words:
            continue
        # Lemmatize once and reuse the result for both filtering and stemming.
        lemma = wnl.lemmatize(wnl.lemmatize(word,'n'),'v')
        if len(lemma) <= 2 or "bloomberg" in lemma or lemma in stop_words:
            continue
        stem = sno.stem(lemma)
        if stem not in stop_words:
            cleaned.append(stem)
    return cleaned

In [None]:
def bag_of_words(words):
    """Count how often each token occurs.

    words: cleaned tokens list.
    Returns a defaultdict(int) mapping token -> number of occurrences, keyed
    in order of first appearance.
    """
    counts = defaultdict(int)
    for token, freq in Counter(words).items():
        counts[token] = freq
    return counts

In [None]:
def struct_article_data(datapath,tickerlist):
    """Read every article file below ``datapath`` into a list of records.

    Parameters
    ----------
    datapath : str
        Directory that contains one sub-folder per ticker.
    tickerlist : iterable of str
        Names of the ticker sub-folders to read.

    Returns
    -------
    list of dict
        One dict per article with the keys
        'BOW' (bag of words of the cleaned text), 'Ticker',
        'ReleaseDate' ('YYYY-MM-DD' string taken from the file name) and
        'NextDayReturn' (float taken from the file name).
    """
    articles=[]
    for ticker in tickerlist:
        folder=os.path.join(datapath,ticker) # folder path
        for doc in os.listdir(folder): # eg: doc='20190115_+0.036786_0.txt'
            if doc == '.DS_Store':
                continue

            # Characters 9..17 of the file name hold the signed next-day return.
            # When that slice already ends with the '_' separator the field is
            # one character short; it is treated as a zero return.
            return_field = doc[9:18]
            if return_field[-1] == '_':
                NextDayReturn = 0.0
            else:
                # builtin float: the np.float alias was removed in NumPy 1.24
                NextDayReturn = float(return_field)

            # Keep the file open only for as long as it takes to read it.
            with open(os.path.join(folder,doc)) as f:
                text = f.read()

            articles.append({'BOW':bag_of_words(clean_text(text)),
                             'Ticker':ticker,
                             'ReleaseDate':doc[:4]+'-'+doc[4:6]+'-'+doc[6:8],
                             'NextDayReturn':NextDayReturn})

    return articles

In [None]:
# Parse every training article into a record (BOW, ticker, date, next-day return).
articles_training = struct_article_data(datapath=TrainingDataPath,
                                        tickerlist=TrainingTickerList)

In [None]:
# Inspect the first parsed training article.
articles_training[0]

In [None]:
# Inspect another parsed training article (index 20).
articles_training[20]

In [None]:
# Number of training articles that were loaded.
len(articles_training)

In [None]:
def sgn(v):
    """Map a return to a direction label.

    Returns 1 for a positive value, 0 for a negative value and 0.5 for zero.
    A value that is neither positive, negative nor zero (e.g. NaN) yields None.
    """
    if v > 0:
        return 1
    elif v < 0:
        return 0
    elif v == 0:
        return 0.5
    return None

In [None]:
def get_sentiment_charged_words(articles, alpha_pos, alpha_neg, kappa, stop_words):
    """Screen the vocabulary for words that co-occur with positive/negative returns.

    articles: list of article dicts with the keys 'BOW' and 'NextDayReturn'
    alpha_pos / alpha_neg: margin above 0.5 that the share of positive /
        negative articles containing a word has to exceed
    kappa: a word has to appear in more than kappa articles to be considered
    stop_words: words that are ignored altogether

    return: [pos_words, neg_words, sentiment_charged_words,
             occurance, pos_occr, neg_occr]
        where the last three map word -> number of (all / positive-return /
        negative-return) articles that contain the word.
    """
    occurance = defaultdict(int)
    pos_occr = defaultdict(int)
    neg_occr = defaultdict(int)

    for article in articles:
        ret = article['NextDayReturn']
        # Direction of the article: positive, negative, or neither
        # (a zero / undefined return counts for neither side).
        if ret > 0:
            pos_inc, neg_inc = 1, 0
        elif ret < 0:
            pos_inc, neg_inc = 0, 1
        else:
            pos_inc, neg_inc = 0, 0

        for token, freq in article['BOW'].items():
            # skip zero-count entries and stop words
            if not (freq > 0) or token in stop_words:
                continue
            occurance[token] += 1
            pos_occr[token] += pos_inc
            neg_occr[token] += neg_inc

    pos_words = []
    neg_words = []
    sentiment_charged_words = []
    pos_cutoff = 0.5 + alpha_pos
    neg_cutoff = 0.5 + alpha_neg
    for token, n_docs in occurance.items():
        if not (n_docs > kappa):
            continue
        if pos_occr[token] / n_docs > pos_cutoff:
            pos_words.append(token)
            sentiment_charged_words.append(token)
        if neg_occr[token] / n_docs > neg_cutoff:
            neg_words.append(token)
            sentiment_charged_words.append(token)

    return [pos_words, neg_words, sentiment_charged_words, occurance, pos_occr, neg_occr]

In [None]:
def get_document_matrix(articles, sentiment_charged_words):
    """Build the article x word matrix of relative sentiment-word frequencies.

    Parameters
    ----------
    articles : sequence of dict
        Article records; only the 'BOW' mapping (word -> count) is read.
    sentiment_charged_words : sequence of str
        Words that define the columns of the matrix.

    Returns
    -------
    numpy.ndarray of shape (len(articles), len(sentiment_charged_words))
        Row i holds the counts of the sentiment-charged words in article i
        divided by their sum; the row stays all zero when the article
        contains none of the words.
    """
    DocMatrix=np.zeros((len(articles),len(sentiment_charged_words)))
    for i, article in enumerate(articles):
        bow=article['BOW']
        # .get() instead of bow[word]: indexing a defaultdict would insert a
        # zero entry for every missing word and silently grow the caller's
        # bag of words; .get() also works for plain dicts.
        DocVector=np.array([bow.get(word,0) for word in sentiment_charged_words],dtype=float)
        total=DocVector.sum()
        if total != 0:
            DocVector=DocVector/total
        DocMatrix[i]=DocVector

    return DocMatrix

In [None]:
def get_training_sentiment_score(articles):
    """Turn the next-day returns of the articles into rank-based sentiment scores.

    Returns a (2, len(articles)) array: row 0 is the rank of each article's
    'NextDayReturn' divided by the number of articles, row 1 is one minus row 0.
    """
    n_articles = len(articles)
    next_day_returns = pd.Series([entry['NextDayReturn'] for entry in articles])

    scores = np.zeros((2, n_articles))
    scores[0] = next_day_returns.rank().values / n_articles
    scores[1] = 1 - scores[0]
    return scores

In [None]:
def get_estimated_words_distribution(DocMatrix, SSMatrix):
    """Estimate the per-word distributions by regressing word frequencies on sentiment scores.

    DocMatrix: articles x words matrix
    SSMatrix: SentimentScoreMatrix, 2 x articles
    return: words x 2 matrix; negative estimates are set to zero and every
            column is divided by its sum.
    """
    gram_pinv = np.linalg.pinv(SSMatrix.dot(SSMatrix.T))
    estimate = DocMatrix.T.dot(SSMatrix.T).dot(gram_pinv)
    # clamp negative estimates to zero (the product above is a fresh array)
    estimate[estimate < 0] = 0
    column_totals = estimate.sum(axis=0)
    return estimate / column_totals

In [None]:
def get_training_result(articles_training,alpha_pos,alpha_neg,kappa,stop_words):
    """Run the whole training pipeline on the training articles.

    Steps: screen sentiment-charged words, build the document matrix over
    those words, compute the rank-based training sentiment scores and
    estimate the word distributions.

    return: [pos_words, neg_words, sentiment_charged_words,
             occurance, pos_occr, neg_occr, WordsDistribution]
    """
    # 1) word screening
    screening = get_sentiment_charged_words(articles_training, alpha_pos, alpha_neg, kappa, stop_words)
    pos_words, neg_words, sentiment_charged_words, occurance, pos_occr, neg_occr = screening

    # 2) relative frequencies of the screened words per article
    doc_matrix = get_document_matrix(articles_training, sentiment_charged_words)

    # 3) sentiment scores derived from the realised next-day returns
    score_matrix = get_training_sentiment_score(articles_training)

    # 4) word distributions
    WordsDistribution = get_estimated_words_distribution(doc_matrix, score_matrix)

    return [pos_words, neg_words, sentiment_charged_words,
            occurance, pos_occr, neg_occr, WordsDistribution]

In [None]:
def get_sentiment_score_prediction(bow, SCWords, WordsDist, lamb, InitGuess):
    """Estimate the sentiment score of one article by penalized maximum likelihood.

    The score p in (0, 1) maximizes

        sum_j d_j * log(p * WordsDist[j, 0] + (1 - p) * WordsDist[j, 1])
            + lamb * log(p * (1 - p))

    where d_j is the relative frequency of the j-th sentiment-charged word in
    the article.

    Parameters
    ----------
    bow : mapping
        BoW of the new article (word -> count). It is not modified.
    SCWords : sequence of str
        Sentiment-charged words list.
    WordsDist : 2darray of shape (len(SCWords), 2)
        Trained words distribution (of sentiment-charged words).
    lamb : float
        Coefficient of the penalty term.
    InitGuess : float
        Initial guess of the sentiment score.

    Returns
    -------
    float
        The estimated sentiment score.
    """
    # Imported explicitly: `import scipy as sp` alone does not guarantee that
    # the `optimize` submodule is loaded on every SciPy version.
    from scipy.optimize import minimize

    # .get() keeps a defaultdict BoW from growing a zero entry per lookup.
    DocVector=np.array([ bow.get(word,0) for word in SCWords ],dtype=float)
    total=DocVector.sum()
    if total != 0:
        DocVector=DocVector/total

    # Only words that occur in the article contribute to the likelihood.
    # Dropping the others avoids 0*log(0) = NaN when an absent word has zero
    # probability under both distributions.
    WordsDist=np.asarray(WordsDist,dtype=float)
    present=DocVector > 0
    weights=DocVector[present]
    pos_dist=WordsDist[present,0]
    neg_dist=WordsDist[present,1]

    # log(p*(1-p)) and log(q) diverge at exactly 0 / 1, so stay a hair inside.
    eps=1e-8
    tiny=np.finfo(float).tiny

    def neg_penalized_likelihood(SentimentScore):
        p=float(np.clip(np.ravel(SentimentScore)[0],eps,1-eps))
        q=np.clip(p*pos_dist + (1-p)*neg_dist,tiny,None)
        val=np.sum(weights*np.log(q)) + lamb*np.log(p*(1-p))
        return -val

    result=minimize(neg_penalized_likelihood,InitGuess,method='SLSQP',bounds=[(eps,1-eps)])

    return result.x[0]

#### Now grid search for appropriate thresholds ( $\alpha_+$, $\alpha_-$, $\kappa$, $\lambda$ )
#### The analysis is based on single-factor analysis, mainly focusing on IC / IR ( ?? )

In [None]:
def get_sentiment_score_prediction_table(sentiment_charged_words,WordsDistribution,articles_validation,lamb,verbose=True):
    """Predict a sentiment score for every article and average it per ticker and release date.

    Parameters
    ----------
    sentiment_charged_words : list of str
        Sentiment-charged words, forwarded to get_sentiment_score_prediction.
    WordsDistribution : 2darray
        Trained word distribution, forwarded to get_sentiment_score_prediction.
    articles_validation : sequence of dict
        Articles to score; the keys 'BOW', 'Ticker' and 'ReleaseDate'
        ('YYYY-MM-DD' string) are read.
    lamb : float
        Penalty coefficient, forwarded to get_sentiment_score_prediction.
    verbose : bool, default True
        Print the progress messages (one line per article). Pass False to
        keep the notebook output short.

    Returns
    -------
    pandas.DataFrame
        Float table with one row per release date (DatetimeIndex, ascending)
        and one column per ticker. Each cell is the mean predicted score of
        the ticker's articles released on that date, NaN when there is none.
    """
    # Running totals, laid out as
    #   {'AAPL': {'2019-01-16': [sum of predicted scores, number of articles],
    #             '2019-01-17': [...]},
    #    'MSFT': {...}}
    ArticlePrediction={}

    if verbose:
        print('Prediction work start. Total workload will be: ',len(articles_validation))

    for TrackingNum, article in enumerate(articles_validation, start=1):
        # every article starts the optimisation from the initial guess 0.5
        PredictedScore=get_sentiment_score_prediction(article['BOW'],sentiment_charged_words,WordsDistribution,lamb,0.5)

        if verbose:
            print('Prediction for a new article done: ',article['Ticker'],', ',article['ReleaseDate'],', TrackingNum: ',TrackingNum)

        totals=ArticlePrediction.setdefault(article['Ticker'],{}).setdefault(article['ReleaseDate'],[0.0,0])
        totals[0] += PredictedScore
        totals[1] += 1

    if verbose:
        print('Prediction all complete')
        print('Start creating table')

    # Build the whole table in one go from {ticker: {date: mean score}}.
    # This yields a float64 frame directly, instead of filling an
    # object-dtype frame one cell at a time.
    dates=sorted({date_ for by_date in ArticlePrediction.values() for date_ in by_date})
    means={ticker: {date_: score_sum/count for date_,(score_sum,count) in by_date.items()}
           for ticker,by_date in ArticlePrediction.items()}
    table=pd.DataFrame(means,index=dates,dtype=float)

    if verbose:
        print('Create table complete')

    table.index=pd.to_datetime(table.index,format='%Y-%m-%d')
    table=table.sort_index(axis=0,ascending=True)

    return table


In [None]:
# Absolute path of the test corpus; each sub-folder is named after a ticker.
TestDataPath = os.path.abspath('./Data/test')
# Keep every directory entry except the macOS Finder metadata file.
TestTickerList = [entry for entry in os.listdir(TestDataPath)
                  if entry != '.DS_Store']

In [None]:
# Show the ticker folder names found under the test data directory.
TestTickerList

In [None]:
# Parse every test article into a record (BOW, ticker, date, next-day return).
articles_test = struct_article_data(datapath=TestDataPath,
                                    tickerlist=TestTickerList)

In [None]:
# Inspect the first parsed test article.
articles_test[0]

In [None]:
# Number of test articles that were loaded.
len(articles_test)

In [None]:
# Train on the training articles with alpha_pos=0.13, alpha_neg=0.2, kappa=20.
(pos_words, neg_words, sentiment_charged_words,
 occurance, pos_occr, neg_occr, WordsDistribution) = get_training_result(
    articles_training,
    alpha_pos=0.13,
    alpha_neg=0.2,
    kappa=20,
    stop_words=stop_words,
)

In [None]:
# Word cloud of the words screened as positive.
plt.figure(dpi=300, figsize=(18,12))
PosWordCloud = WordCloud(
    width=5000,
    height=3000,
    margin=2,
    background_color="white",
).generate(' '.join(pos_words))
plt.imshow(PosWordCloud)
plt.axis('off')

In [None]:
# Word cloud of the words screened as negative.
plt.figure(dpi=300, figsize=(18,12))
NegWordCloud = WordCloud(
    width=5000,
    height=3000,
    margin=2,
    background_color="white",
).generate(' '.join(neg_words))
plt.imshow(NegWordCloud)
plt.axis('off')

In [None]:
# Show the full list of sentiment-charged words selected during training.
sentiment_charged_words

In [None]:
# Number of sentiment-charged words (columns of the document matrix).
len(sentiment_charged_words)

In [None]:
# word -> number of training articles that contain the word.
occurance

In [None]:
# word -> number of positive-return training articles that contain the word.
pos_occr

In [None]:
# Words screened as positive.
pos_words

In [None]:
# Number of words screened as positive.
len(pos_words)

In [None]:
# Number of words screened as negative.
len(neg_words)

In [None]:
# Estimated word distribution: one row per sentiment-charged word, two columns.
WordsDistribution

In [None]:
# Sanity check: True if any entry of the estimated word distribution is NaN
# (the column normalisation divides by the column sum, which may be zero).
np.any(np.isnan(WordsDistribution))

In [None]:
# In-sample scores: date x ticker table of mean predicted sentiment (lamb = 0.1).
SentimentTableTrain = get_sentiment_score_prediction_table(
    sentiment_charged_words,
    WordsDistribution,
    articles_validation=articles_training,
    lamb=0.1,
)

In [None]:
# Out-of-sample scores: date x ticker table of mean predicted sentiment (lamb = 0.1).
SentimentTableTest = get_sentiment_score_prediction_table(
    sentiment_charged_words,
    WordsDistribution,
    articles_validation=articles_test,
    lamb=0.1,
)

In [None]:
# Show the in-sample sentiment table (rows: release dates, columns: tickers).
SentimentTableTrain

In [None]:
# Show the out-of-sample sentiment table (rows: release dates, columns: tickers).
SentimentTableTest

In [None]:
# Load the test-period price table; the first CSV column becomes the index
# and is parsed as 'YYYY-MM-DD' dates.
# NOTE(review): presumably one column per ticker -- confirm against the CSV.
price_test=pd.read_csv('./price_test.csv',index_col=0)
price_test.index=pd.to_datetime(price_test.index,format='%Y-%m-%d')

In [None]:
# Load the training-period price table; the first CSV column becomes the index
# and is parsed as 'YYYY-MM-DD' dates.
# NOTE(review): presumably one column per ticker -- confirm against the CSV.
price_train=pd.read_csv('./price_train.csv',index_col=0)
price_train.index=pd.to_datetime(price_train.index,format='%Y-%m-%d')

In [None]:
# Show the training-period price table.
price_train

In [None]:
# Show the test-period price table.
price_test

In [None]:
#return_validation=price_validation.pct_change().shift(-1).iloc[:-1,:]

In [None]:
#np.any(np.isnan(return_validation))

In [None]:
# Prepare the in-sample factor data for alphalens: the date x ticker score
# table is stacked into a (date, ticker)-indexed Series with missing scores
# dropped, then passed together with the training prices. Forward returns are
# requested for 1, 2 and 3 periods and the factor is split into 5 quantiles.
factor_train=al.utils.get_clean_factor_and_forward_returns(SentimentTableTrain.stack().dropna(),
                                                          price_train,
                                                          by_group=False,
                                                          quantiles=5,
                                                          periods=(1,2,3),
                                                          filter_zscore=20)

In [None]:
# Returns tear sheet for the in-sample factor (no grouping).
al.tears.create_returns_tear_sheet(factor_train, group_neutral=False, by_group=False)

In [None]:
# Information tear sheet for the in-sample factor (no grouping).
al.tears.create_information_tear_sheet(factor_train, by_group=False)

In [None]:
# Turnover tear sheet for the in-sample factor.
al.tears.create_turnover_tear_sheet(factor_train)

#### Test

In [None]:
# Prepare the out-of-sample factor data for alphalens: the date x ticker score
# table is stacked into a (date, ticker)-indexed Series with missing scores
# dropped, then passed together with the test prices. Forward returns are
# requested for 1, 2 and 3 periods and the factor is split into 5 quantiles.
factor_test=al.utils.get_clean_factor_and_forward_returns(SentimentTableTest.stack().dropna(),
                                                          price_test,
                                                          by_group=False,
                                                          quantiles=5,
                                                          periods=(1,2,3),
                                                          filter_zscore=20)

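#### Factor values are sorted in ascending order: small values fall into the low-numbered quantile groups and large values into the high-numbered ones. The lowest-numbered group is the "bottom" group and the highest-numbered group is the "top" group.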

In [None]:
# Returns tear sheet for the out-of-sample factor (no grouping).
al.tears.create_returns_tear_sheet(factor_test, group_neutral=False, by_group=False)

In [None]:
# Information tear sheet for the out-of-sample factor (no grouping).
al.tears.create_information_tear_sheet(factor_test, by_group=False)

In [None]:
# Turnover tear sheet for the out-of-sample factor.
al.tears.create_turnover_tear_sheet(factor_test)