In [1]:
import gensim
import os 
import pandas as pd
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
from data_handler.models import Article
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import sentiwordnet as swn
from sentiment.model import progress
import plotly.express as px
from sentiment.models import Category
import sys
from django.db.models import Avg, Count, Min, Sum
from datetime import timedelta, date
from statsmodels.tsa.api import VAR

tokenizer = RegexpTokenizer(r'\w+')

all_articles = Article.objects.all()
df = pd.read_csv("cleaned_docs.csv")

In [None]:
index = 0
length = len(all_articles)
notfound = []
for article in all_articles:
    index += 1
    progress(index, length, status="Article {}/{}".format(index, length))
    row = df.loc[df['headlines'] == article.headline]
    value = row['clean_docs'].tolist()
    if not value:
        notfound.append(article)
        continue
    article.tokens = value[0]
    article.save()
    
    

In [None]:
from data_handler.models import Category

negativ = [entry['word'] for entry in list(Category.objects.get(name="Negativ").words.all().values('word'))]
neg = [entry['word'] for entry in list(Category.objects.get(name="Ngtv").words.all().values('word'))]

positiv = [entry['word'] for entry in list(Category.objects.get(name="Positiv").words.all().values('word'))]
pos = [entry['word'] for entry in list(Category.objects.get(name="Pstv").words.all().values('word'))]

ngtv = set(negativ+neg)
pstv = set(positiv+pos)

In [None]:
from nltk.tokenize import word_tokenize
index = 0
length = len(all_articles)
cur_art = ""
for article in all_articles:
    index += 1
    cur_art = article
    progress(index, length, status="Article of {}/{}".format(index, length))
    tokens = word_tokenize(article.tokens)
    pos = 0
    neg = 0
    for token in tokens:
        token = token.upper()
        if token in ngtv:
            neg += 1
        elif token in pstv:
            pos += 1
    article.smarter_negative_words = neg
    article.smarter_positive_words = pos
    article.save()
    
    
    

In [None]:
from sentiment.model import preprocess

class ArticleHeadlineIterator(object):
    def __init__(self, articles):
        self.articles = articles
    def __iter__(self):
        for article in self.articles:
            yield article.headline

class ArticleContentIterator(object):
    def __init__(self, articles):
        self.articles = articles
    def __iter__(self):
        for article in self.articles:
            row = article.headline + ". " + article.contents
            yield row




In [None]:
cur_art.smarter_negative_words
cur_art.smarter_positive_words

In [None]:
print(cur_art.headline)
art = df.loc[df['headlines']==cur_art.headline]
# print(art)
# cur_art.tokens = art['clean_docs'].tolist()[0]
# cur_art.save()

In [None]:
contentit = ArticleContentIterator(notfound)

clean_articles = preprocess(contentit)

In [None]:
headlineit = ArticleHeadlineIterator(notfound)

In [None]:
for headline in headlineit:
    art = Article.objects.filter(headline=headline).first()
    try:
        art.delete()
    except:
        continue

In [None]:
row = df.loc[df['headlines']=='How London\'s loss is Dublin\'s gain after Brexit']

In [4]:
from datetime import datetime
import numpy as np

class DataFrameCreator(object):
    def __init__(self, stocks):
        self.stocks = stocks

    def __iter__(self):
        # get the stocks first and iterate through them
        for stock in self.stocks:
            # get sentiment of articles for that day
            date = stock.date
            arts = Article.objects.filter(date_written=date)
            if arts:
                total_length = arts.aggregate(total_length=Sum('length'))['total_length']
                smarter_sum = arts.aggregate(total_neg_words=Sum('smarter_negative_words'))['total_neg_words']
                negative_sentiment = smarter_sum/total_length
                yield {'date':date, 'sentiment':negative_sentiment, 'return': stock.log_return()}
            else:
                continue

def autocorrelate_data():
    stocks = StockPrice.objects.all()
    dfc = DataFrameCreator(stocks)
    df = pd.DataFrame.from_dict(dfc)
    df['date'] = pd.to_datetime(df['date']).dt.date
    df = df[(df.T != 0).all()]
    df  = df.set_index(['date'])
    print(df.head())
    arr = np.asarray(df)
    print(arr)
    # create the VAR model
    model = VAR(endog=arr)
    return model 
    
model = autocorrelate_data()
model_fit = model.fit(maxlags=10, ic='aic')
plot = model_fit.plot()

            sentiment    return
date                           
2016-01-04   0.095122 -0.008310
2016-01-04   0.095122 -1.048494
2016-01-04   0.095122 -0.006398
2016-01-05   0.126354 -0.001299
2016-01-05   0.126354 -0.003196
[[ 0.09512195 -0.0083096 ]
 [ 0.09512195 -1.04849419]
 [ 0.09512195 -0.00639797]
 ...
 [ 0.08818058 -0.25662586]
 [ 0.08818058 -0.01142466]
 [ 0.08818058 -0.00482362]]


ValueError: array must not contain infs or NaNs

In [None]:
plot = model_fit.plot_forecast(100)

In [3]:
type(plot)

model_fit = model.fit(maxlags=5).summary()
print(model_fit)

NameError: name 'plot' is not defined

In [2]:
from data_handler.preprocessing import produce_plots

arts = Article.objects.all()
df = produce_plots(arts)

0       2016-01-01   8.333333  Positive Sentiment
1       2016-01-01  12.037037  Negative Sentiment
2       2016-01-01   0.000000            GBPEUR=X
3       2016-01-01   0.000000            GBPUSD=X
4       2016-01-04  12.765957  Positive Sentiment
...            ...        ...                 ...
150294  2019-12-31  11.851852  Positive Sentiment
150295  2019-12-31   5.185185  Negative Sentiment
150296  2019-12-31  -0.256626                FTSE
150297  2019-12-31  -0.004824            GBPEUR=X
150298  2019-12-31  -0.011425            GBPUSD=X

[150299 rows x 3 columns]


In [None]:
df.head()

In [None]:
df.to_csv("sentiment.csv")

In [4]:
df2 = df.loc[df['line']=='Returns']

In [5]:
df2.head()

Unnamed: 0,asset,date,line,value
2,GBPEUR=X,2016-01-01,Returns,0.0
3,GBPUSD=X,2016-01-01,Returns,0.0
18,GBPUSD=X,2016-01-04,Returns,-0.006398
17,GBPEUR=X,2016-01-04,Returns,-0.006398
16,FTSE,2016-01-04,Returns,-1.048494
