In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Dense, Activation

# For Sentiment Analysis
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/khuzaima/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/khuzaima/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/khuzaima/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [79]:
df_news = pd.read_csv('eurusd_news.csv',parse_dates=True,skipinitialspace=True)
## Using the data on from 2011 and onward
# df_news = df_news[(df_news['Date'] > "2011-01-01")]
df_news.drop(columns='Unnamed: 0',inplace=True)
df_news['Date'] = pd.to_datetime(df_news['Date'], infer_datetime_format=True)
df_news.set_index('Date',inplace=True)
df_news.head()

Unnamed: 0_level_0,Title,Article
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-18,Forex - U.S. Dollar Rises as Consumer Optimism...,© Reuters.\nInvesting.com - The greenback pick...
2019-01-18,Forex - Upbeat U.S. Data Can't Help Dollar as ...,© Reuters.\nInvesting.com – The U.S. dollar li...
2019-01-17,Forex - U.S. Dollar Remains Steady as Jobless ...,© Reuters.\nInvesting.com - The greenback was ...
2019-01-17,Forex - Dollar Pushes Higher; Pound Slips as R...,© Reuters.\nInvesting.com - The U.S. dollar pu...
2019-01-17,World stocks rise; sterling up ahead of May vote,© Reuters. A trader works on the floor at the ...


In [80]:
df_news.groupby(['Date'])['Title'].head()

Date
2019-01-18    Forex - U.S. Dollar Rises as Consumer Optimism...
2019-01-18    Forex - Upbeat U.S. Data Can't Help Dollar as ...
2019-01-17    Forex - U.S. Dollar Remains Steady as Jobless ...
2019-01-17    Forex - Dollar Pushes Higher; Pound Slips as R...
2019-01-17     World stocks rise; sterling up ahead of May vote
                                    ...                        
2018-01-02    Forex - Dollar Remains at 3-Month Lows in Quie...
2018-01-02        Forex - Weaker Dollar Hits Fresh 3-Month Lows
2018-01-01    Forex - Aussie Gains In Asia After Caixin Manu...
2018-01-01    Forex - Dollar Steady In Early Asia Ahead OF C...
2018-01-01                Forex - Weekly Outlook: January 2 - 5
Name: Title, Length: 1077, dtype: object

In [81]:
df_news.groupby(['Date'])['Article'].head()

Date
2019-01-18    © Reuters.\nInvesting.com - The greenback pick...
2019-01-18    © Reuters.\nInvesting.com – The U.S. dollar li...
2019-01-17    © Reuters.\nInvesting.com - The greenback was ...
2019-01-17    © Reuters.\nInvesting.com - The U.S. dollar pu...
2019-01-17    © Reuters. A trader works on the floor at the ...
                                    ...                        
2018-01-02    Dollar still broadly lower as traders return t...
2018-01-02    © Reuters. Weaker dollar hits fresh 3-month lo...
2018-01-01    Aussie gains in Asia\nInvesting.com - The Auss...
2018-01-01    Dollar steady in Asia\nInvesting.com - The dol...
2018-01-01    © Reuters. Dollar falls to 3-month lows on Fri...
Name: Article, Length: 1077, dtype: object

In [82]:
df_news['Title'] = df_news.groupby(['Date'])['Title'].transform(lambda x : ' '.join(x))
df_news['Article'] = df_news.groupby(['Date'])['Article'].transform(lambda x : ' '.join(x)) 
df_news = df_news.drop_duplicates() 
df_news

Unnamed: 0_level_0,Title,Article
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-18,Forex - U.S. Dollar Rises as Consumer Optimism...,© Reuters.\nInvesting.com - The greenback pick...
2019-01-17,Forex - U.S. Dollar Remains Steady as Jobless ...,© Reuters.\nInvesting.com - The greenback was ...
2019-01-16,"Forex - Sterling Rebounds on ""Diminished"" Brex...",© Reuters.\nInvesting.com - The pound steadied...
2019-01-15,Forex - U.S. Dollar Rises After Weak German Da...,© Reuters.\nInvesting.com - The greenback rose...
2019-01-14,Forex - U.S. Dollar Flat as Yen Rebounds Forex...,© Reuters.\nInvesting.com - The greenback was ...
...,...,...
2018-01-05,"Forex- Dollar Rises Despite Fall in Jobs, Serv...",The U.S. dollar rallied on Friday.\nInvesting....
2018-01-04,Forex - Upbeat Economic Data Fails To Rescue D...,© Reuters.\nInvesting.com – The dollar continu...
2018-01-03,Dollar Set to Snap 10-Day Losing Streak After ...,© Reuters.\nInvesting.com – The dollar rebound...
2018-01-02,"Forex - Dollar Weakness Continues Into 2018, F...",Investing.com – The dollar fell to more than t...


In [83]:
# Tokenize and stem text
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []

    stop_words=set(stopwords.words('english'))
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token) and token not in stop_words:
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    stems = ' '.join(stems)
    return stems

# Only tokenize the text
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    top_words=set(stopwords.words('english'))
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    filtered_tokens = ' '.join(filtered_tokens)
    return filtered_tokens


In [84]:
df_news.iloc[0]['Title']

"Forex - U.S. Dollar Rises as Consumer Optimism Falls Forex - Upbeat U.S. Data Can't Help Dollar as Sterling Reigns Supreme"

In [85]:
print(tokenize_and_stem(df_news.iloc[0]['Title']))

forex u.s. dollar rise consum optim fall forex upbeat u.s. data ca n't help dollar sterl reign suprem


In [86]:
print(tokenize_only(df_news.iloc[0]['Title']))

forex u.s. dollar rises as consumer optimism falls forex upbeat u.s. data ca n't help dollar as sterling reigns supreme


In [87]:
df_news["Title"] = [tokenize_and_stem(i) for i in df_news['Title']]
df_news["Article"] = [tokenize_and_stem(i) for i in df_news['Article']]
# df_news["Title"] = [tokenize_only(i) for i in df_news['Title']]
# df_news["Article"] = [tokenize_only(i) for i in df_news['Article']]
df_news

Unnamed: 0_level_0,Title,Article
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-18,forex u.s. dollar rise consum optim fall forex...,reuter investing.com the greenback pick steam ...
2019-01-17,forex u.s. dollar remain steadi jobless claim ...,reuter investing.com the greenback steadi thur...
2019-01-16,forex sterl rebound diminish brexit risk dolla...,reuter investing.com the pound steadi wednesda...
2019-01-15,forex u.s. dollar rise after weak german data ...,reuter investing.com the greenback rose tuesda...
2019-01-14,forex u.s. dollar flat yen rebound forex yen g...,reuter investing.com the greenback flat monday...
...,...,...
2018-01-05,forex- dollar rise despit fall job servic sect...,the u.s. dollar ralli friday investing.com the...
2018-01-04,forex upbeat econom data fail to rescu dollar ...,reuter investing.com the dollar continu langui...
2018-01-03,dollar set snap 10-day lose streak after stron...,reuter investing.com the dollar rebound near f...
2018-01-02,forex dollar weak continu into fall to more th...,investing.com the dollar fell three-month low ...


In [95]:
counter = 0
for i in df_news['Title']:
    if "miss" in i:
        # print(i)
        counter = counter+1
print(counter)

6


In [91]:
def getSubjectivity(text):
  return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
  return  TextBlob(text).sentiment.polarity

In [92]:
#Adding subjectivity and polarity columns
df_news['Subjectivity_title'] = df_news['Title'].apply(getSubjectivity)
df_news['Polarity_title'] = df_news['Title'].apply(getPolarity)
df_news['Subjectivity_article'] = df_news['Article'].apply(getSubjectivity)
df_news['Polarity_article'] = df_news['Article'].apply(getPolarity)
df_news

Unnamed: 0_level_0,Title,Article,Subjectivity_title,Polarity_title,Subjectivity_article,Polarity_article
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-18,forex u.s. dollar rise consum optim fall forex...,reuter investing.com the greenback pick steam ...,0.000000,0.000000,0.516381,0.122429
2019-01-17,forex u.s. dollar remain steadi jobless claim ...,reuter investing.com the greenback steadi thur...,0.400000,0.125000,0.355764,0.058835
2019-01-16,forex sterl rebound diminish brexit risk dolla...,reuter investing.com the pound steadi wednesda...,0.312500,0.112500,0.380771,0.014656
2019-01-15,forex u.s. dollar rise after weak german data ...,reuter investing.com the greenback rose tuesda...,0.506250,-0.143750,0.331946,0.095426
2019-01-14,forex u.s. dollar flat yen rebound forex yen g...,reuter investing.com the greenback flat monday...,0.125000,-0.025000,0.332223,0.038962
...,...,...,...,...,...,...
2018-01-05,forex- dollar rise despit fall job servic sect...,the u.s. dollar ralli friday investing.com the...,0.500000,0.250000,0.376388,0.102737
2018-01-04,forex upbeat econom data fail to rescu dollar ...,reuter investing.com the dollar continu langui...,0.388056,-0.056508,0.417297,0.072682
2018-01-03,dollar set snap 10-day lose streak after stron...,reuter investing.com the dollar rebound near f...,0.472619,0.102976,0.328112,0.068333
2018-01-02,forex dollar weak continu into fall to more th...,investing.com the dollar fell three-month low ...,0.412315,0.071944,0.439044,0.137370


In [96]:
sia = SentimentIntensityAnalyzer()

# New words and values
new_words = {
    'crush': 10,
    'beat': 5,
    'rise':100,
    'miss': -5,
    'troubl': -10,
    'fall': -100,
}

sia.lexicon.update(new_words)

df_news['Compound_title'] = [sia.polarity_scores(v)['compound'] for v in df_news['Title']]
df_news['Negative_title'] = [sia.polarity_scores(v)['neg'] for v in df_news['Title']]
df_news['Neutral_title'] = [sia.polarity_scores(v)['neu'] for v in df_news['Title']]
df_news['Positive_title'] = [sia.polarity_scores(v)['pos'] for v in df_news['Title']]

df_news['Compound_article'] = [sia.polarity_scores(v)['compound'] for v in df_news['Article']]
df_news['Negative_article'] = [sia.polarity_scores(v)['neg'] for v in df_news['Article']]
df_news['Neutral_article'] = [sia.polarity_scores(v)['neu'] for v in df_news['Article']]
df_news['Positive_article'] = [sia.polarity_scores(v)['pos'] for v in df_news['Article']]


df_news

Unnamed: 0_level_0,Title,Article,Subjectivity_title,Polarity_title,Subjectivity_article,Polarity_article,Compound_title,Negative_title,Neutral_title,Positive_title,Compound_article,Negative_article,Neutral_article,Positive_article
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-01-18,forex u.s. dollar rise consum optim fall forex...,reuter investing.com the greenback pick steam ...,0.000000,0.000000,0.516381,0.122429,-0.3089,0.471,0.068,0.461,0.9995,0.251,0.358,0.391
2019-01-17,forex u.s. dollar remain steadi jobless claim ...,reuter investing.com the greenback steadi thur...,0.400000,0.125000,0.355764,0.058835,0.9992,0.033,0.173,0.794,0.5584,0.172,0.658,0.170
2019-01-16,forex sterl rebound diminish brexit risk dolla...,reuter investing.com the pound steadi wednesda...,0.312500,0.112500,0.380771,0.014656,-0.8020,0.286,0.714,0.000,-0.9998,0.378,0.369,0.253
2019-01-15,forex u.s. dollar rise after weak german data ...,reuter investing.com the greenback rose tuesda...,0.506250,-0.143750,0.331946,0.095426,0.9998,0.012,0.140,0.848,-0.9999,0.321,0.530,0.149
2019-01-14,forex u.s. dollar flat yen rebound forex yen g...,reuter investing.com the greenback flat monday...,0.125000,-0.025000,0.332223,0.038962,0.3182,0.098,0.744,0.158,-0.9998,0.222,0.692,0.086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-01-05,forex- dollar rise despit fall job servic sect...,the u.s. dollar ralli friday investing.com the...,0.500000,0.250000,0.376388,0.102737,-0.3612,0.422,0.158,0.420,1.0000,0.091,0.453,0.456
2018-01-04,forex upbeat econom data fail to rescu dollar ...,reuter investing.com the dollar continu langui...,0.388056,-0.056508,0.417297,0.072682,0.9992,0.053,0.264,0.683,0.9999,0.177,0.424,0.399
2018-01-03,dollar set snap 10-day lose streak after stron...,reuter investing.com the dollar rebound near f...,0.472619,0.102976,0.328112,0.068333,0.4404,0.114,0.741,0.145,1.0000,0.050,0.617,0.333
2018-01-02,forex dollar weak continu into fall to more th...,investing.com the dollar fell three-month low ...,0.412315,0.071944,0.439044,0.137370,-0.9994,0.778,0.207,0.015,-0.9990,0.342,0.357,0.300


In [97]:
df_forex=pd.read_csv('transformed_data.csv', parse_dates=True, skipinitialspace=True)
df_forex['Date'] = pd.to_datetime(df_forex['date'], infer_datetime_format=True)
df_forex['close_shifted']=df_forex['close'].shift(-1)
df_forex.set_index('Date',inplace=True)
df_forex.head()

Unnamed: 0_level_0,date,open,high,low,high_open,open_low,total_pips,return_1,return_5,RSI,UpperBB,LowerBB,MACD,Signal,EVM,ROC,ForceIndex,close,close_shifted
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011-03-02,2011.03.02,1.3776,1.389,1.3743,0.0114,0.0033,0.0147,0.006461,0.00851,71.586716,1.387593,1.344617,0.004981,0.002451,0.028358,0.00851,1008.774,1.3865,1.3969
2011-03-03,2011.03.03,1.3865,1.3973,1.3833,0.0108,0.0032,0.014,0.007501,0.01232,76.234568,1.393238,1.342332,0.006497,0.00326,0.045849,0.01232,1357.977,1.3969,1.3986
2011-03-04,2011.03.04,1.3968,1.4006,1.3941,0.0038,0.0027,0.0065,0.001217,0.017016,89.795918,1.39828,1.34135,0.007746,0.004157,0.052665,0.017016,1646.8686,1.3986,1.3968
2011-03-07,2011.03.07,1.3994,1.4035,1.3955,0.0041,0.0039,0.008,-0.001287,0.011734,81.395349,1.402068,1.341422,0.008493,0.005024,0.056328,0.011734,1197.5688,1.3968,1.3904
2011-03-08,2011.03.08,1.3968,1.3988,1.3862,0.002,0.0106,0.0126,-0.004582,0.009292,71.917808,1.404229,1.342051,0.008471,0.005714,0.026534,0.009292,926.8736,1.3904,1.3908


In [98]:
df_merge = pd.merge(df_forex, df_news, how='inner',on='Date')
df_merge.drop(['date','Title', 'Article'],inplace=True,axis=1)
df_merge.head()



Unnamed: 0_level_0,open,high,low,high_open,open_low,total_pips,return_1,return_5,RSI,UpperBB,...,Subjectivity_article,Polarity_article,Compound_title,Negative_title,Neutral_title,Positive_title,Compound_article,Negative_article,Neutral_article,Positive_article
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02,1.20276,1.20795,1.20232,0.00519,0.00044,0.00563,0.004733,0.016618,98.190709,1.200916,...,0.439044,0.13737,-0.9994,0.778,0.207,0.015,-0.999,0.342,0.357,0.3
2018-01-03,1.20572,1.20654,1.20019,0.00082,0.00553,0.00635,-0.003691,0.013182,81.858948,1.203329,...,0.328112,0.068333,0.4404,0.114,0.741,0.145,1.0,0.05,0.617,0.333
2018-01-04,1.20133,1.20879,1.20037,0.00746,0.00096,0.00842,0.004478,0.015262,83.542899,1.206893,...,0.417297,0.072682,0.9992,0.053,0.264,0.683,0.9999,0.177,0.424,0.399
2018-01-05,1.20659,1.20819,1.20198,0.0016,0.00461,0.00621,-0.00319,0.007243,67.128713,1.209079,...,0.376388,0.102737,-0.3612,0.422,0.158,0.42,1.0,0.091,0.453,0.456
2018-01-08,1.20266,1.2051,1.19556,0.00244,0.0071,0.00954,-0.005204,-0.002916,43.169399,1.209862,...,0.435464,0.1563,0.9998,0.008,0.155,0.837,1.0,0.102,0.518,0.38


# END OF PRE PROCESSING

In [70]:
df_merge.columns

Index(['open', 'high', 'low', 'high_open', 'open_low', 'total_pips',
       'return_1', 'return_5', 'RSI', 'UpperBB', 'LowerBB', 'MACD', 'Signal',
       'EVM', 'ROC', 'ForceIndex', 'close', 'close_shifted',
       'Subjectivity_title', 'Polarity_title', 'Subjectivity_article',
       'Polarity_article', 'Compound_title', 'Negative_title', 'Neutral_title',
       'Positive_title', 'Compound_article', 'Negative_article',
       'Neutral_article', 'Positive_article'],
      dtype='object')

In [71]:
df_forex.columns

Index(['date', 'open', 'high', 'low', 'high_open', 'open_low', 'total_pips',
       'return_1', 'return_5', 'RSI', 'UpperBB', 'LowerBB', 'MACD', 'Signal',
       'EVM', 'ROC', 'ForceIndex', 'close', 'close_shifted'],
      dtype='object')

In [72]:
df_news.columns

Index(['Title', 'Article', 'Subjectivity_title', 'Polarity_title',
       'Subjectivity_article', 'Polarity_article', 'Compound_title',
       'Negative_title', 'Neutral_title', 'Positive_title', 'Compound_article',
       'Negative_article', 'Neutral_article', 'Positive_article'],
      dtype='object')

In [73]:
len(df_merge.columns)

30

In [99]:
df_merge.to_csv('merged_data_extra.csv')