# User Input

In [None]:
# Root folder under which the generated datasets are written.
# save_dataset() creates a '<dataset_type> datasets/' sub-folder inside it.
drive_loc = '/content/drive/MyDrive/datasets'


# ----- NASDAQ -----

# Path of the raw nasdaq CSV (read with pd.read_csv; must contain a 'Headline' column)
nasdaq_loc = '/content/drive/MyDrive/datasets/nasdaq.csv'


# ----- FINANCIAL PHRASE BANK -----

# Path of the Financial Phrase Bank CSV (read as latin-1, header-less: sentiment, text)
financial_phrase_bank_loc = '/content/drive/MyDrive/datasets/all-data.csv'


# ----- SENTFIN -----

# Path of the SEntFiN CSV
sentfin_loc = '/content/drive/MyDrive/datasets/SEntFiN-v1.1.csv'


# ----- FINANCIAL PHRASE BANK + SEM-EVAL 2017 TASK 5 -----

# Path of the train split (tab-separated, header-less: ind, sentiment, letter, text)
train_df_loc = '/content/drive/MyDrive/datasets/train.tsv'

# Path of the dev split (same tab-separated layout as the train split)
dev_df_loc = '/content/drive/MyDrive/datasets/dev.tsv'


# ----- FIQA -----

# Path of the FiQA task 1 headline train JSON
fiqa_headline_train_loc = '/content/drive/MyDrive/datasets/task1_headline_ABSA_train.json'

# Path of the FiQA task 1 post train JSON
fiqa_post_train_loc = '/content/drive/MyDrive/datasets/task1_post_ABSA_train.json'


# ----- FPB - FIQA -----

# Path of the combined fpb_fiqa CSV (its two columns are renamed to text, sentiment after loading)
fpb_fiqa_loc = '/content/drive/MyDrive/datasets/data.csv'


# ----- Loughran McDonald Dictionary -----

# Path of the Loughran-McDonald master dictionary CSV
loughran_mcdonald_loc = '/content/drive/MyDrive/datasets/Loughran-McDonald_MasterDictionary_1993-2021.csv'

# Setup

In [None]:
pip install tweet-preprocessor

In [None]:
import os
from os import listdir
from os.path import isfile, join

def save_dataset(dataset_name, dataset, dataset_type, location):
  """Write `dataset` as CSV to '<location>/<dataset_type> datasets/<dataset_name>'.

  Args:
    dataset_name: file name of the CSV to create (e.g. 'nasdaq.csv').
    dataset: object exposing `to_csv(path, index=False)` (a pandas DataFrame here).
    dataset_type: prefix of the sub-folder, e.g. 'evaluation' -> 'evaluation datasets'.
    location: base folder, with or without a trailing '/'.

  The target folder is created when it does not exist yet. Nothing is returned.
  """
  # os.path.join copes with a trailing separator and with an empty `location`,
  # whereas indexing location[-1] raised IndexError on ''
  target_dir = os.path.join(location, f'{dataset_type} datasets')

  # exist_ok=True replaces the racy "exists() then makedirs()" sequence
  os.makedirs(target_dir, exist_ok=True)

  dataset.to_csv(os.path.join(target_dir, dataset_name), index=False)

In [None]:
def check_set_duplicates(df1, df1_column, df2, df2_column):
  """Return the set of lower-cased strings present in both given columns."""
  left_lowered = set(df1[df1_column].str.lower())
  right_lowered = set(df2[df2_column].str.lower())
  return left_lowered & right_lowered

def df_no_duplicates(df, df_column, sentences):
  """Return the rows of `df` whose lower-cased `df_column` value is not in `sentences`."""
  lowered = df[df_column].str.lower()
  already_known = lowered.isin(sentences)
  return df[~already_known]

In [None]:
def check_if_equals(dataset_sentences, copy_dataset_sentences):
  """Report and collect the sentences of the first collection missing from the second.

  Every item is converted with str() before the membership test; each missing
  one is printed and the list of all missing strings is returned.
  """
  missing = []
  for raw in dataset_sentences:
    text = str(raw)
    if text in copy_dataset_sentences:
      continue
    print(f'Missing sentence: {text}')
    missing.append(text)

  return missing

In [None]:
def convert_to_num(data, column):
  """Encode the textual labels of `data[column]` as numbers.

  'positive' becomes 1 and 'negative' becomes 0; any other label raises
  KeyError. The encoded values are returned via the Series' `.values`.
  """
  label_to_id = {'negative': 0, 'positive': 1}

  def encode(label):
    return label_to_id[label]

  encoded = data[column].apply(encode)
  return encoded.values

In [None]:
import pandas as pd

# both splits are tab-separated and read without a header row,
# so the four columns are named explicitly
train_df = pd.read_csv(
    train_df_loc,
    header=None,
    sep='\t',
    names=['ind', 'sentiment', 'letter', 'text'],
)
dev_df = pd.read_csv(
    dev_df_loc,
    header=None,
    sep='\t',
    names=['ind', 'sentiment', 'letter', 'text'],
)

# Evaluation Datasets

## FPB Fiqa

In [None]:
# Sentences removed from the FPB-FiQA evaluation set by exact string match
# (used in the `isin` filter of the next cell). The entries are grouped in
# pairs of near-identical variants that differ only in quoting, number
# formatting, punctuation or a leading "UPDATE 1-" prefix.
train_df_fpb_fiqa = [
    "FastJet slams EasyJet founder Stelios for going public, is 'taking legal advice' over letter about contractual ...",
    'FastJet slams EasyJet founder Stelios for going public, is "taking legal advice" over letter about contractual ...',

    "In the third quarter , net sales increased by 12 % year-on-year to EUR159 .5 m , or by 6 % at comparable currency rates growth .",
    "In the third quarter , net sales increased by 12 % year-on-year to EUR 159.5 million , or by 6 % at comparable currency rates growth .",

    "Tesco, Asda sales fall as march of the discounters continues-Kantar",
    "Tesco, Asda sales fall as march of the discounters continues: Kantar",

    "EU regulator backs approval for GSK injectable asthma drug",
    "UPDATE 1-EU regulator backs approval for GSK injectable asthma drug"
]

In [None]:
# load the combined FPB + FiQA file and give its two columns the names used below
fpb_fiqa = pd.read_csv(fpb_fiqa_loc)
fpb_fiqa.columns = ['text', 'sentiment']

# sentences that (case-insensitively) also occur in the training set
mutual_sentences = check_set_duplicates(fpb_fiqa, 'text', train_df, 'text')
# a bare `len(...)` in the middle of a cell is discarded, so print the count explicitly
print(len(mutual_sentences))

# drop the shared sentences, then keep one row per distinct text;
# the non-inplace call avoids mutating a slice of `fpb_fiqa` (SettingWithCopyWarning)
dataset_nt = df_no_duplicates(fpb_fiqa, 'text', mutual_sentences).drop_duplicates(subset='text')

# binary task: neutral rows are removed
dataset_nt_nn = dataset_nt[dataset_nt['sentiment'] != 'neutral']

# remove the hand-collected near-duplicates; .copy() makes the frame independent
# before the label column is overwritten
dataset_nt_nn = dataset_nt_nn.loc[~dataset_nt_nn['text'].isin(train_df_fpb_fiqa)].copy()

# 'positive' -> 1, 'negative' -> 0
dataset_nt_nn['sentiment'] = convert_to_num(dataset_nt_nn, 'sentiment')

save_dataset('fpb_fiqa.csv', dataset_nt_nn, 'evaluation', drive_loc)

## Fiqa labeled df

In [None]:
import json
import pandas as pd


def get_label(score):
  """Map a sentiment score to a binary label: 0 when it is below zero, otherwise 1."""
  if score < 0:
    return 0
  return 1


def extract_sentences(dataset_loc, verbose=True):
  """Read a FiQA-style JSON file and split its rows by number of annotations.

  Args:
    dataset_loc: path of a JSON file holding an object whose values are rows
      with a 'sentence' string and an 'info' list; every 'info' item carries a
      'sentiment_score'.
    verbose: when True (the default, matching the previous behaviour) every
      row is printed while it is processed; pass False to silence the dump.

  Returns:
    (labeled_sentences, multiple_labels_sentences) where
    labeled_sentences is a list of [sentence, sentiment_score, label] for rows
    with a single 'info' item, and multiple_labels_sentences is the list of
    untouched rows that have more than one 'info' item.
  """
  # the context manager closes the handle (it used to be leaked); JSON
  # interchange text is UTF-8, so do not depend on the platform default
  with open(dataset_loc, encoding='utf-8') as f:
    data = json.load(f)

  labeled_sentences = []
  multiple_labels_sentences = []

  for row in data.values():
    if verbose:
      print(row)
    # rows annotated more than once are labeled later by a separate step
    if len(row['info']) > 1:
      multiple_labels_sentences.append(row)
      continue
    sentence = row['sentence']
    sentiment_score = float(row['info'][0]['sentiment_score'])
    label = get_label(sentiment_score)
    labeled_sentences.append([sentence, sentiment_score, label])

  return labeled_sentences, multiple_labels_sentences

def classify_multiple_label_sentences(multiple_labels_sentences):
  """Label rows that carry several sentiment annotations.

  For each row the scores of all 'info' items are added up and each score is
  turned into a label with get_label(). Rows whose items disagree on the label
  are returned untouched in the second list; the others yield
  [sentence, summed score, shared label] in the first list.
  """
  agreed, conflicting = [], []

  for entry in multiple_labels_sentences:
    total_score = 0
    seen_labels = set()
    for annotation in entry['info']:
      score = float(annotation['sentiment_score'])
      total_score += score
      seen_labels.add(get_label(score))

    if len(seen_labels) > 1:
      # annotations point in different directions
      conflicting.append(entry)
      continue

    agreed.append([entry['sentence'], total_score, list(seen_labels)[0]])

  return agreed, conflicting

In [None]:
# Sentences removed from the labeled FiQA evaluation set by exact string match
# (used in the `isin` filter further down). The entries are grouped in pairs of
# near-identical variants that differ only in quoting, punctuation or a
# leading "UPDATE 1-" prefix.
train_df_fiqa = [
    'FastJet slams EasyJet founder Stelios for going public, is "taking legal advice" over letter about contractual ...',
    "FastJet slams EasyJet founder Stelios for going public, is 'taking legal advice' over letter about contractual ...",

    'Tesco, Asda sales fall as march of the discounters continues: Kantar',
    'Tesco, Asda sales fall as march of the discounters continues-Kantar',

    "UPDATE 1-EU regulator backs approval for GSK injectable asthma drug",
    "EU regulator backs approval for GSK injectable asthma drug"
]

In [None]:
# each result is a pair: (single-annotation rows already labeled, rows with several annotations)
headlines_res = extract_sentences(fiqa_headline_train_loc)
posts_res = extract_sentences(fiqa_post_train_loc)

# merge headlines and posts, keeping the two kinds of rows apart
labeled_headlines_posts = [*headlines_res[0], *posts_res[0]]
multiple_label_headlines_posts = [*headlines_res[1], *posts_res[1]]

# blank line, then how many rows fell into each group
print()
print(len(labeled_headlines_posts), len(multiple_label_headlines_posts))

In [None]:
# label the rows that carry several annotations; conflicting ones end up in mls_neutral
mls_labeled, mls_neutral = classify_multiple_label_sentences(multiple_label_headlines_posts)

# build the frame directly with its final column names
# (it used to be created with one set of names and renamed right away)
fiqa_labeled = labeled_headlines_posts + mls_labeled
fiqa_labeled_df = pd.DataFrame(fiqa_labeled, columns=['text', 'sentiment_score', 'sentiment'])

# sentences that (case-insensitively) also occur in the training set
mutual_sentences = check_set_duplicates(fiqa_labeled_df, 'text', train_df, 'text')
print(len(mutual_sentences))

# drop the shared sentences, then keep one row per distinct text;
# the non-inplace call avoids mutating a slice of `fiqa_labeled_df` (SettingWithCopyWarning)
fiqa_labeled_df_nt = df_no_duplicates(fiqa_labeled_df, 'text', mutual_sentences).drop_duplicates(subset='text')

# remove the hand-collected near-duplicates of training sentences
fiqa_labeled_df_nt = fiqa_labeled_df_nt.loc[~fiqa_labeled_df_nt['text'].isin(train_df_fiqa)]

save_dataset('fiqa_labeled_df.csv', fiqa_labeled_df_nt, 'evaluation', drive_loc)

## Financial Phrase Bank

In [None]:
# Sentences removed from the Financial Phrase Bank evaluation set by exact
# string match (concatenated with `fpb_df_train_df_duplicates` in the `isin`
# filter below). Entries come in pairs: the same sentence under two different
# renderings of its mis-decoded non-ASCII characters.
#
# NOTE: the two "Certified Integration for SAP" entries contain a literal ''
# (two apostrophes) and therefore have to be double-quoted. Inside a
# single-quoted literal Python reads '' as "end of one literal, start of the
# next", concatenates the halves and silently drops the apostrophes, so the
# entry could never match the dataset text.
train_df_fpb_df_duplicates = [
    'The original name Componenta +àm+Ñl , as a subsidiary of the Finnish Componenta Group , has been changed to +àm+Ñl Components and the company has seen a 63 % growth in Q1 2010 , in comparison to Q1 2009 .',
    'The original name Componenta +_m+_l , as a subsidiary of the Finnish Componenta Group , has been changed to +_m+_l Components and the company has seen a 63 % growth in Q1 2010 , in comparison to Q1 2009 .',

    'YIT Construction and the town of Riihim+ñki have signed a lease contract whereby the town will occupy the Travel Centre office facilities .',
    'YIT Construction and the town of Riihim+Æki have signed a lease contract whereby the town will occupy the Travel Centre office facilities .',

    'NORDIC BUSINESS REPORT-26 June 2006-Metso Corporation wins EUR50m equipment order in Australia -® 1998-2006 M2 COMMUNICATIONS LTD The Finnish engineering and technology group Metso Corporation said on Monday ( 26 June ) that it has received a EUR50m equipment order in Australia .',
    'NORDIC BUSINESS REPORT-26 June 2006-Metso Corporation wins EUR50m equipment order in Australia -_ 1998-2006 M2 COMMUNICATIONS LTD The Finnish engineering and technology group Metso Corporation said on Monday ( 26 June ) that it has received a EUR50m equipment order in Australia .',

    'The real estate company posted a net loss of +ó  x201a -¼ 59.3 million +ó  x201a -¼ 0.21 per share compared with a net profit of +ó  x201a -¼ 31 million +ó  x201a -¼ 0.11 per share for the corresponding quarter of 2007 .',
    'The real estate company posted a net loss of +  x201a -õ 59.3 million +  x201a -õ 0.21 per share compared with a net profit of +  x201a -õ 31 million +  x201a -õ 0.11 per share for the corresponding quarter of 2007 .',

    '- The Group -¦ s result before taxes was EUR -1.9 ( -3.0 ) million .',
    '- The Group - s result before taxes was EUR -1.9 ( -3.0 ) million .',

    'Finnish Bank of +àland reports its operating profit fell to EUR 4.9 mn in the third quarter of 2007 from EUR 5.6 mn in the third quarter of 2006 .',
    'Finnish Bank of +_land reports its operating profit fell to EUR 4.9 mn in the third quarter of 2007 from EUR 5.6 mn in the third quarter of 2006 .',

    "Net sales of Finnish food industry company L+ñnnen Tehtaat 's continuing operations increased by 13 % in 2008 to EUR 349.1 mn from EUR 309.6 mn in 2007 .",
    "Net sales of Finnish food industry company L+Ænnen Tehtaat 's continuing operations increased by 13 % in 2008 to EUR 349.1 mn from EUR 309.6 mn in 2007 .",

    '55 workers in +àm+Ñl will be affected by the close-down .',
    '55 workers in +_m+_l will be affected by the close-down .',

    "Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .",
    "Clothing retail chain Sepp+Æl+Æ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .",

    'In addition , the production at the Varpaisj+â rvi factory will be stopped at the beginning of April 2009 .',
    'In addition , the production at the Varpaisj+_ rvi factory will be stopped at the beginning of April 2009 .',

    'The pulp production in Finnish Kemij+ñrvi will also be liquidated and about 1,100 employees loose their jobs .',
    'The pulp production in Finnish Kemij+Ærvi will also be liquidated and about 1,100 employees loose their jobs .',

    'In Finland , the Bank of +àland reports its operating profit fell to EUR 6.1 mn in the second quarter of 2008 from EUR 7.5 mn in the second quarter of 2007 .',
    'In Finland , the Bank of +_land reports its operating profit fell to EUR 6.1 mn in the second quarter of 2008 from EUR 7.5 mn in the second quarter of 2007 .',

    "Also Lemmink+ñinen 's profit for accounting period went up to EUR 3.1 mn from EUR -24.5 mn a year ago .",
    "Also Lemmink+Æinen 's profit for accounting period went up to EUR 3.1 mn from EUR -24.5 mn a year ago .",

    "The adapter , awarded with the `` Certified Integration for SAP -« ; NetWeaver '' endorsement , integrates Basware s invoice automation and procurement solutions with more than 200 different ERP systems .",
    "The adapter , awarded with the `` Certified Integration for SAP - ; NetWeaver '' endorsement , integrates Basware s invoice automation and procurement solutions with more than 200 different ERP systems .",

    'Net profit fell by almost half to +é 5.5 million from +é 9.4 million at the end of 2007 .',
    'Net profit fell by almost half to +â 5.5 million from +â 9.4 million at the end of 2007 .',

    'Finnish Bank of +àland reports operating profit of EUR 2.2 mn in the first quarter of 2010 , down from EUR 6.3 mn in the corresponding period in 2009 .',
    'Finnish Bank of +_land reports operating profit of EUR 2.2 mn in the first quarter of 2010 , down from EUR 6.3 mn in the corresponding period in 2009 .',

    'In the third quarter , net sales increased by 12 % year-on-year to EUR 159.5 million , or by 6 % at comparable currency rates growth .',
    'In the third quarter , net sales increased by 12 % year-on-year to EUR159 .5 m , or by 6 % at comparable currency rates growth .',

    'The acquisition of +àlandsbanken Sverige in 2009 burdened the performance with EUR 3.0 mn .',
    'The acquisition of +_landsbanken Sverige in 2009 burdened the performance with EUR 3.0 mn .',

    "CEO Erkki J+ñrvinen is happy with the company 's performance in 2010 .",
    "CEO Erkki J+Ærvinen is happy with the company 's performance in 2010 .",

    'Copper , lead and nickel also dropped ... HBOS ( HBOS ) plummeted 20 % to 70.3 pence after saying this year+ó ??',
    'Copper , lead and nickel also dropped ... HBOS ( HBOS ) plummeted 20 % to 70.3 pence after saying this year+ ??',

    "In Q1 of 2009 , Bank of +àland 's net interest income weakened by 10 % to EUR 9.1 mn .",
    "In Q1 of 2009 , Bank of +_land 's net interest income weakened by 10 % to EUR 9.1 mn .",

    "W+ñrtsil+ñ 's solution has been selected for its low fuel consumption , environmentally sound technology , and global service support .",
    "W+Ærtsil+Æ 's solution has been selected for its low fuel consumption , environmentally sound technology , and global service support .",

    "The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+úo , as saying .",
    "The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+_o , as saying .",
]

In [None]:
# Second exclusion list for the Financial Phrase Bank evaluation set; it is
# concatenated with `train_df_fpb_df_duplicates` in the `isin` filter below.
# Entries come in pairs: the same sentence under two different renderings of
# its mis-decoded non-ASCII characters.
#
# NOTE: the two "Certified Integration for SAP" entries contain a literal ''
# (two apostrophes) and therefore have to be double-quoted. Inside a
# single-quoted literal Python reads '' as "end of one literal, start of the
# next", concatenates the halves and silently drops the apostrophes, so the
# entry could never match the dataset text.
fpb_df_train_df_duplicates = [
    "Clothing retail chain Sepp+Æl+Æ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .",
    "Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .",

    "The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+_o , as saying .",
    "The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+úo , as saying .",

    "Net sales of Finnish food industry company L+Ænnen Tehtaat 's continuing operations increased by 13 % in 2008 to EUR 349.1 mn from EUR 309.6 mn in 2007 .",
    "Net sales of Finnish food industry company L+ñnnen Tehtaat 's continuing operations increased by 13 % in 2008 to EUR 349.1 mn from EUR 309.6 mn in 2007 .",

    "Also Lemmink+Æinen 's profit for accounting period went up to EUR 3.1 mn from EUR -24.5 mn a year ago .",
    "Also Lemmink+ñinen 's profit for accounting period went up to EUR 3.1 mn from EUR -24.5 mn a year ago .",

    'In the third quarter , net sales increased by 12 % year-on-year to EUR159 .5 m , or by 6 % at comparable currency rates growth .',
    "In the third quarter , net sales increased by 12 % year-on-year to EUR 159.5 million , or by 6 % at comparable currency rates growth .",

    'YIT Construction and the town of Riihim+Æki have signed a lease contract whereby the town will occupy the Travel Centre office facilities .',
    'YIT Construction and the town of Riihim+ñki have signed a lease contract whereby the town will occupy the Travel Centre office facilities .',

    'NORDIC BUSINESS REPORT-26 June 2006-Metso Corporation wins EUR50m equipment order in Australia -_ 1998-2006 M2 COMMUNICATIONS LTD The Finnish engineering and technology group Metso Corporation said on Monday ( 26 June ) that it has received a EUR50m equipment order in Australia .',
    'NORDIC BUSINESS REPORT-26 June 2006-Metso Corporation wins EUR50m equipment order in Australia -® 1998-2006 M2 COMMUNICATIONS LTD The Finnish engineering and technology group Metso Corporation said on Monday ( 26 June ) that it has received a EUR50m equipment order in Australia .',

    'The original name Componenta +_m+_l , as a subsidiary of the Finnish Componenta Group , has been changed to +_m+_l Components and the company has seen a 63 % growth in Q1 2010 , in comparison to Q1 2009 .',
    'The original name Componenta +àm+Ñl , as a subsidiary of the Finnish Componenta Group , has been changed to +àm+Ñl Components and the company has seen a 63 % growth in Q1 2010 , in comparison to Q1 2009 .',

    "CEO Erkki J+ñrvinen is happy with the company 's performance in 2010 .",
    "CEO Erkki J+Ærvinen is happy with the company 's performance in 2010 .",

    "The adapter , awarded with the `` Certified Integration for SAP -« ; NetWeaver '' endorsement , integrates Basware s invoice automation and procurement solutions with more than 200 different ERP systems .",
    "The adapter , awarded with the `` Certified Integration for SAP -‹ ; NetWeaver '' endorsement , integrates Basware s invoice automation and procurement solutions with more than 200 different ERP systems .",

    '55 workers in +_m+_l will be affected by the close-down .',
    '55 workers in +àm+Ñl will be affected by the close-down .',

    '- The Group -¦ s result before taxes was EUR -1.9 ( -3.0 ) million .',
    '- The Group -“ s result before taxes was EUR -1.9 ( -3.0 ) million .',

    'The real estate company posted a net loss of +ˆ  x201a -õ 59.3 million +ˆ  x201a -õ 0.21 per share compared with a net profit of +ˆ  x201a -õ 31 million +ˆ  x201a -õ 0.11 per share for the corresponding quarter of 2007 .',
    'The real estate company posted a net loss of +ó  x201a -¼ 59.3 million +ó  x201a -¼ 0.21 per share compared with a net profit of +ó  x201a -¼ 31 million +ó  x201a -¼ 0.11 per share for the corresponding quarter of 2007 .',

    'In addition , the production at the Varpaisj+_ rvi factory will be stopped at the beginning of April 2009 .',
    'In addition , the production at the Varpaisj+â rvi factory will be stopped at the beginning of April 2009 .',

    'The pulp production in Finnish Kemij+Ærvi will also be liquidated and about 1,100 employees loose their jobs .',
    'The pulp production in Finnish Kemij+ñrvi will also be liquidated and about 1,100 employees loose their jobs .',

    'Copper , lead and nickel also dropped ... HBOS ( HBOS ) plummeted 20 % to 70.3 pence after saying this year+ˆ ??',
    'Copper , lead and nickel also dropped ... HBOS ( HBOS ) plummeted 20 % to 70.3 pence after saying this year+ó ??',

    'The acquisition of +_landsbanken Sverige in 2009 burdened the performance with EUR 3.0 mn .',
    'The acquisition of +àlandsbanken Sverige in 2009 burdened the performance with EUR 3.0 mn .',

    "W+Ærtsil+Æ 's solution has been selected for its low fuel consumption , environmentally sound technology , and global service support .",
    "W+ñrtsil+ñ 's solution has been selected for its low fuel consumption , environmentally sound technology , and global service support .",

    'In Finland , the Bank of +_land reports its operating profit fell to EUR 6.1 mn in the second quarter of 2008 from EUR 7.5 mn in the second quarter of 2007 .',
    'In Finland , the Bank of +àland reports its operating profit fell to EUR 6.1 mn in the second quarter of 2008 from EUR 7.5 mn in the second quarter of 2007 .',

    "In Q1 of 2009 , Bank of +_land 's net interest income weakened by 10 % to EUR 9.1 mn .",
    "In Q1 of 2009 , Bank of +àland 's net interest income weakened by 10 % to EUR 9.1 mn .",

    'Finnish Bank of +_land reports its operating profit fell to EUR 4.9 mn in the third quarter of 2007 from EUR 5.6 mn in the third quarter of 2006 .',
    'Finnish Bank of +àland reports its operating profit fell to EUR 4.9 mn in the third quarter of 2007 from EUR 5.6 mn in the third quarter of 2006 .',

    'Finnish Bank of +_land reports operating profit of EUR 2.2 mn in the first quarter of 2010 , down from EUR 6.3 mn in the corresponding period in 2009 .',
    'Finnish Bank of +àland reports operating profit of EUR 2.2 mn in the first quarter of 2010 , down from EUR 6.3 mn in the corresponding period in 2009 .',

    'Net profit fell by almost half to +â 5.5 million from +â 9.4 million at the end of 2007 .',
    'Net profit fell by almost half to +é 5.5 million from +é 9.4 million at the end of 2007 .',
]

In [None]:
import pandas as pd
# header-less latin-1 file: first column is the sentiment, second the sentence;
# keep one row per distinct sentence
df = pd.read_csv(
    financial_phrase_bank_loc,
    header=None,
    names=['sentiment', 'text'],
    encoding='latin-1',
).drop_duplicates(subset='text')

# rows left once exact (case-sensitive) matches with the training texts are discounted
print(len(df) - len(set(train_df['text'].values) & set(df['text'].values)))

# drop every row whose lower-cased text also occurs in the training set
intersection = check_set_duplicates(df, 'text', train_df, 'text')
df = df_no_duplicates(df, 'text', intersection)

# binary task: neutral rows are removed
fpb = df[df['sentiment'] != 'neutral']

# remove the hand-collected encoding variants of training sentences
known_variants = train_df_fpb_df_duplicates + fpb_df_train_df_duplicates
fpb = fpb.loc[~fpb['text'].isin(known_variants)]

# 'positive' -> 1, 'negative' -> 0
fpb['sentiment'] = convert_to_num(fpb, 'sentiment')

save_dataset('financial_phrase_bank.csv', fpb, 'evaluation', drive_loc)

## dev_df

In [None]:
# re-read the dev split (tab-separated, no header row) and keep one row per distinct text
dev_df = pd.read_csv(
    dev_df_loc,
    header=None,
    sep='\t',
    names=['ind', 'sentiment', 'letter', 'text'],
).drop_duplicates(subset='text')

# rows left once exact (case-sensitive) matches with the training texts are discounted
print(len(dev_df) - len(set(train_df['text'].values) & set(dev_df['text'].values)))

# drop every dev row whose lower-cased text also occurs in the training set
intersection = check_set_duplicates(dev_df, 'text', train_df, 'text')
dev_df = df_no_duplicates(dev_df, 'text', intersection)

save_dataset('dev_df.csv', dev_df, 'evaluation', drive_loc)

# Source datasets

In [None]:
# folder that save_dataset() used for the 'evaluation' datasets
eval_loc_base = f'{drive_loc}/' if drive_loc[-1] != '/' else drive_loc
eval_loc = f'{eval_loc_base}evaluation datasets/'

# full paths of the regular files found directly inside that folder
eval_files_locations = [
    path
    for path in (join(eval_loc, entry) for entry in listdir(eval_loc))
    if isfile(path)
]

# dataset name (file name up to its first '.') -> loaded frame
eval_datasets = {}

for ev in eval_files_locations:
  ev_name = ev.split('/')[-1].split('.')[0]
  ev_df = pd.read_csv(ev)
  eval_datasets[ev_name] = ev_df

## Nasdaq

In [None]:
import pandas as pd
import preprocessor as p

# load the raw nasdaq file from the configured location
df = pd.read_csv(nasdaq_loc)

# rows whose headline repeats an earlier one (inspection only, the result is not stored)
df[df.duplicated('Headline')]
# keep only the first occurrence of every headline
df.drop_duplicates(subset='Headline', inplace=True)

# tweet-preprocessor: clean() removes only the option types selected here
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)
# apply the cleaning to every headline
df['Headline'] = df['Headline'].apply(p.clean)

# positional rename of the three columns
df.columns = ['sentiment', 'ticker', 'text']

save_dataset('nasdaq.csv', df, 'source', drive_loc)

## Financial Phrase Bank

In [None]:
# The following sentences were found both in the Financial Phrase Bank and in some of the evaluation datasets,
# so they are removed from the Financial Phrase Bank source dataset.

In [None]:
# Exact-match exclusion list of Financial Phrase Bank sentences (presumably
# applied to the Financial Phrase Bank source dataset further down - verify
# against the cell that consumes it). Entries come in pairs: the same sentence
# under two different renderings of its mis-decoded non-ASCII characters.
#
# NOTE: the two "Certified Integration for SAP" entries contain a literal ''
# (two apostrophes) and therefore have to be double-quoted. Inside a
# single-quoted literal Python reads '' as "end of one literal, start of the
# next", concatenates the halves and silently drops the apostrophes, so the
# entry could never match the dataset text.
train_df_fpb_df_duplicates = [
    'The original name Componenta +àm+Ñl , as a subsidiary of the Finnish Componenta Group , has been changed to +àm+Ñl Components and the company has seen a 63 % growth in Q1 2010 , in comparison to Q1 2009 .',
    'The original name Componenta +_m+_l , as a subsidiary of the Finnish Componenta Group , has been changed to +_m+_l Components and the company has seen a 63 % growth in Q1 2010 , in comparison to Q1 2009 .',

    'YIT Construction and the town of Riihim+ñki have signed a lease contract whereby the town will occupy the Travel Centre office facilities .',
    'YIT Construction and the town of Riihim+Æki have signed a lease contract whereby the town will occupy the Travel Centre office facilities .',

    'NORDIC BUSINESS REPORT-26 June 2006-Metso Corporation wins EUR50m equipment order in Australia -® 1998-2006 M2 COMMUNICATIONS LTD The Finnish engineering and technology group Metso Corporation said on Monday ( 26 June ) that it has received a EUR50m equipment order in Australia .',
    'NORDIC BUSINESS REPORT-26 June 2006-Metso Corporation wins EUR50m equipment order in Australia -_ 1998-2006 M2 COMMUNICATIONS LTD The Finnish engineering and technology group Metso Corporation said on Monday ( 26 June ) that it has received a EUR50m equipment order in Australia .',

    'The real estate company posted a net loss of +ó  x201a -¼ 59.3 million +ó  x201a -¼ 0.21 per share compared with a net profit of +ó  x201a -¼ 31 million +ó  x201a -¼ 0.11 per share for the corresponding quarter of 2007 .',
    'The real estate company posted a net loss of +  x201a -õ 59.3 million +  x201a -õ 0.21 per share compared with a net profit of +  x201a -õ 31 million +  x201a -õ 0.11 per share for the corresponding quarter of 2007 .',

    '- The Group -¦ s result before taxes was EUR -1.9 ( -3.0 ) million .',
    '- The Group - s result before taxes was EUR -1.9 ( -3.0 ) million .',

    'Finnish Bank of +àland reports its operating profit fell to EUR 4.9 mn in the third quarter of 2007 from EUR 5.6 mn in the third quarter of 2006 .',
    'Finnish Bank of +_land reports its operating profit fell to EUR 4.9 mn in the third quarter of 2007 from EUR 5.6 mn in the third quarter of 2006 .',

    "Net sales of Finnish food industry company L+ñnnen Tehtaat 's continuing operations increased by 13 % in 2008 to EUR 349.1 mn from EUR 309.6 mn in 2007 .",
    "Net sales of Finnish food industry company L+Ænnen Tehtaat 's continuing operations increased by 13 % in 2008 to EUR 349.1 mn from EUR 309.6 mn in 2007 .",

    '55 workers in +àm+Ñl will be affected by the close-down .',
    '55 workers in +_m+_l will be affected by the close-down .',

    "Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .",
    "Clothing retail chain Sepp+Æl+Æ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .",

    'In addition , the production at the Varpaisj+â rvi factory will be stopped at the beginning of April 2009 .',
    'In addition , the production at the Varpaisj+_ rvi factory will be stopped at the beginning of April 2009 .',

    'The pulp production in Finnish Kemij+ñrvi will also be liquidated and about 1,100 employees loose their jobs .',
    'The pulp production in Finnish Kemij+Ærvi will also be liquidated and about 1,100 employees loose their jobs .',

    'In Finland , the Bank of +àland reports its operating profit fell to EUR 6.1 mn in the second quarter of 2008 from EUR 7.5 mn in the second quarter of 2007 .',
    'In Finland , the Bank of +_land reports its operating profit fell to EUR 6.1 mn in the second quarter of 2008 from EUR 7.5 mn in the second quarter of 2007 .',

    "Also Lemmink+ñinen 's profit for accounting period went up to EUR 3.1 mn from EUR -24.5 mn a year ago .",
    "Also Lemmink+Æinen 's profit for accounting period went up to EUR 3.1 mn from EUR -24.5 mn a year ago .",

    "The adapter , awarded with the `` Certified Integration for SAP -« ; NetWeaver '' endorsement , integrates Basware s invoice automation and procurement solutions with more than 200 different ERP systems .",
    "The adapter , awarded with the `` Certified Integration for SAP - ; NetWeaver '' endorsement , integrates Basware s invoice automation and procurement solutions with more than 200 different ERP systems .",

    'Net profit fell by almost half to +é 5.5 million from +é 9.4 million at the end of 2007 .',
    'Net profit fell by almost half to +â 5.5 million from +â 9.4 million at the end of 2007 .',

    'Finnish Bank of +àland reports operating profit of EUR 2.2 mn in the first quarter of 2010 , down from EUR 6.3 mn in the corresponding period in 2009 .',
    'Finnish Bank of +_land reports operating profit of EUR 2.2 mn in the first quarter of 2010 , down from EUR 6.3 mn in the corresponding period in 2009 .',

    'In the third quarter , net sales increased by 12 % year-on-year to EUR 159.5 million , or by 6 % at comparable currency rates growth .',
    'In the third quarter , net sales increased by 12 % year-on-year to EUR159 .5 m , or by 6 % at comparable currency rates growth .',

    'The acquisition of +àlandsbanken Sverige in 2009 burdened the performance with EUR 3.0 mn .',
    'The acquisition of +_landsbanken Sverige in 2009 burdened the performance with EUR 3.0 mn .',

    "CEO Erkki J+ñrvinen is happy with the company 's performance in 2010 .",
    "CEO Erkki J+Ærvinen is happy with the company 's performance in 2010 .",

    'Copper , lead and nickel also dropped ... HBOS ( HBOS ) plummeted 20 % to 70.3 pence after saying this year+ó ??',
    'Copper , lead and nickel also dropped ... HBOS ( HBOS ) plummeted 20 % to 70.3 pence after saying this year+ ??',

    "In Q1 of 2009 , Bank of +àland 's net interest income weakened by 10 % to EUR 9.1 mn .",
    "In Q1 of 2009 , Bank of +_land 's net interest income weakened by 10 % to EUR 9.1 mn .",

    "W+ñrtsil+ñ 's solution has been selected for its low fuel consumption , environmentally sound technology , and global service support .",
    "W+Ærtsil+Æ 's solution has been selected for its low fuel consumption , environmentally sound technology , and global service support .",

    "The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+úo , as saying .",
    "The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+_o , as saying .",
]

In [None]:
fpb_df_train_df_duplicates = [
    "Clothing retail chain Sepp+Æl+Æ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .",
"Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .",


"The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+_o , as saying .",
"The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+úo , as saying .",


"Net sales of Finnish food industry company L+Ænnen Tehtaat 's continuing operations increased by 13 % in 2008 to EUR 349.1 mn from EUR 309.6 mn in 2007 .",
"Net sales of Finnish food industry company L+ñnnen Tehtaat 's continuing operations increased by 13 % in 2008 to EUR 349.1 mn from EUR 309.6 mn in 2007 .",


"Also Lemmink+Æinen 's profit for accounting period went up to EUR 3.1 mn from EUR -24.5 mn a year ago .",
"Also Lemmink+ñinen 's profit for accounting period went up to EUR 3.1 mn from EUR -24.5 mn a year ago .",


'In the third quarter , net sales increased by 12 % year-on-year to EUR159 .5 m , or by 6 % at comparable currency rates growth .',
"In the third quarter , net sales increased by 12 % year-on-year to EUR 159.5 million , or by 6 % at comparable currency rates growth .",

'YIT Construction and the town of Riihim+Æki have signed a lease contract whereby the town will occupy the Travel Centre office facilities .',
'YIT Construction and the town of Riihim+ñki have signed a lease contract whereby the town will occupy the Travel Centre office facilities .',


'NORDIC BUSINESS REPORT-26 June 2006-Metso Corporation wins EUR50m equipment order in Australia -_ 1998-2006 M2 COMMUNICATIONS LTD The Finnish engineering and technology group Metso Corporation said on Monday ( 26 June ) that it has received a EUR50m equipment order in Australia .',
'NORDIC BUSINESS REPORT-26 June 2006-Metso Corporation wins EUR50m equipment order in Australia -® 1998-2006 M2 COMMUNICATIONS LTD The Finnish engineering and technology group Metso Corporation said on Monday ( 26 June ) that it has received a EUR50m equipment order in Australia .',


'The original name Componenta +_m+_l , as a subsidiary of the Finnish Componenta Group , has been changed to +_m+_l Components and the company has seen a 63 % growth in Q1 2010 , in comparison to Q1 2009 .',
'The original name Componenta +àm+Ñl , as a subsidiary of the Finnish Componenta Group , has been changed to +àm+Ñl Components and the company has seen a 63 % growth in Q1 2010 , in comparison to Q1 2009 .',




"CEO Erkki J+ñrvinen is happy with the company 's performance in 2010 .",
"CEO Erkki J+Ærvinen is happy with the company 's performance in 2010 .",



'The adapter , awarded with the `` Certified Integration for SAP -« ; NetWeaver '' endorsement , integrates Basware s invoice automation and procurement solutions with more than 200 different ERP systems .',
'The adapter , awarded with the `` Certified Integration for SAP -‹ ; NetWeaver '' endorsement , integrates Basware s invoice automation and procurement solutions with more than 200 different ERP systems .',
'55 workers in +_m+_l will be affected by the close-down .',
'55 workers in +àm+Ñl will be affected by the close-down .',


'- The Group -¦ s result before taxes was EUR -1.9 ( -3.0 ) million .',
'- The Group -“ s result before taxes was EUR -1.9 ( -3.0 ) million .',


'The real estate company posted a net loss of +ˆ  x201a -õ 59.3 million +ˆ  x201a -õ 0.21 per share compared with a net profit of +ˆ  x201a -õ 31 million +ˆ  x201a -õ 0.11 per share for the corresponding quarter of 2007 .',
'The real estate company posted a net loss of +ó  x201a -¼ 59.3 million +ó  x201a -¼ 0.21 per share compared with a net profit of +ó  x201a -¼ 31 million +ó  x201a -¼ 0.11 per share for the corresponding quarter of 2007 .',


'In addition , the production at the Varpaisj+_ rvi factory will be stopped at the beginning of April 2009 .',
'In addition , the production at the Varpaisj+â rvi factory will be stopped at the beginning of April 2009 .',

'The pulp production in Finnish Kemij+Ærvi will also be liquidated and about 1,100 employees loose their jobs .',
'The pulp production in Finnish Kemij+ñrvi will also be liquidated and about 1,100 employees loose their jobs .',


'Copper , lead and nickel also dropped ... HBOS ( HBOS ) plummeted 20 % to 70.3 pence after saying this year+ˆ ??',
'Copper , lead and nickel also dropped ... HBOS ( HBOS ) plummeted 20 % to 70.3 pence after saying this year+ó ??',


'The acquisition of +_landsbanken Sverige in 2009 burdened the performance with EUR 3.0 mn .',
'The acquisition of +àlandsbanken Sverige in 2009 burdened the performance with EUR 3.0 mn .',

"W+Ærtsil+Æ 's solution has been selected for its low fuel consumption , environmentally sound technology , and global service support .",
"W+ñrtsil+ñ 's solution has been selected for its low fuel consumption , environmentally sound technology , and global service support .",


'In Finland , the Bank of +_land reports its operating profit fell to EUR 6.1 mn in the second quarter of 2008 from EUR 7.5 mn in the second quarter of 2007 .',
'In Finland , the Bank of +àland reports its operating profit fell to EUR 6.1 mn in the second quarter of 2008 from EUR 7.5 mn in the second quarter of 2007 .',



"In Q1 of 2009 , Bank of +_land 's net interest income weakened by 10 % to EUR 9.1 mn .",
"In Q1 of 2009 , Bank of +àland 's net interest income weakened by 10 % to EUR 9.1 mn .",

'Finnish Bank of +_land reports its operating profit fell to EUR 4.9 mn in the third quarter of 2007 from EUR 5.6 mn in the third quarter of 2006 .',
'Finnish Bank of +àland reports its operating profit fell to EUR 4.9 mn in the third quarter of 2007 from EUR 5.6 mn in the third quarter of 2006 .',


'Finnish Bank of +_land reports operating profit of EUR 2.2 mn in the first quarter of 2010 , down from EUR 6.3 mn in the corresponding period in 2009 .',
'Finnish Bank of +àland reports operating profit of EUR 2.2 mn in the first quarter of 2010 , down from EUR 6.3 mn in the corresponding period in 2009 .',


'Net profit fell by almost half to +â 5.5 million from +â 9.4 million at the end of 2007 .',
'Net profit fell by almost half to +é 5.5 million from +é 9.4 million at the end of 2007 .'

]

In [None]:
# Hand-listed sentences, written in consecutive pairs: both members of a pair are the
# same sentence and differ only in how the accented / special characters are rendered.
# Going by the variable name these are overlaps between the FPB frame and the dev split
# (TODO confirm). The list is concatenated into `sentences` in the FPB cell and handed
# to df_no_duplicates(df, 'text', sentences).
fbp_df_dev_df = [
    'DMASIA-16 August 2006-Benefon extends manufacturing capability with ASMobile -_ 2006 Digitalmediaasia.com & DMA Ltd. .',
'DMASIA-16 August 2006-Benefon extends manufacturing capability with ASMobile -® 2006 Digitalmediaasia.com & DMA Ltd. .',


'Finnish P+¦yry has been awarded an engineering contract by CFR , the national railway company of Romania .',
'Finnish P+ yry has been awarded an engineering contract by CFR , the national railway company of Romania .',


'Finnish Bank of +_land +_landsbanken has issued a profit warning .',
'Finnish Bank of +àland +àlandsbanken has issued a profit warning .',

'According to Swedish authorities , traces of the very toxic osmium tetroxide have been found on the coast of Per+Æmeri , the Northernmost part of the Gulf of Bothnia .',
'According to Swedish authorities , traces of the very toxic osmium tetroxide have been found on the coast of Per+ñmeri , the Northernmost part of the Gulf of Bothnia .'

]

In [None]:
# Hand-listed sentences, written in consecutive pairs: both members of a pair are the
# same sentence and differ only in how the accented / special characters are rendered.
# Going by the variable name these are overlaps between the FPB frame and the fpb_fiqa
# evaluation set (TODO confirm). The list is concatenated into `sentences` in the FPB
# cell and handed to df_no_duplicates(df, 'text', sentences).
fpb_df_fbp_fiqa_df = [
    "Finnish Bank of +_land 's consolidated net operating profit increased from EUR 4.8 mn in the first quarter of 2005 to EUR 6.4 mn in the first quarter of 2006 .",
"Finnish Bank of +àland 's consolidated net operating profit increased from EUR 4.8 mn in the first quarter of 2005 to EUR 6.4 mn in the first quarter of 2006 .",


"Clothing chain Sepp+Æl+Æ 's net sales increased by 7.0 % to EUR 30.8 mn .",
"Clothing chain Sepp+ñl+ñ 's net sales increased by 7.0 % to EUR 30.8 mn .",

'Finnish Bank of +_land reports its operating profit rose to EUR 21.3 mn in the second quarter of 2009 from EUR 6.1 mn in the corresponding period in 2008 .',
'Finnish Bank of +àland reports its operating profit rose to EUR 21.3 mn in the second quarter of 2009 from EUR 6.1 mn in the corresponding period in 2008 .',


"According to HKScan Finland , the plan is to increase J+Ærvi-Suomen Portti 's net sales to EUR 80mn to EUR 100mn .",
"According to HKScan Finland , the plan is to increase J+ñrvi-Suomen Portti 's net sales to EUR 80mn to EUR 100mn .",


"In Q1 of 2010 , Bank of +_land 's net interest income increased from EUR 9.1 mn to EUR 9.7 mn .",
"In Q1 of 2010 , Bank of +àland 's net interest income increased from EUR 9.1 mn to EUR 9.7 mn .",

'According to Sepp+Ænen , the new technology UMTS900 solution network building costs are by one-third lower than that of the building of 3.5 G networks , operating at 2,100 MHz frequency .',
'According to Sepp+ñnen , the new technology UMTS900 solution network building costs are by one-third lower than that of the building of 3.5 G networks , operating at 2,100 MHz frequency .',

'Finnish construction group Lemmink+Æinen has been awarded two road building contracts by the Lithuanian transport administration .',
'Finnish construction group Lemmink+ñinen has been awarded two road building contracts by the Lithuanian transport administration .',



'Finnish Rautaruukki has been awarded a contract to supply and install steel superstructures for the Partihallsf+¦rbindelsen bridge in Gothenburg in Sweden .',
'Finnish Rautaruukki has been awarded a contract to supply and install steel superstructures for the Partihallsf+ rbindelsen bridge in Gothenburg in Sweden .',


"Lule+_ municipality has awarded YIT a 2-year contract , for property management of about one third of the municipality 's properties , with a total area of 140,000 sq. metres .",
"Lule+Ñ municipality has awarded YIT a 2-year contract , for property management of about one third of the municipality 's properties , with a total area of 140,000 sq. metres .",



'DMASIA-16 August 2006-Benefon extends manufacturing capability with ASMobile -_ 2006 Digitalmediaasia.com & DMA Ltd. .',
'DMASIA-16 August 2006-Benefon extends manufacturing capability with ASMobile -® 2006 Digitalmediaasia.com & DMA Ltd. .',



'Finnish P+¦yry has been awarded an engineering contract by CFR , the national railway company of Romania .',
'Finnish P+ yry has been awarded an engineering contract by CFR , the national railway company of Romania .',


"Fiskars , the World 's  1 Scissors Brand TM , recently won Learning -‹ Magazine 's 2011 Teachers ' Choice Award for the Classroom .",
"Fiskars , the World 's  1 Scissors Brand TM , recently won Learning -« Magazine 's 2011 Teachers ' Choice Award for the Classroom .",



'The most loyal customers were found in the Bank of +_land , with an index of 8.0 .',
'The most loyal customers were found in the Bank of +àland , with an index of 8.0 .',

'President and CEO Mika Vehvil+ñinen says the positive signs are first and foremost emerging outside Finland .',
'President and CEO Mika Vehvil+Æinen says the positive signs are first and foremost emerging outside Finland .',



'According to Finnish insurance companies Tapiola , Local Insurance Group ( L+Æhivakuutus ) , and Pohjola the two fierce storms at the end of July and in the beginning of August 2010 that felled trees around Finland have speeded up sales of forest insurance .',
'According to Finnish insurance companies Tapiola , Local Insurance Group ( L+ñhivakuutus ) , and Pohjola the two fierce storms at the end of July and in the beginning of August 2010 that felled trees around Finland have speeded up sales of forest insurance .',



'H+_kan Dahlstr+¦m , head of mobility services at TeliaSonera , has forecast that mobile data volume on the TeliaSonera network in Sweden will rise eight-fold to 200,000 TB by 2014 .',
'H+Ñkan Dahlstr+ m , head of mobility services at TeliaSonera , has forecast that mobile data volume on the TeliaSonera network in Sweden will rise eight-fold to 200,000 TB by 2014 .',


"Finnish Metso will supply new wood handling and bleaching lines , as well as a rebuild of the kraft liner washing line for Klabin 's Tel+—maco Borba mill in the Brazilian state of Paran+è .",
"Finnish Metso will supply new wood handling and bleaching lines , as well as a rebuild of the kraft liner washing line for Klabin 's Tel+¬maco Borba mill in the Brazilian state of Paran+í .",


'Finnish-owned contract manufacturer of electronics Elcoteq Hungary Kft has announced plans to recruit more than 650 new staffers to fulfill new orders in P+_cs , where the company has two plants .',
'Finnish-owned contract manufacturer of electronics Elcoteq Hungary Kft has announced plans to recruit more than 650 new staffers to fulfill new orders in P+®cs , where the company has two plants .',



'Finnish food industry companies HK Ruokatalo and Atria will form a joint venture company called L+Ænsi-Kalkkuna to produce turkey meat .',
'Finnish food industry companies HK Ruokatalo and Atria will form a joint venture company called L+ñnsi-Kalkkuna to produce turkey meat .',



'Etteplan targets to employ at least 20 people in Borl+Ænge .',
'Etteplan targets to employ at least 20 people in Borl+ñnge .',


'Finnish Bank of +_land +_landsbanken has issued a profit warning .',
'Finnish Bank of +àland +àlandsbanken has issued a profit warning .'

]

In [None]:
# Build the Financial Phrase Bank "source" dataset with every sentence that also
# appears in the train / dev / evaluation data filtered out.

# The file is read as latin-1 with no header row; the two columns are named here.
df = pd.read_csv(financial_phrase_bank_loc, encoding='latin-1', names=['sentiment', 'text'], header=None)
df.drop_duplicates(subset = 'text', inplace = True)

# Row count minus the number of texts shared verbatim (case-sensitive) with train_df.
print(len(df) - len(set(train_df['text'].values).intersection(set(df.text.values))))
# Case-insensitive overlap with the train split (check_set_duplicates lower-cases both sides).
# df_no_duplicates is defined outside this chunk; presumably it returns df without the
# rows whose text is in the given collection -- TODO confirm.
intersection = check_set_duplicates(df, 'text', train_df, 'text')
df = df_no_duplicates(df, 'text', intersection)

# Overlap with the dev split: only counted and collected here, filtered further down.
res = check_set_duplicates(df, 'text', dev_df, 'text')
print(len(res))
dev_df_duplicates = list(res)

# Overlap with the fpb_fiqa evaluation set: likewise only counted and collected here.
res = check_set_duplicates(eval_datasets['fpb_fiqa'], 'text', df, 'text')
print(len(res))
fbp_fiqa_duplicates = list(res)

# Merge the hand-listed sentence lists with the overlaps detected above, de-duplicate,
# lower-case, and filter all of them out of df in a single pass.
sentences = train_df_fpb_df_duplicates + fpb_df_train_df_duplicates + fbp_df_dev_df + fpb_df_fbp_fiqa_df + dev_df_duplicates + fbp_fiqa_duplicates
sentences = list(set(sentences))
sentences = [s.lower() for s in sentences]
df = df_no_duplicates(df, 'text', sentences)

# Written to '<drive_loc>/source datasets/fpb.csv' (see save_dataset).
save_dataset('fpb.csv', df, 'source', drive_loc)

## Sentfin

In [None]:
def parse_sentence(sent):
  """Collapse a per-entity sentiment string into one sentence-level label.

  `sent` is split on commas; in every piece the double quotes and curly braces
  are removed and the text after the last ': ' is taken as that piece's label.

  Args:
    sent: string such as '{"A": "positive", "B": "negative"}'.

  Returns:
    'positive' if some label is 'positive' and none is 'negative',
    'negative' if some label is 'negative' and none is 'positive',
    'neutral' otherwise (both polarities present, or neither of them).
  """
  sentiments = {
      piece.replace('"', '').replace('{', '').replace('}', '').split(': ')[-1]
      for piece in sent.split(',')
  }

  has_positive = 'positive' in sentiments
  has_negative = 'negative' in sentiments

  # The former `set(sentiments) == 1` shortcut compared a set with an int, so it was
  # always False and its branch never ran. The rules below already return the shared
  # label for uniform 'positive' / 'negative' / 'neutral' input, so the dead branch
  # is removed instead of repaired; results are unchanged.
  if has_positive == has_negative:
    # Either conflicting polarities or no polarity at all.
    return 'neutral'
  return 'positive' if has_positive else 'negative'

In [None]:
import pandas as pd
# Load SEntFiN and keep a single row per headline ('Title').
sentfin = pd.read_csv(sentfin_loc)
sentfin.drop_duplicates(subset='Title', inplace=True)

# Replace each row's per-entity sentiment string with one sentence-level label.
sentfin['Decisions'] = sentfin['Decisions'].apply(parse_sentence)

# Keep only the neutral headlines. The drop returns a new frame instead of mutating the
# boolean-filtered slice in place, which is what raised SettingWithCopyWarning before.
sentfin_neutral = sentfin[sentfin['Decisions'] == 'neutral'].drop(columns=['S No.', 'Words'])
# The two remaining columns are renamed positionally to the shared text/sentiment schema.
sentfin_neutral.columns = ['text', 'sentiment']

In [None]:
# Load the FPB + FiQA file and rename its two columns positionally to text/sentiment.
data_df = pd.read_csv(fpb_fiqa_loc)
data_df.columns = ['text', 'sentiment']

data_df_neutral = data_df[data_df['sentiment'] == 'neutral']

# Stack the neutral SEntFiN rows and the neutral FPB/FiQA rows under a fresh index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
mix = pd.concat([sentfin_neutral, data_df_neutral], ignore_index=True)

# Case-insensitive overlap with the train split (check_set_duplicates lower-cases both
# sides), passed to df_no_duplicates to filter `mix`.
r = check_set_duplicates(train_df, 'text', mix, 'text')
print(len(r))
mix = df_no_duplicates(mix, 'text', list(r))
print(len(mix))

# Exact-text duplicates inside the combined frame; reassigned rather than dropped in
# place because `mix` comes back from a helper and may be a slice of another frame.
mix = mix.drop_duplicates(subset='text')

In [None]:
# For every evaluation set: report its name, the size of its case-insensitive text
# overlap with `mix`, and the size of `mix` after that overlap is passed to df_no_duplicates.
for eval_df, eval_frame in eval_datasets.items():
  print(eval_df)
  r = check_set_duplicates(eval_frame, 'text', mix, 'text')
  print(len(r))
  mix = df_no_duplicates(mix, 'text', list(r))
  print(len(mix))

In [None]:
import preprocessor as p
# Restrict the cleaner to the option types listed here.
p.set_options(
    p.OPT.URL,
    p.OPT.EMOJI,
    p.OPT.MENTION,
    p.OPT.RESERVED,
    p.OPT.SMILEY,
)

# Clean every text in place of the original column, then persist the result.
mix['text'] = mix['text'].apply(p.clean)

save_dataset(
    dataset_name='fpb_fiqa_sentfin_all_neutral.csv',
    dataset=mix,
    dataset_type='source',
    location=drive_loc,
)

# Loughran-McDonald

In [None]:
lmd_master_dictionary = pd.read_csv(loughran_mcdonald_loc)

# Rows with a non-zero 'Negative' entry are taken as the negative words.
negative_lmd = lmd_master_dictionary.loc[lmd_master_dictionary['Negative'] != 0]
print(len(negative_lmd))

# Rows with a non-zero 'Positive' entry are taken as the positive words.
positive_lmd = lmd_master_dictionary.loc[lmd_master_dictionary['Positive'] != 0]
print(len(positive_lmd))

# Reduce each selection to its first column, re-labelled as a single 'word' column.
positive_lmd = pd.DataFrame({'word': positive_lmd.iloc[:, 0].values})
negative_lmd = pd.DataFrame({'word': negative_lmd.iloc[:, 0].values})

# Persist both word lists next to the other source datasets.
save_dataset(dataset_name='lmd_positive_words.csv', dataset=positive_lmd, dataset_type='source', location=drive_loc)
save_dataset(dataset_name='lmd_negative_words.csv', dataset=negative_lmd, dataset_type='source', location=drive_loc)