In [None]:
import re
from datetime import datetime, timedelta
import csv

In [None]:
# Shared state for the loading and parsing cells.

input_by_years = {}     # year -> list of raw lines read from that year's file
report_by_years = {}    # year -> {date key -> report text}
reports = {}

# A line consisting solely of a dd.mm.yyyy date marks the start of a report.
pattern = re.compile(r'^(\d\d\.\d\d\.\d\d\d\d)$')

# Parser state: the date key currently being filled, and whether the line
# just read opened a new report.
new_entry = True
key = ""

years = [
    '2018',
    '2017',
    '2016',
    '2015',
    '2014',
]

In [None]:
# Read every year's report file into a list of lines with the trailing
# newline removed.
for year in years:
    # Use a context manager so each file handle is closed as soon as it has
    # been read, instead of being left for the garbage collector.
    with open("reports_"+year+".txt") as report_file:
        input_by_years[year] = [line.strip('\n') for line in report_file]


In [None]:
# Group each year's raw lines into one text per report.
#
# A line that is exactly a dd.mm.yyyy date starts a new report; every
# following non-empty line is appended to that report until the next date
# line. Reports are keyed by the day AFTER the date in the file, and a Friday
# date is pushed past the weekend to the following Monday.
for year in years:
    reports_for_year = {}
    for line in input_by_years[year]:
        if line == "":
            continue
        if re.search(pattern, line):
            new_entry = True
            date_time_obj = datetime.strptime(line, '%d.%m.%Y')
            if date_time_obj.weekday() == 4:
                # Friday: skip Saturday and Sunday as well.
                date_time_obj += timedelta(days=2)
            date_time_obj += timedelta(days=1)
            key = date_time_obj.strftime("%d.%m.%Y")
        else:
            new_entry = False
        if new_entry:
            # A repeated key restarts that report's text from scratch.
            reports_for_year[key] = ""
        else:
            # Join continuation lines with a space. Plain concatenation glued
            # the last word of one line to the first word of the next, which
            # corrupts the whitespace-based word split done later.
            text_so_far = reports_for_year[key]
            reports_for_year[key] = text_so_far + " " + line if text_so_far else line
    report_by_years[year] = reports_for_year


In [None]:
# Per-word tallies, one pair of tables (positive / negative price change)
# for each of the four scoring schemes.
word_points_binary_count_pos, word_points_binary_count_neg = {}, {}
word_points_binary_frequency_pos, word_points_binary_frequency_neg = {}, {}
word_points_weighted_count_pos, word_points_weighted_count_neg = {}, {}
word_points_weighted_frequency_pos, word_points_weighted_frequency_neg = {}, {}


# Per-word l-values, one table per scoring scheme.
l_binary_count = {}
l_binary_frequency = {}
l_weighted_count = {}
l_weighted_frequency = {}

In [None]:
# define and populate price_change_with_reports
# Maps a date string (first column) to the price change as a float (second
# column); the file uses "$" as its column delimiter.

price_change_with_reports = dict()

# newline="" is what the csv module asks for when it is handed a file object,
# so that the reader does its own line-ending handling.
with open("percentage_change_reports.txt", newline="") as file:
    read = csv.reader(file, delimiter="$")
    for row in read:
        # csv.reader yields [] for a blank line; skip it instead of failing
        # with IndexError on row[0].
        if not row:
            continue
        date = row[0]
        change = float(row[1])
        price_change_with_reports[date] = change

In [None]:
def input_to_dict(dictionary, key, value):
    """Accumulate `value` under `key` in `dictionary`, in place.

    The first time a key is seen it is stored as-is; afterwards `value` is
    added to the stored entry with `+=`. Returns None.
    """
    if key not in dictionary:
        dictionary[key] = value
        return
    dictionary[key] += value


# Walk through every report of every year except 2018, split it into words
# and credit each word to the tallies: the "pos" tables when the price change
# for the report's date is above zero, the "neg" tables otherwise.
for year in years:
    if year == '2018':
        continue
    yearly_report = report_by_years[year]
    for date, report_text in yearly_report.items():
        if date not in price_change_with_reports:
            print("hatali date bulduk: "+date)
            continue
        price_change = price_change_with_reports[date]
        words = report_text.lower().split()
        num_words = len(words)
        # The sign of the price change is fixed per report, so pick the four
        # target tables once rather than once per word.
        if price_change > 0:
            binary_count_table = word_points_binary_count_pos
            binary_frequency_table = word_points_binary_frequency_pos
            weighted_count_table = word_points_weighted_count_pos
            weighted_frequency_table = word_points_weighted_frequency_pos
        else:
            binary_count_table = word_points_binary_count_neg
            binary_frequency_table = word_points_binary_frequency_neg
            weighted_count_table = word_points_weighted_count_neg
            weighted_frequency_table = word_points_weighted_frequency_neg
        for word in words:
            input_to_dict(binary_count_table, word, 1)
            input_to_dict(binary_frequency_table, word, 1/num_words)
            input_to_dict(weighted_count_table, word, price_change)
            input_to_dict(weighted_frequency_table, word, price_change/num_words)


In [None]:
def calculate_l_value(pos, neg, l):
    """Fill `l` with one value per word found in `pos`.

    For each word the value is `pos[word] / neg[word]` when `neg` holds a
    non-zero entry for it; otherwise it is `pos[word]` unchanged. Words that
    occur only in `neg` get no entry. `l` is modified in place and nothing
    is returned.
    """
    for word, pos_score in pos.items():
        neg_score = neg.get(word, 0)
        l[word] = pos_score if neg_score == 0 else pos_score/neg_score


# Derive the l-values of each scoring scheme from its positive/negative tallies.
for pos_table, neg_table, l_table in (
    (word_points_binary_count_pos, word_points_binary_count_neg, l_binary_count),
    (word_points_binary_frequency_pos, word_points_binary_frequency_neg, l_binary_frequency),
    (word_points_weighted_count_pos, word_points_weighted_count_neg, l_weighted_count),
    (word_points_weighted_frequency_pos, word_points_weighted_frequency_neg, l_weighted_frequency),
):
    calculate_l_value(pos_table, neg_table, l_table)


In [None]:
from operator import itemgetter
from collections import OrderedDict

# Order every l-value table by value, smallest first, keeping that order in
# an OrderedDict.
by_l_value = itemgetter(1)

sorted_l_binary_count = OrderedDict(
    sorted(l_binary_count.items(), key=by_l_value)
)
sorted_l_binary_frequency = OrderedDict(
    sorted(l_binary_frequency.items(), key=by_l_value)
)
sorted_l_weighted_count = OrderedDict(
    sorted(l_weighted_count.items(), key=by_l_value)
)
sorted_l_weighted_frequency = OrderedDict(
    sorted(l_weighted_frequency.items(), key=by_l_value)
)


In [None]:
# Evaluate on the 2018 reports: sum the l-values of each report's words and
# count how often the sum agrees with the direction of the price change
# recorded for that date.
success_b_c = 0
success_w_c = 0
success_w_f = 0
num_reports = 0


yearly_report = report_by_years['2018']
for date in yearly_report:
    if date not in price_change_with_reports:
        continue
    num_reports+=1
    words = yearly_report[date].lower().split()
    estimation_binary_count = 0
    estimation_weighted_count = 0
    estimation_weighted_frequency = 0
    for word in words:
        # Words without an l-value contribute nothing to the estimates.
        # NOTE(review): only l_binary_count is checked; this relies on the
        # other two l-tables holding the same words -- confirm.
        if word not in l_binary_count:
            continue
        estimation_binary_count += l_binary_count[word]
        estimation_weighted_count += l_weighted_count[word]
        estimation_weighted_frequency += l_weighted_frequency[word]
    # The binary estimate is compared against 1, the weighted ones against 0.
    # An estimate exactly on the threshold counts as a miss in both directions.
    if price_change_with_reports[date] > 0:
        if estimation_binary_count > 1:
            success_b_c += 1
        if estimation_weighted_count > 0:
            success_w_c += 1
        if estimation_weighted_frequency > 0:
            success_w_f += 1
    else:
        if estimation_binary_count < 1:
            success_b_c += 1
        if estimation_weighted_count < 0:
            success_w_c += 1
        if estimation_weighted_frequency < 0:
            success_w_f += 1

print("num_reports: "+str(num_reports))
if num_reports:
    print("success_b_c: %"+str(100*success_b_c/num_reports))
    print("success_w_c: %"+str(100*success_w_c/num_reports))
    print("success_w_f: %"+str(100*success_w_f/num_reports))
else:
    # Without this guard the percentages raise ZeroDivisionError when no
    # 2018 report date has a recorded price change.
    print("no 2018 report has a matching price change; success rates are undefined")

In [None]:
# define how many top words you want to print. default set to 5
offset = 5


def _print_top_words(title, ranking, count):
    """Print `title` followed by the last `count` entries of `ranking`, last first.

    `ranking` is a mapping iterated in ascending l-value order, so its last
    entries are the highest-scoring words. Each entry is printed as
    "word l-value". Fewer than `count` entries, or a `count` of 0, simply
    prints fewer (or no) lines.
    """
    print(title+str(count))
    # An OrderedDict cannot be indexed by position (ranking[-5] is a key
    # lookup and raises KeyError), so materialise the items and slice them.
    for word, l_value in list(ranking.items())[::-1][:count]:
        print(word, l_value)


_print_top_words("Binary Count Top ", sorted_l_binary_count, offset)
_print_top_words('\nBinary Frequency Top ', sorted_l_binary_frequency, offset)
_print_top_words('\nWeighted Count Top ', sorted_l_weighted_count, offset)
_print_top_words('\nWeighted Frequency Top ', sorted_l_weighted_frequency, offset)
