In [108]:
# Notebook-wide configuration; the cells below read their options from this dict.
settings = {
    'coin':'ETHBTC', # coin pair; used for the merger call, the dataset file name, the log folder and column names
    'dataset': 'totale_dataset_26_3_ETHBTC.csv', # existing CSV under data/ to load; set to ' ' (a single space) to build a new dataset instead
    'fill_column_if_cointains':['twitter_info','events_aggregator','econ_bitcoin'] , # columns whose name contains one of these keywords get NaN replaced by 0
    'delete_column_if_contains':['CodeRepository'], # columns whose name contains one of these keywords are dropped
    'greater_than_long_missing':25, # a column with a run of consecutive missing minutes longer than this is imputed with 'difficult_algorithm', otherwise with 'easy_algorithm'
    'easy_algorithm':{'name':'linear'},
    'difficult_algorithm':{'name':'linear'}, # alternative: {'name':'nearest_neigbor', 'neighbors':10}
    'max_percentage_missing':10 # columns with at least this percentage of missing rows are dropped
    }

In [109]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import random
import networkx as nx
%matplotlib inline
import matplotlib.pyplot as plt
import pylab
import os 
import glob

 <h1> Table of contents  </h1><br>
    Part 1: Merging the different datasets <br>
    Part 2: Basic analysis <br>
    Part 3: Clean dataset <br>
    Part 4: Imputation <br>
    Part 5: Feature selection <br>

<h1> Part 1: Merging </h1>

<b> TODO </b> <br> 
-

In [110]:
#Only merge the files when the settings say it should:
#a 'dataset' setting of ' ' (a single space) means no existing CSV was named, so a new one is built for the coin.
#NOTE(review): final_merger comes from the project-local merge_files module; the next cell assumes it wrote
#data/totale_dataset_<day>_<month>_<coin>.csv - confirm against merge_files.
from merge_files import final_merger
if  settings['dataset'] == ' ':
    final_merger(settings['coin'])

In [111]:
#Pick the dataset to load: the file named in the settings, or - when settings['dataset'] is ' ' -
#a file name built from today's day and month and the coin pair.
from datetime import datetime 
coinpair=settings['coin']
day = datetime.today()
dataset_name = settings['dataset'] if not settings['dataset'] == ' ' else f'totale_dataset_{day.day}_{day.month}_{coinpair}.csv'
data = pd.read_csv('data/' + dataset_name)
#Later cells treat last_start_time as seconds (60-second minute grid, datetime.utcfromtimestamp);
#presumably the CSV stores milliseconds - TODO confirm.
data['last_start_time']= data['last_start_time']/1000

<h2> Part 2: Basic analysis </h2>

In [112]:
data.columns

Index(['ETHBTC__technical_analysis_candles__rsi',
       'ETHBTC__technical_analysis_candles__macd',
       'ETHBTC__technical_analysis_candles__signal',
       'ETHBTC__technical_analysis_candles__macdhist',
       'ETHBTC__technical_analysis_candles__sma_5',
       'ETHBTC__technical_analysis_candles__sma_10',
       'ETHBTC__technical_analysis_candles__sma_21',
       'ETHBTC__technical_analysis_candles__sma_50',
       'ETHBTC__technical_analysis_candles__sma_100',
       'ETHBTC__technical_analysis_candles__sma_200',
       ...
       'general_info__exchange_info__Turkish_Lira',
       'general_info__exchange_info__New_Taiwan_Dollar',
       'general_info__exchange_info__Ukrainian_hryvnia',
       'general_info__exchange_info__Venezuelan_bolivar_fuerte',
       'general_info__exchange_info__Vietnamese_dong',
       'general_info__exchange_info__South_African_Rand',
       'general_info__exchange_info__IMF_Special_Drawing_Rights',
       'general_info__exchange_info__Silver_Troy_Ou

In [113]:
def print_timespan_of_data(dataset):
    """
    Print the first and the last timestamp of ``dataset`` as UTC date strings.

        dataset: DataFrame with a ``last_start_time`` column holding Unix
                 timestamps in seconds; the first and last row are read by position.

    Raises IndexError when ``dataset`` has no rows.
    """
    #local import: the function then works regardless of which other cells were run
    from datetime import datetime, timezone

    def as_utc_text(timestamp):
        #timezone-aware replacement for datetime.utcfromtimestamp (deprecated since Python 3.12);
        #the formatted date and time are the same
        return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%d-%m-%Y %H:%M:%S')

    timestamps = dataset['last_start_time']
    print("first time in dataset is:")
    print(as_utc_text(timestamps.iloc[0]))
    print('')
    print("last time in dataset is:")
    print(as_utc_text(timestamps.iloc[-1]))

In [114]:
print_timespan_of_data(data)

first time in dataset is:
06-02-2020 18:00:00

last time in dataset is:
26-03-2020 05:52:00





# Part 3: Clean dataset 

In [115]:
#helper functions
def write_as_log(title,message,data):
    '''
    Write a log file to logs/log_lex/<coin>/<title>.txt (coin taken from the settings).

    The file starts with ``message`` and an empty line, followed by one line per
    entry of ``data``: every element for a list, every (key, value) tuple for a dict.
    Any other type of ``data`` only produces the message header.
    An existing file with the same title is overwritten.

        title: file name without the .txt extension
        message: header text written at the top of the file
        data: list or dict with the entries to log
    '''
    log_dir = f'logs/log_lex/{settings["coin"]}'
    os.makedirs(log_dir, exist_ok=True) #a fresh checkout or a new coin has no log folder yet
    if isinstance(data, dict):
        rows = data.items()
    elif isinstance(data, list):
        rows = data
    else:
        rows = []
    with open(f'{log_dir}/{title}.txt','w') as temp:
        temp.write(message)
        temp.write('\n\n')
        for row in rows:
            temp.write(str(row))
            temp.write('\n')
       
    
def write_as_log2(title,data):
    '''
    Log writer without a header message; writes logs/log_lex/<coin>/<title>.txt.
    A list is written as one single line (its str() form, no trailing newline),
    a dict as one (key, value) tuple per line. Any other type leaves the file empty.
    '''
    is_list = type(data) == list
    is_dict = type(data) == dict
    with open(f'logs/log_lex/{settings["coin"]}/{title}.txt','w') as log_file:
        if is_list:
            log_file.write(str(data))
        elif is_dict:
            log_file.writelines(f'{entry}\n' for entry in data.items())

<b>Delete the columns whose name contains one of the keywords specified in the settings</b>

In [116]:
#Snapshot of the column names, taken before any column is deleted; later cells scan this list for keywords
all_columns = list(data.columns) 
def drop_all_columns(dataset,columns_to_drop):
    '''
    Return a copy of the dataset without the given columns.

        dataset: the DataFrame to remove columns from (it is not modified)
        columns_to_drop: iterable of column names; a name that occurs twice is dropped once

    Raises a KeyError when a name is not a column of the dataset.
    '''
    #one drop call for all names: dropping column by column copied the whole frame every time
    unique_columns = list(dict.fromkeys(columns_to_drop))
    return dataset.drop(columns=unique_columns)

def delete_columns(dataset):
    '''
    Delete the columns that are not useful. It does so by:
     1. Detecting which columns of the dataset contain a keyword from
        settings['delete_column_if_contains']
     2. Dropping all those columns
     3. Printing how many columns were removed and logging their names
    It returns the dataset without those columns; the dataset passed in is not modified.
    '''
    keywords = settings['delete_column_if_contains']
    #look at the columns of the dataset that was passed in, not at a snapshot of another frame
    columns_before = list(dataset.columns)
    #any() lists a column once, even when it matches several keywords
    columns_to_delete = [name for name in columns_before
                         if any(keyword in name for keyword in keywords)]
    dataset = drop_all_columns(dataset,columns_to_delete)
    print(f'Deleted {len(columns_before) - len(dataset.columns)} columns')
    write_as_log('Columns_deleted_from_settings_repository',
                 'Following columns are deleted because they contain a value that is in the setting file',
                 columns_to_delete)

    return dataset

data = delete_columns(data)

Deleted 24 columns





<b>Fill the columns that are specified in the settings with 0, because missing data in those columns actually means the value was zero at that moment</b>

In [117]:
def fill_nan_with_0(dataset, columns_to_replace):
    '''
    Replace the missing values in the given columns by 0.
    The dataset is changed in place and also returned.
        dataset: the DataFrame to fill
        columns_to_replace: names of the columns in which NaN becomes 0
    '''
    for column_name in columns_to_replace:
        zero_filled = dataset[column_name].fillna(0)
        dataset[column_name] = zero_filled
    return dataset   

#Collect the columns that contain one of the fill keywords.
#The current data.columns is scanned instead of the all_columns snapshot taken before the deletion step:
#a column that was already dropped can then never be selected (which would raise a KeyError when filling).
col_to_fill = [name for name in data.columns
               if any(keyword in name for keyword in settings['fill_column_if_cointains'])]
write_as_log('Columns_filled_with_0',
             'Following columns are filled with 0 because nan means 0 in this case',
             col_to_fill)

data = fill_nan_with_0(data,col_to_fill)

<b>Insert rows for the missing minutes (moments at which no API was working)</b>

In [118]:
def fill_missing_range(df, field, range_from, range_to, range_step=1, fill_with=0):
    '''
    Put the dataframe on a regular grid of ``field`` values.

    The grid is np.arange(range_from, range_to, range_step), so range_to itself is excluded.
    Grid values that are missing in ``df`` become new rows; rows of ``df`` whose ``field``
    value is not on the grid are dropped (right merge). Afterwards every NaN in the result -
    in the new rows and in the existing ones - is replaced by ``fill_with``.

        df: the DataFrame to complete
        field: name of the column that holds the grid values
        range_from, range_to, range_step: definition of the grid
        fill_with: value written into the empty cells

    Returns a new DataFrame sorted ascending on ``field`` with a fresh 0..n-1 index.
    '''
    full_range = pd.DataFrame({field: np.arange(range_from, range_to, range_step)})
    filled_dataset = df.merge(full_range, how='right', on=field)
    #sort on the merge key itself: a hard-coded 'last_start_time' fails for every other field
    filled_dataset = filled_dataset.sort_values(by=field).reset_index(drop=True)
    return filled_dataset.fillna(fill_with)

#Insert a row for every missing minute between the first and last timestamp; the new rows hold NaN in all other columns.
#NOTE(review): np.arange excludes its stop value, so with last-60 as stop the grid ends before the final
#timestamps of the data and the right merge drops those rows - confirm that this trimming is intended.
first = int(data["last_start_time"].iloc[0])
last = int(data["last_start_time"].iloc[-1])
data = fill_missing_range(data,'last_start_time',first,last-60,60,np.nan)

# Part 4: Imputation

<b> TODO </b> <br> 
- Fix the nearest neighbors

In [119]:
def get_sequence_of_missing_data(dataframe,column_name,step=60):
    '''
    Group the missing values of a column into sequences of consecutive timestamps.

    Returns a list of lists. Every inner list holds the last_start_time values of one
    uninterrupted run of missing data, so two missing minutes that follow each other
    give one list of len 2. A column without missing data gives an empty list.
        dataframe: The dataframe that should be analyzed (needs a last_start_time column)
        column_name: The name of the column that you want to check for missing data
        step: distance between two consecutive timestamps (60 = one minute in seconds)
    '''
    missing = dataframe[dataframe[column_name].isna()]['last_start_time'].tolist() #start times of the missing rows
    missing_seq = [] #The list with missing sequences
    for start_time in missing:
        if missing_seq and start_time - missing_seq[-1][-1] == step:
            #directly follows the previous missing point: extend the current sequence
            missing_seq[-1].append(start_time)
        else:
            #first missing point, or a gap in between: start a new sequence
            missing_seq.append([start_time])
    return missing_seq

#def handle_missing_data(cut_of_point,algorithm_missing_low,algorithm_missing_high)
#missing_seq = get_sequence_of_missing_data(data,'BNBBTC__technical_analysis_candles__sma_100')

<b>Get the sequences of missing data per column and analyze the gaps</b>

In [120]:
#For every column: the list of sequences of consecutive missing timestamps
all_sequences = {
    column_name: get_sequence_of_missing_data(data,column_name)
    for column_name in data.columns
}
    

In [121]:
#For every column: the length (number of rows) of each sequence of missing data
last_data_point = data['last_start_time'].iloc[-1]
count_missing = {
    column_name: [len(sequence) for sequence in sequences]
    for column_name, sequences in all_sequences.items()
}
            



<b>Remove the columns that have a higher percentage of missing data than is allowed, and log which columns were removed</b>


In [122]:
#Fraction of missing rows per column; columns at or above the allowed maximum are dropped and logged
max_fraction_missing = settings["max_percentage_missing"]/100
row_count = len(data)
summary_totals = sorted(
    ((key, sum(value)/row_count) for key, value in count_missing.items()
     if sum(value)/row_count >= max_fraction_missing),
    key=lambda i:i[1],
    reverse=True)
data = drop_all_columns(data,[x[0] for x in summary_totals])
write_as_log('removed_to_much_missing_data',
             f'columns_removed_more_than_{settings["max_percentage_missing"]}_missing',
             summary_totals)

In [123]:
summary_totals

[('BTC__social_media_data__General_Points', 0.6947344302673598),
 ('BTC__social_media_data__CryptoCompare_Points', 0.6947344302673598),
 ('BTC__social_media_data__CryptoCompare_Followers', 0.6947344302673598),
 ('BTC__social_media_data__CryptoCompare_Posts', 0.6947344302673598),
 ('BTC__social_media_data__CryptoCompare_Comments', 0.6947344302673598),
 ('BTC__social_media_data__CryptoCompare_PageViews', 0.6947344302673598),
 ('BTC__social_media_data__Twitter_Points', 0.6947344302673598),
 ('BTC__social_media_data__Twitter_account_creation', 0.6947344302673598),
 ('BTC__social_media_data__Twitter_followers', 0.6947344302673598),
 ('BTC__social_media_data__Twitter_statuses', 0.6947344302673598),
 ('BTC__social_media_data__Twitter_lists', 0.6947344302673598),
 ('BTC__social_media_data__Twitter_favourites', 0.6947344302673598),
 ('BTC__social_media_data__Twitter_following', 0.6947344302673598),
 ('BTC__social_media_data__Reddit_Points', 0.6947344302673598),
 ('BTC__social_media_data__Reddit

In [124]:
# missing_value_pd = pd.DataFrame()
# for (key,value) in summary_totals:
# #     print(round((value*100),4))
# #     missing_value_pd[key] = value

In [125]:
#Gap lengths per remaining column, the columns with the most missing rows first.
#The names of the dropped columns are put in a set once, instead of rebuilding a list for every column.
dropped_columns = {name for name, _ in summary_totals}
total_missing = sorted(count_missing.items(),key=lambda i:sum(i[1]),reverse=True)
total_missing = [entry for entry in total_missing if entry[0] not in dropped_columns]
write_as_log('Check missing per column',
             'All the gaps in the dataset per column',
             total_missing)

<b>Categorize the columns based on whether they have a gap greater than the allowed maximum. Afterwards: show the graph before imputation --> do the imputation with the algorithm given for easy/difficult columns --> show the graph after imputation</b>


In [126]:
#Label every column by its longest gap:
#"difficult" when at least one gap is longer than settings['greater_than_long_missing'], otherwise "easy"
difficulty_per_column = {
    column_name: "difficult" if any(gap > settings['greater_than_long_missing'] for gap in gaps) else "easy"
    for column_name, gaps in total_missing
}

In [127]:
#Name of the open_time ticker column of the configured coin.
#NOTE(review): y is only displayed in the next cell; no other visible cell reads it.
y=f"{settings['coin']}__ticker_info__open_time"

In [128]:
y

'ETHBTC__ticker_info__open_time'

In [129]:
import plotly.express as px

#Line plot of the Nasdaq index column against the timestamps, drawn before the imputation cell is run
fig = px.line(data, x='last_start_time', y='general_info__stock_exchange_index__Nasdaq')   #data[column_and_amount_of_nans.index[n]])    #X=timestamp_x
#tilt the tick labels and give the x axis a title
fig.update_layout(
    xaxis = dict(
        tickangle = 45,
        title_text = "timestamps",
        title_font = {"size": 15}))
fig.show()


In [131]:
#def neareast_neighbor_imputation():
import numpy as np
from sklearn.impute import KNNImputer

def interpolate_linear(column):
    '''
    Linearly interpolate the missing values of one column of the global ``data`` frame.
    Returns the interpolated values as an array; ``data`` itself is not changed.
    NOTE(review): with pandas' default settings, NaN values before the first valid
    value are presumably left as NaN - confirm if leading gaps matter.
    '''
    series = data[column]
    interpolated = series.interpolate(method='linear')
    return interpolated.values

def interpolate_nearest_neigbor(column,neigbors):
    '''
    Impute the missing values of one column of the global ``data`` frame with KNNImputer.

    Every row is a sample with two features: last_start_time and the column itself.
    For a row with a missing value the distance to other rows is therefore based on the
    timestamp, and the gap is filled with the mean of the column over the ``neigbors``
    rows that are closest in time and do have a value.

    (Passing the column as one single sample, as was done before, makes every missing
    position an empty feature that KNNImputer drops, so the result was shorter than the column.)

        column: name of the column to impute; it needs at least one non-missing value
        neigbors: number of neighbouring rows to average
    Returns the imputed values as an array with one entry per row of ``data``.
    '''
    imputer = KNNImputer(n_neighbors=neigbors)
    features = data[['last_start_time', column]].values
    values = imputer.fit_transform(features)[:, 1]
    return values
    

def interpolate_column(column,settings):
    '''
    Impute one column with the algorithm described by ``settings``.
        column: name of the column to impute
        settings: dict of one algorithm, e.g. {'name':'linear'} or
                  {'name':'nearest_neigbor', 'neighbors':10}
    Returns the imputed values of the column.
    Raises a TypeError when the algorithm name is unknown.
    '''
    method = settings['name']
    if method == 'linear':
        return interpolate_linear(column)
    if method == 'nearest_neigbor':
        return interpolate_nearest_neigbor(column, settings['neighbors'])
    raise TypeError("Type of interpolation not yet programmed")
    
#Impute every categorized column with the algorithm that the settings give for its difficulty
algorithm_setting = {'difficult': 'difficult_algorithm', 'easy': 'easy_algorithm'}
for key, value in difficulty_per_column.items():
    if value not in algorithm_setting:
        raise TypeError("Typo")
    data_column = interpolate_column(key, settings[algorithm_setting[value]])
    data[key] = data_column

In [132]:
import plotly.express as px

#Same line plot of the Nasdaq index column against the timestamps, now drawn after the imputation cell
fig = px.line(data, x='last_start_time', y='general_info__stock_exchange_index__Nasdaq')   #data[column_and_amount_of_nans.index[n]])    #X=timestamp_x
#tilt the tick labels and give the x axis a title
fig.update_layout(
    xaxis = dict(
        tickangle = 45,
        title_text = "timestamps",
        title_font = {"size": 15}))
fig.show()


#### delete variables that are constant in dataset


In [25]:
data.columns[data.nunique() <= 1]

Index(['ETHBTC__general_info__market_cap_rank',
       'ETHBTC__general_info__total_supply', 'ETHBTC__general_info__ath',
       'general_info__hash_rate_streamer_bitcoin__block_reward',
       'general_info__stock_exchange_index__S_P_CLX_IPSA',
       'general_info__exchange_info__Bitcoin'],
      dtype='object')

In [26]:
#Keep only the columns with more than one distinct value.
#"> 1" (instead of "!= 1") also removes columns without any non-NaN value, so the filter matches
#the `data.nunique() <= 1` listing of the previous cell.
data = data.loc[:, data.nunique() > 1]
len(data.columns)

344

# Part 5: Feature selection

In [27]:
#Target value of every row: the close price of the following minute.
#shift(-1) moves the value of the next row up into the current row (shift(1) gives the previous minute instead).
data['close_price_next_min'] = data[f"{settings['coin']}__ticker_info__close_price"].shift(-1) #make predictive val
data = data.iloc[:-1] #remove last row: it has no following minute, so its predictive val is nan

In [28]:
#Options for the feature selection part.
#The visible cells only read max_vif_value, max_avg_multicor and min_correlation_with_y (to build log file names).
#NOTE(review): presumably all keys are thresholds/switches for correlation and VIF filtering code
#that is not part of the visible cells - confirm their exact meaning there.
settings_feature_selection = {'min_correlation_with_y':0.1,
                              'max_correlation_with_other_x':0.75,
                              'max_avg_multicor': 10,
                             'action_correlation':'drop_random',
                             'max_vif_value':5,
                              "normalize_before_corr":False,
                             "algorithms_feature_selection":['trees','temp']}

<h3> Normalize data </h3>

In [29]:
import pandas as pd
from sklearn import preprocessing

#Scale every column of data (all columns are passed to the scaler) with a MinMaxScaler using its default settings
x = data.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
#The new frame keeps the column names but gets a fresh 0-based index.
#NOTE(review): no visible cell writes data_normalized to disk, although a later cell reads
#'data_after_cleaning_norm.csv' - confirm where that file is produced.
data_normalized = pd.DataFrame(x_scaled,columns=data.columns)


In [3]:
import pandas as pd
#NOTE(review): no visible cell writes this file (the recorded run ended in a FileNotFoundError) - confirm where it is produced
data = pd.read_csv('data_after_cleaning_norm.csv')

#Work on a copy: a plain `data2 = data` is only a second name for the same frame,
#so the helper columns below would be added to `data` as well
data2 = data.copy()
data2['difference'] = data2['close_price_next_min']-data2['ETHBTC__ticker_info__close_price']
#1 when close_price_next_min is above the current close price, otherwise 0 (also when the difference is NaN)
data2['dummy_next_start_time'] = (data2['difference'] > 0).astype('int64')
data2 = data2.drop(columns=['difference','close_price_next_min','last_start_time'])

#data2 already contains the dummy column, so it is not appended a second time
columns = list(data2.columns)
#all feature columns: everything except the dummy target
final_columns = [column for column in columns if column!="dummy_next_start_time"]

FileNotFoundError: [Errno 2] File b'data_after_cleaning_norm.csv' does not exist: b'data_after_cleaning_norm.csv'

#### run this when you have max_vif limitation

In [None]:
#`error` is not defined in any visible cell: evaluating it raises a NameError, so the statements
#below only run after this line is removed.
#NOTE(review): presumably a deliberate guard to keep "Run All" from executing this cell - confirm.
error 

#Log the removed columns; the title holds the coin, the max VIF value and the min correlation with y.
#NOTE(review): removed_columns and vif are not defined in any visible cell - confirm where they are computed.
write_as_log(f'{settings["coin"]}_removed_VIF_higher_than_{settings_feature_selection["max_vif_value"]}_corr_y_{settings_feature_selection["min_correlation_with_y"]}',
             f'columns_removed_with_more_than_{settings_feature_selection["max_vif_value"]}_as_a_max_vif_score',
             removed_columns)

#Save the VIF scores, highest first, as a csv file in the log folder of the coin
vif.sort_values("VIF Factor",ascending=False).to_csv(f'logs/log_lex/{settings["coin"]}/{settings["coin"]}_scores_after_max_VIF_{settings_feature_selection["max_vif_value"]}_corr_y_{settings_feature_selection["min_correlation_with_y"]}.csv',
                                                     index=False)                                       

#### run this if you have max avg vif limitation

In [None]:
#`error` is not defined in any visible cell: evaluating it raises a NameError, so the statements
#below only run after this line is removed.
#NOTE(review): presumably a deliberate guard to keep "Run All" from executing this cell - confirm.
error

#The title follows the '<coin>_removed_VIF_avg_higher_<n>_corr_y_<m>' pattern, because that is the
#file name the comparison cells further down read back.
#NOTE(review): removed_columns and vif are not defined in any visible cell - confirm where they are computed.
write_as_log(f'{settings["coin"]}_removed_VIF_avg_higher_{settings_feature_selection["max_avg_multicor"]}_corr_y_{settings_feature_selection["min_correlation_with_y"]}',
             f'columns_removed_more_than_{settings_feature_selection["max_avg_multicor"]} as multicor',
             removed_columns)

#Save the VIF scores, highest first, in the log folder of the coin (same folder as the max VIF scores file)
vif.sort_values("VIF Factor",ascending=False).to_csv(f'logs/log_lex/{settings["coin"]}/{settings["coin"]}_scores_after_avg_VIF_{settings_feature_selection["max_avg_multicor"]}_corr_y_{settings_feature_selection["min_correlation_with_y"]}.csv',index=False)

### Compare the VIF scores with each other and check which columns are kept everywhere

In [None]:
# import os module 
import os 
import glob


def extract_list_from_txt_file(textfile, directory="C:/Users/lexfo/PycharmProjects/thesis_lex/logs/log_lex/ETHBTC"):
    '''
    Read a log file that holds one tuple per line and return the first element of every tuple.

    Only lines that start with (' are used; all quotes are removed and the text before the
    first comma is kept. Other lines (header message, empty lines) are skipped.

        textfile: name of the log file, looked up inside ``directory``
        directory: folder with the log files; the default is the location that was hard-coded before
    Returns a list of strings.
    '''
    lijst = []
    #the path is joined instead of calling os.chdir(): changing the working directory
    #breaks every relative path ('data/...', 'logs/...') used by the other cells
    with open(os.path.join(directory, textfile), "r") as myfile:
        for myline in myfile:
            if myline.startswith("('"):
                first_field = myline.replace("'","").strip(" ()").split(",")[0]
                lijst.append(first_field)
    return lijst

In [None]:
import glob
def get_all_text_files(pattern='*0.1.txt', directory="C:/Users/lexfo/PycharmProjects/thesis_lex/logs/log_lex/ETHBTC", prefix=None):
    '''
    List the log files in ``directory`` that match ``pattern`` and start with ``prefix``.

        pattern: glob pattern for the file names
        directory: folder with the log files; the default is the location that was hard-coded before
        prefix: required start of the file name; None means the coin from the settings
    Returns the bare file names (without the folder).
    '''
    if prefix is None:
        prefix = f'{settings["coin"]}'
    #search inside the folder instead of calling os.chdir(): changing the working directory
    #breaks every relative path ('data/...', 'logs/...') used by the other cells
    matches = glob.glob(os.path.join(glob.escape(directory), pattern))
    return [name for name in map(os.path.basename, matches) if name.startswith(prefix)]
text_files = get_all_text_files()
text_files

In [None]:
#Column names listed in the four log files for a minimum correlation of 0.1
#(per the file names: max VIF 5 and 10, average VIF 5 and 10)
max_5_0_1 = extract_list_from_txt_file("ETHBTC_removed_VIF_higher_than_5_corr_y_0.1.txt")
max_10_0_1 = extract_list_from_txt_file("ETHBTC_removed_VIF_higher_than_10_corr_y_0.1.txt")
avg_5_0_1 = extract_list_from_txt_file("ETHBTC_removed_VIF_avg_higher_5_corr_y_0.1.txt")
avg_10_0_1 = extract_list_from_txt_file("ETHBTC_removed_VIF_avg_higher_10_corr_y_0.1.txt")
len(max_5_0_1)

In [None]:
#Per variant: the entries of final_data_with_corr_higher_than_10 that are NOT listed in the log file of that variant.
#NOTE(review): final_data_with_corr_higher_than_10 is not defined in any visible cell - confirm where it comes from.
over_na_max_5_0_1= [x for x in final_data_with_corr_higher_than_10 if x not in max_5_0_1]
over_na_max_10_0_1 =[x for x in final_data_with_corr_higher_than_10 if x not in max_10_0_1]
over_na_avg_5_0_1= [x for x in final_data_with_corr_higher_than_10 if x not in avg_5_0_1]
over_na_avg_10_0_1 =[x for x in final_data_with_corr_higher_than_10 if x not in avg_10_0_1]

In [None]:
from collections import Counter

#Number of times every column occurs in the four lists of remaining columns
alles_0_1 = over_na_max_5_0_1+over_na_max_10_0_1+over_na_avg_5_0_1+over_na_avg_10_0_1
# print(len(alles_0_1))
c = list(alles_0_1)
#Counter tallies in one pass; calling c.count(item) for every item rescans the whole list each time
d = dict(Counter(c))
d

In [None]:
# import os module 
import os 
import glob


def extract_list_from_txt_file(textfile, directory="C:/Users/lexfo/PycharmProjects/thesis_lex/logs/log_lex/ETHBTC"):
    '''
    Read a log file that holds one tuple per line and return the first element of every tuple.

    Only lines that start with (' are used; all quotes are removed and the text before the
    first comma is kept. Other lines (header message, empty lines) are skipped.

        textfile: name of the log file, looked up inside ``directory``
        directory: folder with the log files; the default is the location that was hard-coded before
    Returns a list of strings.
    '''
    lijst = []
    #the path is joined instead of calling os.chdir(): changing the working directory
    #breaks every relative path ('data/...', 'logs/...') used by the other cells
    with open(os.path.join(directory, textfile), "r") as myfile:
        for myline in myfile:
            if myline.startswith("('"):
                first_field = myline.replace("'","").strip(" ()").split(",")[0]
                lijst.append(first_field)
    return lijst

In [None]:
import glob
def get_all_text_files(pattern='*0.2.txt', directory="C:/Users/lexfo/PycharmProjects/thesis_lex/logs/log_lex/ETHBTC", prefix=None):
    '''
    List the log files in ``directory`` that match ``pattern`` and start with ``prefix``.

        pattern: glob pattern for the file names
        directory: folder with the log files; the default is the location that was hard-coded before
        prefix: required start of the file name; None means the coin from the settings
    Returns the bare file names (without the folder).
    '''
    if prefix is None:
        prefix = f'{settings["coin"]}'
    #search inside the folder instead of calling os.chdir(): changing the working directory
    #breaks every relative path ('data/...', 'logs/...') used by the other cells
    matches = glob.glob(os.path.join(glob.escape(directory), pattern))
    return [name for name in map(os.path.basename, matches) if name.startswith(prefix)]
text_files = get_all_text_files()
text_files

In [None]:
#Column names listed in the four log files for a minimum correlation of 0.2
#(per the file names: max VIF 5 and 10, average VIF 5 and 10)
max_5_0_2 = extract_list_from_txt_file("ETHBTC_removed_VIF_higher_than_5_corr_y_0.2.txt")
max_10_0_2 = extract_list_from_txt_file("ETHBTC_removed_VIF_higher_than_10_corr_y_0.2.txt")
avg_5_0_2 = extract_list_from_txt_file("ETHBTC_removed_VIF_avg_higher_5_corr_y_0.2.txt")
avg_10_0_2 = extract_list_from_txt_file("ETHBTC_removed_VIF_avg_higher_10_corr_y_0.2.txt")

len(avg_10_0_2)

In [None]:
#Per variant: the entries of final_data_with_corr_higher_than_20 that are NOT listed in the log file of that variant.
#NOTE(review): final_data_with_corr_higher_than_20 is not defined in any visible cell - confirm where it comes from.
over_na_max_5_0_2= [x for x in final_data_with_corr_higher_than_20 if x not in max_5_0_2]
over_na_max_10_0_2 =[x for x in final_data_with_corr_higher_than_20 if x not in max_10_0_2]
over_na_avg_5_0_2 = [x for x in final_data_with_corr_higher_than_20 if x not in avg_5_0_2]
over_na_avg_10_0_2 =[x for x in final_data_with_corr_higher_than_20 if x not in avg_10_0_2]

In [None]:
from collections import Counter

#Number of times every column occurs in the four lists of remaining columns
alles_0_2 = over_na_max_5_0_2+over_na_max_10_0_2+over_na_avg_5_0_2+over_na_avg_10_0_2
# print(len(alles_0_1))
c2 = list(alles_0_2)
#Counter tallies in one pass; calling c2.count(item) for every item rescans the whole list each time
d2 = dict(Counter(c2))
d2