In [24]:
#Plotting Library
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
from pyts.image import GramianAngularField
from pyts.datasets import load_gunpoint

#Data Manipulation Library
import pandas as pd
import numpy as np

#Runtime Library
from tqdm import tqdm

#Data Preprocessing Libraries
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [25]:
####-------------------- Full Kaggle Dataset Import ------------------------------------####

In [26]:
# Load the full price/technicals table and clean it in one chain:
#   - 'Unnamed: 0' is removed (presumably a leftover index column from an
#     earlier CSV export -- verify against the file),
#   - every row containing at least one missing value is discarded.
price_data = (
    pd.read_csv('/Users/jakob/OneDrive - Universitat Ramón Llull/TechLabsProj/cnn_input/cnn_fullpriceinput.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna(axis=0)
)

In [27]:
####-------------------- Additional Fake Features ------------------------------------####

In [28]:
def fill_up_features_count(feature_list):
    """
    Work out how many 'mock features' are needed to reach a square layout.

    Each feature is later drawn as one small square inside a larger square, so
    the number of plotted features must be a perfect square. One entry of
    feature_list is the return column (the target) and is not counted.

    Returns a tuple (padded_count, missing):
        padded_count -- smallest perfect square >= number of real features
        missing      -- how many mock features must be added to reach it
    """
    real_count = len(feature_list) - 1  # the return (target) column is not plotted

    # Walk upwards until the square root has no fractional part.
    padded_count = real_count
    while not np.sqrt(padded_count).is_integer():
        padded_count += 1

    return (padded_count, padded_count - real_count)

# to_fill is a 2-tuple: (padded feature count, number of mock columns to add).
# The column list still contains the 'return' target, which the function subtracts itself.
to_fill = fill_up_features_count(price_data.columns)

In [29]:
def fill_up_features(price_data, to_fill):
    """
    Append the required number of all-zero mock columns to the dataset.

    to_fill is the tuple produced by fill_up_features_count; only its second
    entry (the number of columns to add) is used. The columns are named
    'mock_0', 'mock_1', ... and are added to price_data itself, which is
    also returned.
    """
    mock_columns = [f'mock_{k}' for k in range(to_fill[1])]
    for column in mock_columns:
        price_data[column] = 0  # new columns are appended after the existing ones

    return price_data

# Pad the table with zero-valued mock columns (to_fill[1] of them). The function
# adds the columns to the frame it is given and returns that same frame.
price_data = fill_up_features(price_data, to_fill)

In [30]:
####------------------ Distributing & Exporting Train & Test Dataset ------------------####

In [31]:
df = price_data

# Select the feature columns by name instead of by position. Mock columns are
# appended at the end of the frame, so "everything except the last column" would
# keep 'return' (the target) among the features and drop a mock column instead
# whenever at least one mock column was added.
feature_frame = df.drop(columns=['return'])
list_features = list(feature_frame.columns)
print('Total number of features', len(list_features))

# 'return' is the target; 80/20 shuffled split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(feature_frame.values, df['return'].values, train_size=0.8,
                                                    test_size=0.2, random_state=2, shuffle=True)

Total number of features 36


In [32]:
# Write the target vectors to CSV for further processing.
# NOTE(review): index_label=False only suppresses the header label of the index
# column; the row index itself is still written (index defaults to True) -- confirm
# this is what the downstream reader expects.
y_train_frame = pd.DataFrame(y_train)
y_train_frame.to_csv('/Users/jakob/OneDrive - Universitat Ramón Llull/TechLabsProj/cnn_input/y_train.csv', index_label=False)

y_test_frame = pd.DataFrame(y_test)
y_test_frame.to_csv('/Users/jakob/OneDrive - Universitat Ramón Llull/TechLabsProj/cnn_input/y_test.csv', index_label=False)

In [33]:
####-------------------- Imaging Data as pyplot ------------------------------------####

In [34]:
def image_format(x, img_width, img_height):
    '''
    Reshape every flat feature row into a 2-D matrix that can be plotted.

    Parameters:
        x          -- array-like whose first axis indexes the samples; each
                      sample must hold exactly img_height * img_width values.
        img_width  -- number of columns of each output matrix.
        img_height -- number of rows of each output matrix.

    Returns a new float64 array of shape (len(x), img_height, img_width); the
    values of each row are laid out row by row (C order). The input is not
    modified and the result does not share memory with it.

    Raises ValueError when the number of values per sample does not match
    img_height * img_width.
    '''
    # np.array copies by default, so the result never aliases the caller's data.
    # A single reshape of the whole block replaces a Python-level loop over rows.
    samples = np.array(x, dtype=np.float64)
    return samples.reshape((samples.shape[0], img_height, img_width))

In [35]:
# Side length of each square image. The feature count is expected to be a
# perfect square here (that is what the mock-feature padding is for), so the
# square root is a whole number.
dim = int(np.sqrt(len(list_features)))

# NOTE: x_train / x_test are overwritten with their image versions, so this cell
# cannot be re-run on its own -- re-run the train/test split cell first.
# Each flat row becomes a dim x dim matrix, which is then repeated three times
# along a new last axis so that every value is present once per colour channel.
x_train = np.stack([image_format(x_train, dim, dim)] * 3, axis=-1)
x_test = np.stack([image_format(x_test, dim, dim)] * 3, axis=-1)

print("final shape of x, y train/test {} {} {} {}".format(x_train.shape, y_train.shape, x_test.shape, y_test.shape))

final shape of x, y train/test (320228, 6, 6, 3) (320228,) (80057, 6, 6, 3) (80057,)


In [36]:
def convert_to_img(input_matrix, name, y_mat,
                   output_root='/Users/jakob/OneDrive - Universitat Ramón Llull/TechLabsProj/cnn_input/pyplot_labelling'):
    """
    Render every sample as an image and save it into a label-specific folder.

    For each index i, input_matrix[i] is drawn with imshow (axes hidden, title
    '<name>_index_<i>') and saved as
        <output_root>/<name>/pos/<name>_index_<i>.png   if y_mat[i] > 0
        <output_root>/<name>/neg/<name>_index_<i>.png   otherwise
    so a label of exactly 0 is stored as negative.

    Parameters:
        input_matrix -- sequence of image arrays that imshow can draw.
        name         -- dataset name; used as sub-folder, file prefix and title.
        y_mat        -- indexable labels, y_mat[i] belonging to input_matrix[i].
        output_root  -- base folder for the images; defaults to the location
                        that was previously hard-coded. This function does not
                        create the '<name>/pos' and '<name>/neg' folders.

    Returns None.
    """
    # A single explicit figure is reused for all samples and cleared before each
    # one. Drawing through the implicit pyplot figure without clearing it stacks
    # one more image on the same axes per iteration, so every save has to render
    # all previous images as well and memory keeps growing.
    fig, ax = plt.subplots()
    fig.subplots_adjust(wspace=0.2, hspace=0.2)
    try:
        for i in tqdm(range(0, len(input_matrix))):  # iterating all rows
            ax.clear()  # removes the previous image; also resets title and axis visibility
            ax.imshow(input_matrix[i])  # drawing data as a 2d plot/figure
            ax.axis("off")
            ax.set_title(f'{name}_index_{i}', fontsize=10)

            label_dir = 'pos' if y_mat[i] > 0 else 'neg'
            fig.savefig(f'{output_root}/{name}/{label_dir}/{name}_index_{i}.png')
    finally:
        # Release the figure even if saving fails part-way through.
        plt.close(fig)
    return

In [None]:
# Render and save one PNG per training sample (one savefig call per row, so this
# cell is long-running on the full training set).
convert_to_img(x_train, 'x_train', y_train)

In [None]:
# Same export for the test split; images go to the 'x_test' pos/neg folders.
convert_to_img(x_test, 'x_test', y_test)