In [10]:
import pandas as pd
import numpy as np
import yfinance as yf
from scipy.stats import skew, kurtosis

In [11]:
# Function to extract stock codes from a CSV file
def extract_stock_codes(file_path):
    try:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)

        # Extract the "Stock Code" column into a list
        stock_codes_list = df['Symbol'].astype(str).tolist()

        return stock_codes_list
    except Exception as e:
        print(f"Error: {e}")
        return None

In [12]:
# Function to fetch stock data using Yahoo Finance API
def get_stock_data_list(stock_codes, period='1y'):
    all_stock_data = []
    
    for stock_code in stock_codes:
        try:
            yahoo_ticker = stock_code + '.NS'
            stock = yf.Ticker(yahoo_ticker)
            data = stock.history(period=period)
            if not data.empty:
                stock_data = {
                    'Date': data.index.tolist(),
                    'Ticker': stock_code,
                    'Open': data['Open'].tolist(),
                    'High': data['High'].tolist(),
                    'Low': data['Low'].tolist(),
                    'Close': data['Close'].tolist(),
                    'Volume': data['Volume'].tolist(),
                    'Sector': stock.info.get('industry', 'N/A')
                }
                all_stock_data.append(pd.DataFrame(stock_data))
            else:
                print(f"No data available for {stock_code}")
        except Exception as e:
            print(f"Error fetching data for {stock_code}: {e}")
    
    return pd.concat(all_stock_data, ignore_index=True)

In [13]:
# Function to create time series dataset
def create_timeseries_dataset_static(df, window_size=180):
    dataset = []

    for ticker in df['Ticker'].unique():
        sub_df = df[df['Ticker'] == ticker].sort_values('Date').reset_index(drop=True)
        prices = sub_df['Close'].values
        dates = sub_df['Date'].values

        for i in range(len(prices) - window_size):
            window = prices[i:i+window_size]
            next_day = prices[i+window_size]

            last_day_in_window = window[-1]
            direction = 1 if next_day > last_day_in_window else -1
            pct_change = ((next_day - last_day_in_window) / last_day_in_window) * 100

            dataset.append({
                'Ticker': ticker,
                'MinDate': dates[i],
                'MaxDate': dates[i + window_size - 1],
                'TargetDate': dates[i + window_size],
                'Features': window.tolist(),
                'TargetValue': next_day,
                'Direction': direction,
                'PctChange': pct_change
            })

    return pd.DataFrame(dataset)

In [14]:
def create_timeseries_dataset_series(df, window_size=1):
    dataset = []

    for ticker in df['Ticker'].unique():
        sub_df = df[df['Ticker'] == ticker].sort_values('Date').reset_index(drop=True)
        prices = sub_df['Close'].values
        dates = sub_df['Date'].values

        # Start from the minimum window size and grow the window
        for i in range(window_size, len(prices)):
            window = prices[0:i]           # Growing window from the start
            next_day = prices[i]           # Prediction target is the next value

            last_day_in_window = window[-1]
            direction = 1 if next_day > last_day_in_window else -1
            pct_change = ((next_day - last_day_in_window) / last_day_in_window) * 100

            dataset.append({
                'Ticker': ticker,
                'MinDate': dates[0],
                'MaxDate': dates[i - 1],
                'TargetDate': dates[i],
                'Features': window.tolist(),
                'TargetValue': next_day,
                'Direction': direction,
                'PctChange': pct_change
            })

    return pd.DataFrame(dataset)


In [15]:
# Function to add statistical features
def add_stock_features(df):
    stats = {
        'Mean': [],
        'Median': [],
        'StdDev': [],
        'Variance': [],
        'Min': [],
        'Max': [],
        'Q1': [],
        'Q3': [],
        'IQR': [],
        'OutlierCount': [],
        'Momentum': [],
        'Range': [],
        'Skewness': [],
        'Kurtosis': []
    }

    for row in df['Features']:
        arr = np.array(row)
        q1 = np.percentile(arr, 25)
        q3 = np.percentile(arr, 75)
        iqr_val = q3 - q1
        outliers = arr[(arr < q1 - 1.5 * iqr_val) | (arr > q3 + 1.5 * iqr_val)]

        stats['Mean'].append(np.mean(arr))
        stats['Median'].append(np.median(arr))
        stats['StdDev'].append(np.std(arr))
        stats['Variance'].append(np.var(arr))
        stats['Min'].append(np.min(arr))
        stats['Max'].append(np.max(arr))
        stats['Q1'].append(q1)
        stats['Q3'].append(q3)
        stats['IQR'].append(iqr_val)
        stats['OutlierCount'].append(len(outliers))
        stats['Momentum'].append(arr[-1] - arr[0])
        stats['Range'].append(np.max(arr) - np.min(arr))
        stats['Skewness'].append(skew(arr))
        stats['Kurtosis'].append(kurtosis(arr))

    for key, values in stats.items():
        df[key] = values

    return df

### Explanation of the Code:
#### extract_stock_codes: Extracts stock symbols from a CSV file containing a list of companies (Nifty 100 in this case).
#### get_stock_data_list: Fetches historical stock data from Yahoo Finance using the yfinance library.
#### create_timeseries_dataset: Generates a time-series dataset where the Features are the stock prices for the previous n-1 days, and the TargetValue is the next day's closing price. It also calculates whether the next day will be positive or negative (based on price movement).
#### add_stock_features: Adds various statistical features to the dataset, such as mean, standard deviation, variance, skewness, kurtosis, momentum, price range, and outlier detection based on the 1.5 IQR rule.
#### Combining everything: The script reads stock data, creates a time series, adds statistical features, and then displays the enriched dataset.

In [16]:
# File path for stock data
file_path = 'ind_niftysmallcap250list.csv'

# 1. Extract stock codes
stock_codes_list = extract_stock_codes(file_path)
stock_codes_list = stock_codes_list[0:1]
stock_codes_list

['360ONE']

In [17]:
# 2. Fetch historical stock data
stock_data_df = get_stock_data_list(stock_codes_list, period='100000d')
print(stock_data_df.shape)
stock_data_df.head()

(1389, 8)


Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Sector
0,2019-09-19 00:00:00+05:30,360ONE,248.479731,260.903717,248.479731,260.903717,7109860,Asset Management
1,2019-09-20 00:00:00+05:30,360ONE,272.917041,273.943817,272.917041,273.943817,2760648,Asset Management
2,2019-09-23 00:00:00+05:30,360ONE,287.640991,287.640991,287.640991,287.640991,34096,Asset Management
3,2019-09-24 00:00:00+05:30,360ONE,302.015869,302.015869,302.015869,302.015869,2496652,Asset Management
4,2019-09-25 00:00:00+05:30,360ONE,317.109406,317.109406,286.9222,288.513702,530580,Asset Management


In [18]:
# 3. Create time series dataset with last n-1 days as features
#base_dataset = create_timeseries_dataset_static(stock_data_df, window_size=180)
base_dataset = create_timeseries_dataset_series(stock_data_df, window_size=180)
print(base_dataset.shape)
base_dataset.head()

(1209, 8)


Unnamed: 0,Ticker,MinDate,MaxDate,TargetDate,Features,TargetValue,Direction,PctChange
0,360ONE,2019-09-18 18:30:00,2020-06-14 18:30:00,2020-06-15 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",202.839661,-1,-4.823874
1,360ONE,2019-09-18 18:30:00,2020-06-15 18:30:00,2020-06-16 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.647324,1,2.863179
2,360ONE,2019-09-18 18:30:00,2020-06-16 18:30:00,2020-06-17 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",207.865265,-1,-0.374823
3,360ONE,2019-09-18 18:30:00,2020-06-17 18:30:00,2020-06-18 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.240631,1,0.180581
4,360ONE,2019-09-18 18:30:00,2020-06-18 18:30:00,2020-06-21 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",203.048187,-1,-2.493483


In [19]:
# 4. Add statistical features to the dataset
#enriched_dataset = add_stock_features(base_dataset)
#enriched_dataset.head()

In [20]:
base_dataset.Ticker.value_counts()

Ticker
360ONE    1209
Name: count, dtype: int64

## Embedding Creation

In [21]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize SentenceTransformer model (SBERT)
model = SentenceTransformer('all-MiniLM-L6-v2')  # This is a lightweight model, you can use larger models if needed


# Function to generate embeddings for the list of numbers in 'Features'
def get_embedding(numbers_list):
    # Convert the numerical list to a string (SBERT works well with text input, but numbers can be treated as text)
    return model.encode([str(numbers_list)])

# Apply the function to the 'Features' column to generate embeddings
base_dataset['embedding'] = base_dataset['Features'].apply(get_embedding)
base_dataset.head()




Unnamed: 0,Ticker,MinDate,MaxDate,TargetDate,Features,TargetValue,Direction,PctChange,embedding
0,360ONE,2019-09-18 18:30:00,2020-06-14 18:30:00,2020-06-15 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",202.839661,-1,-4.823874,"[[0.058084052, -0.029030701, 0.010628777, -0.0..."
1,360ONE,2019-09-18 18:30:00,2020-06-15 18:30:00,2020-06-16 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.647324,1,2.863179,"[[0.058084052, -0.029030701, 0.010628777, -0.0..."
2,360ONE,2019-09-18 18:30:00,2020-06-16 18:30:00,2020-06-17 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",207.865265,-1,-0.374823,"[[0.058084052, -0.029030701, 0.010628777, -0.0..."
3,360ONE,2019-09-18 18:30:00,2020-06-17 18:30:00,2020-06-18 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.240631,1,0.180581,"[[0.058084052, -0.029030701, 0.010628777, -0.0..."
4,360ONE,2019-09-18 18:30:00,2020-06-18 18:30:00,2020-06-21 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",203.048187,-1,-2.493483,"[[0.058084052, -0.029030701, 0.010628777, -0.0..."


In [22]:
# Save the length of each embedding into a new column
base_dataset['embedding_length'] = base_dataset['embedding'].apply(lambda x: x.shape[1] if len(x.shape) > 1 else x.shape[0])
base_dataset.head()

Unnamed: 0,Ticker,MinDate,MaxDate,TargetDate,Features,TargetValue,Direction,PctChange,embedding,embedding_length
0,360ONE,2019-09-18 18:30:00,2020-06-14 18:30:00,2020-06-15 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",202.839661,-1,-4.823874,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384
1,360ONE,2019-09-18 18:30:00,2020-06-15 18:30:00,2020-06-16 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.647324,1,2.863179,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384
2,360ONE,2019-09-18 18:30:00,2020-06-16 18:30:00,2020-06-17 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",207.865265,-1,-0.374823,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384
3,360ONE,2019-09-18 18:30:00,2020-06-17 18:30:00,2020-06-18 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.240631,1,0.180581,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384
4,360ONE,2019-09-18 18:30:00,2020-06-18 18:30:00,2020-06-21 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",203.048187,-1,-2.493483,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384


## Search of Right Match

In [23]:
# Now, let's compare the similarity of the first embedding with all others
embedding_n = base_dataset['embedding'][2]  # Reference embedding (first row)

# Calculate cosine similarity between the reference embedding and all others
similarities = [cosine_similarity(embedding_n.reshape(1, -1), emb.reshape(1, -1))[0][0] for emb in base_dataset['embedding']]

# Add the similarity scores to the dataframe
base_dataset['similarity'] = similarities

# Sort the DataFrame based on similarity scores to find the most similar series
#base_dataset_sorted = base_dataset.sort_values(by='similarity', ascending=False)

# Display the sorted DataFrame to see the most similar series
base_dataset.head()

Unnamed: 0,Ticker,MinDate,MaxDate,TargetDate,Features,TargetValue,Direction,PctChange,embedding,embedding_length,similarity
0,360ONE,2019-09-18 18:30:00,2020-06-14 18:30:00,2020-06-15 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",202.839661,-1,-4.823874,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0
1,360ONE,2019-09-18 18:30:00,2020-06-15 18:30:00,2020-06-16 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.647324,1,2.863179,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0
2,360ONE,2019-09-18 18:30:00,2020-06-16 18:30:00,2020-06-17 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",207.865265,-1,-0.374823,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0
3,360ONE,2019-09-18 18:30:00,2020-06-17 18:30:00,2020-06-18 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.240631,1,0.180581,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0
4,360ONE,2019-09-18 18:30:00,2020-06-18 18:30:00,2020-06-21 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",203.048187,-1,-2.493483,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0


In [16]:
#Note: Embedding fails to catpure the pattern in the data!!!!!

## Creating Embedding Model

In [33]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [34]:
# ---------------------------
# 1. Extract Data from DataFrame
# ---------------------------
features_list = base_dataset['Features'].tolist()
targets = base_dataset['TargetValue'].tolist()

# ---------------------------
# 2. Preprocessing: Pad Sequences
# ---------------------------
max_seq_len = max(len(seq) for seq in features_list)
padded_features = pad_sequences(features_list, maxlen=max_seq_len, dtype='float32', padding='post')

# Add channel dimension for (batch, seq_len, 1)
padded_features = np.expand_dims(padded_features, axis=-1)
targets = np.array(targets, dtype=np.float32)

In [35]:
# ---------------------------
# 3. Subclassed Model Definition
# ---------------------------
class EmbeddingModel(tf.keras.Model):
    def __init__(self, d_model=64, num_heads=2, ff_dim=128, output_dim=32, **kwargs):
        super(EmbeddingModel, self).__init__(**kwargs)

        self.d_model = d_model
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.output_dim = output_dim

        self.time_distributed = layers.TimeDistributed(layers.Dense(d_model))
        self.masking = layers.Masking()
        self.attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)

        # Ensuring Dense layers are properly instantiated
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(d_model),
        ])
        self.global_pool = layers.GlobalAveragePooling1D()
        self.output_dense = layers.Dense(output_dim)

    def call(self, inputs):
        x = self.time_distributed(inputs)
        x = self.masking(x)
        attn_out = self.attn(x, x)
        x = self.ffn(attn_out)
        x = self.global_pool(x)
        return self.output_dense(x)

    def get_config(self):
        config = super(EmbeddingModel, self).get_config()
        config.update({
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'output_dim': self.output_dim,
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [None]:
# ---------------------------
# 4. Train and Save Model
# ---------------------------
model = EmbeddingModel()
model.compile(optimizer='adam', loss='mse')

# Ensure that the model works with the given data
model.fit(padded_features, targets, epochs=10, batch_size=1, verbose=1)

# Save model with the correct file extension
model_save_path = "embedding_model.keras"
model.save(model_save_path)

In [27]:
base_dataset.head()

Unnamed: 0,Ticker,MinDate,MaxDate,TargetDate,Features,TargetValue,Direction,PctChange,embedding,embedding_length,similarity
0,360ONE,2019-09-18 18:30:00,2020-06-14 18:30:00,2020-06-15 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",202.839661,-1,-4.823874,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0
1,360ONE,2019-09-18 18:30:00,2020-06-15 18:30:00,2020-06-16 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.647324,1,2.863179,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0
2,360ONE,2019-09-18 18:30:00,2020-06-16 18:30:00,2020-06-17 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",207.865265,-1,-0.374823,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0
3,360ONE,2019-09-18 18:30:00,2020-06-17 18:30:00,2020-06-18 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.240631,1,0.180581,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0
4,360ONE,2019-09-18 18:30:00,2020-06-18 18:30:00,2020-06-21 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",203.048187,-1,-2.493483,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0


In [28]:
# ---------------------------
# 5. Load Model and Use for Embedding
# ---------------------------
loaded_model = tf.keras.models.load_model(model_save_path, custom_objects={'EmbeddingModel': EmbeddingModel})

# Function to generate embeddings for the list of numbers in 'Features'
def get_embedding(numbers_list):
    padded_input = pad_sequences([numbers_list], maxlen=max_seq_len, padding='post', dtype='float32')
    padded_input = np.expand_dims(padded_input, axis=-1)  # (1, seq_len, 1)
    return loaded_model.predict(padded_input).flatten()

# Apply the function to the 'Features' column to generate embeddings
base_dataset['embedding_local'] = base_dataset['Features'].apply(get_embedding)

# Display the updated DataFrame with embeddings
base_dataset.head()



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71

Unnamed: 0,Ticker,MinDate,MaxDate,TargetDate,Features,TargetValue,Direction,PctChange,embedding,embedding_length,similarity,embedding_local
0,360ONE,2019-09-18 18:30:00,2020-06-14 18:30:00,2020-06-15 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",202.839661,-1,-4.823874,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.8176, 239.53238, 238.27144, 239.26614, 23..."
1,360ONE,2019-09-18 18:30:00,2020-06-15 18:30:00,2020-06-16 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.647324,1,2.863179,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.80408, 239.5182, 238.25784, 239.25157, 23..."
2,360ONE,2019-09-18 18:30:00,2020-06-16 18:30:00,2020-06-17 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",207.865265,-1,-0.374823,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.78485, 239.50012, 238.2394, 239.23271, 23..."
3,360ONE,2019-09-18 18:30:00,2020-06-17 18:30:00,2020-06-18 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.240631,1,0.180581,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.76703, 239.48096, 238.22171, 239.21593, 2..."
4,360ONE,2019-09-18 18:30:00,2020-06-18 18:30:00,2020-06-21 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",203.048187,-1,-2.493483,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.7491, 239.46346, 238.20332, 239.19756, 23..."


In [29]:
base_dataset.head()

Unnamed: 0,Ticker,MinDate,MaxDate,TargetDate,Features,TargetValue,Direction,PctChange,embedding,embedding_length,similarity,embedding_local
0,360ONE,2019-09-18 18:30:00,2020-06-14 18:30:00,2020-06-15 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",202.839661,-1,-4.823874,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.8176, 239.53238, 238.27144, 239.26614, 23..."
1,360ONE,2019-09-18 18:30:00,2020-06-15 18:30:00,2020-06-16 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.647324,1,2.863179,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.80408, 239.5182, 238.25784, 239.25157, 23..."
2,360ONE,2019-09-18 18:30:00,2020-06-16 18:30:00,2020-06-17 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",207.865265,-1,-0.374823,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.78485, 239.50012, 238.2394, 239.23271, 23..."
3,360ONE,2019-09-18 18:30:00,2020-06-17 18:30:00,2020-06-18 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.240631,1,0.180581,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.76703, 239.48096, 238.22171, 239.21593, 2..."
4,360ONE,2019-09-18 18:30:00,2020-06-18 18:30:00,2020-06-21 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",203.048187,-1,-2.493483,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.7491, 239.46346, 238.20332, 239.19756, 23..."


In [None]:
## Similarity Check

In [32]:
# Now, let's compare the similarity of the reference embedding with all others
embedding_n = base_dataset['embedding'][2]  # Reference embedding (third row)

# Calculate cosine similarity between the reference embedding and all others
similarities = [cosine_similarity(embedding_n.reshape(1, -1), emb.reshape(1, -1))[0][0] for emb in base_dataset['embedding']]

# Add the similarity scores to the DataFrame
base_dataset['similarity_local'] = similarities

base_dataset.head()

Unnamed: 0,Ticker,MinDate,MaxDate,TargetDate,Features,TargetValue,Direction,PctChange,embedding,embedding_length,similarity,embedding_local,similarity_local
0,360ONE,2019-09-18 18:30:00,2020-06-14 18:30:00,2020-06-15 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",202.839661,-1,-4.823874,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.8176, 239.53238, 238.27144, 239.26614, 23...",1.0
1,360ONE,2019-09-18 18:30:00,2020-06-15 18:30:00,2020-06-16 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.647324,1,2.863179,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.80408, 239.5182, 238.25784, 239.25157, 23...",1.0
2,360ONE,2019-09-18 18:30:00,2020-06-16 18:30:00,2020-06-17 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",207.865265,-1,-0.374823,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.78485, 239.50012, 238.2394, 239.23271, 23...",1.0
3,360ONE,2019-09-18 18:30:00,2020-06-17 18:30:00,2020-06-18 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",208.240631,1,0.180581,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.76703, 239.48096, 238.22171, 239.21593, 2...",1.0
4,360ONE,2019-09-18 18:30:00,2020-06-18 18:30:00,2020-06-21 18:30:00,"[260.9037170410156, 273.9438171386719, 287.640...",203.048187,-1,-2.493483,"[[0.058084052, -0.029030701, 0.010628777, -0.0...",384,1.0,"[239.7491, 239.46346, 238.20332, 239.19756, 23...",1.0
