# Vector Embedding

In [None]:
pip install --upgrade pip

In [None]:
pip install gpudb==7.2.0.0b0 pycatch22 pandas pyarrow typeguard

In [None]:
import json
import math
import os

import gpudb
import pycatch22

Let's define our time-series vector embedding function using features from the catch22 library ([documentation](https://time-series-features.gitbook.io/catch22-features/)). The idea is to describe each segment of sensor data with a fixed set of summary features, so that similar segments end up with similar vectors.

In [None]:
def c22(ts_data: list) -> list:
    """Embed one time-series segment as a 22-element feature vector.

    Each element is the result of calling one pycatch22 feature function on
    ``ts_data``. The features are always evaluated in the order listed below,
    so a given position in the returned list always holds the same feature.

    Args:
        ts_data: The samples of the segment, passed unchanged to every
            pycatch22 feature function.

    Returns:
        A list with one value per feature, 22 values in total.
    """
    feature_names = (
        'CO_f1ecac',
        'CO_trev_1_num',
        'CO_FirstMin_ac',
        'CO_HistogramAMI_even_2_5',
        'DN_Mean',
        'DN_Spread_Std',
        'DN_HistogramMode_5',
        'DN_HistogramMode_10',
        'DN_OutlierInclude_n_001_mdrmd',
        'SB_BinaryStats_diff_longstretch0',
        'SB_BinaryStats_mean_longstretch1',
        'SB_MotifThree_quantile_hh',
        'SB_TransitionMatrix_3ac_sumdiagcov',
        'SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1',
        'SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1',
        'SP_Summaries_welch_rect_area_5_1',
        'SP_Summaries_welch_rect_centroid',
        'FC_LocalSimple_mean1_tauresrat',
        'FC_LocalSimple_mean3_stderr',
        'IN_AutoMutualInfoStats_40_gaussian_fmmi',
        'MD_hrv_classic_pnn40',
        'PD_PeriodicityWang_th0_01',
    )

    # Look each feature function up by name and apply it to the same samples.
    return [getattr(pycatch22, feature)(ts_data) for feature in feature_names]

Let's connect to our Kinetica database

In [None]:
# Connection settings can be overridden through the environment so that real
# credentials never have to be written into the notebook. The fallbacks are the
# values this notebook originally shipped with (presumably a shared demo
# account; replace them before pointing at any other instance).
db = gpudb.GPUdb(host=os.environ.get('KINETICA_URL', 'https://demo72.kinetica.com/_gpudb'),
                 username=os.environ.get('KINETICA_USER', 'gtc'),
                 password=os.environ.get('KINETICA_PASSWORD', 'Kinetica123!'))

Now, we need to create our table to store our vector embeddings using the Kinetica API

In [None]:
# Clear raceday.vectors first if it already exists.
if db.has_table("raceday.vectors")["table_exists"]:
    db.clear_table(table_name="raceday.vectors")

# Layout: the bucket timestamp, one 22-element vector column per sensor
# channel, and one 220-element column for the combined vector.
schema = (
    [["ts_bkt", "string", "datetime"]]
    + [
        [f"{channel}_Vec", "bytes", "vector(22)"]
        for channel in (
            "TireSlipRatioFl",
            "TireSlipRatioFr",
            "TireSlipRatioRl",
            "TireSlipRatioRr",
            "AccelerationX",
            "AccelerationY",
            "AccelerationZ",
            "VelocityX",
            "VelocityY",
            "VelocityZ",
        )
    ]
    + [["CombinedRace_Vec", "bytes", "vector(220)"]]
)

tableObj = gpudb.GPUdbTable(
    db=db,
    name="raceday.vectors",
    _type=schema,
    use_multihead_io=True,
    multihead_ingest_batch_size=5000,
)

Now that we have created our vector table, we need to loop through all the time-series data, group it into 5-second buckets, convert each sensor's samples in a bucket into a 22-dimensional embedding vector, and then insert the vectors into Kinetica.

In [None]:
# Sensor channels read from raceday.dash1, in select-list order. Channel i
# (0-based) comes back as "column_{i + 2}" in the JSON response ("column_1" is
# the bucket timestamp) and is stored in the "<name>_Vec" column of
# raceday.vectors.
SENSOR_COLUMNS = [
    'TireSlipRatioFl', 'TireSlipRatioFr', 'TireSlipRatioRl', 'TireSlipRatioRr',
    'AccelerationX', 'AccelerationY', 'AccelerationZ',
    'VelocityX', 'VelocityY', 'VelocityZ',
]

# ORDER BY makes offset-based paging deterministic and delivers the samples of
# every bucket in time order, which matters because several of the features
# computed by c22() depend on sample order.
telemetry_sql = f'''select
    time_bucket(interval 5 second, timestamp) as ts_bkt,
    {", ".join(SENSOR_COLUMNS)}
from
    raceday.dash1
order by
    timestamp'''

# ts_bkt -> {channel name -> raw samples}. Buckets are collected across ALL
# pages before anything is embedded, so a bucket that straddles a page boundary
# still produces exactly one row built from its complete set of samples.
# NOTE(review): this holds the selected columns of the whole table in memory.
buckets = {}
has_more = True
offset = 0
while has_more:
    try:
        result = db.execute_sql(telemetry_sql, encoding='json', offset=offset)
    except gpudb.GPUdbException as gpudberror:
        # Stop here: retrying the same offset forever would never terminate,
        # and embedding a partially fetched data set would be misleading.
        print(str(gpudberror))
        raise

    page = json.loads(result.json_encoded_response)
    bucket_keys = page['column_1']
    row_count = len(bucket_keys)

    # An empty page cannot advance the offset, so treat it as the end of data.
    has_more = bool(result.has_more_records) and row_count > 0
    offset += row_count

    for row in range(row_count):
        ts_bkt = bucket_keys[row]
        if ts_bkt not in buckets:
            buckets[ts_bkt] = {name: [] for name in SENSOR_COLUMNS}
        for position, name in enumerate(SENSOR_COLUMNS, start=2):
            buckets[ts_bkt][name].append(page[f'column_{position}'][row])

for ts_bkt, samples in buckets.items():
    record = {'ts_bkt': ts_bkt}
    combined = []
    for name in SENSOR_COLUMNS:
        # NaN and +/-inf are not valid JSON numbers, so store them as 0.
        vec = [x if math.isfinite(x) else 0 for x in c22(samples[name])]
        record[f'{name}_Vec'] = vec
        combined.extend(vec)
    # Concatenation of the per-channel vectors, in SENSOR_COLUMNS order.
    record['CombinedRace_Vec'] = combined

    db.insert_records_from_json(
        table_name='raceday.vectors',
        json_records=json.dumps(record))

# Kept from the original notebook. The records above are inserted through `db`,
# not through `tableObj`, so there is presumably nothing buffered to flush.
tableObj.flush_data_to_server()

# Kinetica LLM interactions
Here we will set up our LLM context. Consider adding or removing rules and/or samples to see how they change the generated SQL and its results. Example rules to try adding:
- high velocity turns have a Velocity along the Y axis of greater than 1.6
- straightaways have an Acceleration along the X axis of greater than 5

In [None]:
# Define the SQL context `raceday.raceday_ui_ctxt`, which the GENERATE SQL
# statement later in this notebook refers to by name. The statement holds:
#   - raceday.dash1 with natural-language RULES and column COMMENTS,
#   - raceday.videos with no extra annotations,
#   - one SAMPLES entry pairing a question with the SQL expected for it.
# Inside the SQL text, single quotes that belong to a string literal are
# escaped by doubling them ('').
sqlcontext = '''
CREATE OR REPLACE CONTEXT raceday.raceday_ui_ctxt
 (
     TABLE = raceday.dash1,
     RULES = (
         'crashes are likely when tire slip ratio is lower than -0.4 and acceleration along the X axis is less than 0 and velocity along the Y axis is lower than 0', 
         'to retrieve videos use an asof join between the raceday.dash1 timestamp column and raceday.videos ts column with a tolerance of 0 seconds before the event and 5 seconds after making the minimum match',
         'when I use the word "distinct" when asking for videos I mean to use the distinct sql function',
         'when I say "show me 1", I want you to add a "LIMIT 1 to the query"',
         'when I ask for 1 video, make sure that the you use the distinct function'),
     COMMENTS = (
         'EngineMaxRpm' = 'this is the maximum engine RPM',
         'timestamp' = 'this is the timestamp column'
     )
),
(
    TABLE = raceday.videos
),
(
    SAMPLES = (
        'show me videos similar to https://kinetica-raceday.s3.amazonaws.com/raceday1/forza_images/output-2-15.mp4' = 'with crashvid as (
    select * from raceday.videos where videopath = ''https://kinetica-raceday.s3.amazonaws.com/raceday1/forza_images/output-2-15.mp4''
),
query_vector as (
    select
    *
    from
        crashvid vid
        join raceday.vectors vec
        on asof(vid.ts, vec.ts_bkt, interval ''0'' seconds, interval ''5'' seconds, min)
),
vec_similar as (
    SELECT
        ts_bkt,
        cosine_distance(VelocityX_Vec,(select string(VelocityX_Vec) from query_vector)) as d1
    FROM
        raceday.vectors
    ORDER BY
        d1
    LIMIT
        5
)

select *
from
    raceday.videos vid
    join (select * from vec_similar) vec_s
    on asof(vid.ts, vec_s.ts_bkt, interval ''0'' seconds, interval ''5'' seconds, min)
order by d1'
    )
)
'''
# Submit the statement and print the status reported in the response.
response = db.execute_sql(sqlcontext)
print(response['status_info']['status'])

Now we can send our question to the LLM via the SQL API.

Some questions to ask:
- what is the average current engine RPM when acceleration along the X axis is between 0 and 1.5
- show a video where a crash is likely to occur
- show a video along a straightaway
- show me videos similar to https://kinetica-raceday.s3.amazonaws.com/raceday1/forza_images/output-2-15.mp4

In [None]:
question = '''show me videos where a crash is likely to occur'''

# The question is embedded in a single-quoted SQL string literal, so any single
# quote in it (e.g. "what's ...") must be doubled or it would end the literal
# early and break the statement.
escaped_question = question.replace("'", "''")
sql = f'''GENERATE SQL FOR '{escaped_question}' WITH OPTIONS (context_name = 'raceday.raceday_ui_ctxt');'''
response = db.execute_sql(sql, encoding='json')

# newQuery stays None unless the request succeeded; the next cell checks this
# before running anything.
newQuery = None
if response['status_info']['status'] == 'OK':
    # The generated SQL text is the first value of the first result column.
    newQuery = json.loads(response['json_encoded_response'])['column_1'][0]
    print(newQuery)
else:
    print("BAD RESPONSE, Try a different question or adjust your SQL Context")

In [None]:
# Run the SQL produced by the previous cell and print what comes back.
# newQuery is None when the GENERATE SQL request did not return an OK status,
# in which case there is nothing to execute.
if newQuery is not None:
    df = db.to_df(newQuery)
    print(df)