In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import langchain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.cache import InMemoryCache
langchain.llm_cache = InMemoryCache()

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) 


# Initialize LLM

In [2]:
# Load environment variables from .env file
load_dotenv()

# Get the OPENAI_API_KEY from environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')

llm = ChatOpenAI(api_key=openai_api_key, temperature=0.2, model="gpt-4o-mini")

# Read CSV Data

In [3]:
files = [i for i in os.listdir('./data') if i.endswith('.csv')]
files

['EMSSessionNoResp.csv',
 'PreconditionCheckFailA2.csv',
 'PreconditionCheckNegResp.csv',
 'EMSSession11.csv',
 'EMSSession13.csv',
 'EMSSession12.csv',
 'vinfailure_continue.csv',
 'EMSSecurityAccessFailure.csv',
 'sessionfailureEMS.csv',
 'EMSSession14.csv',
 'Endless78.csv',
 'vinfailure_stop.csv']

In [4]:
# Example dataset
df = pd.read_csv(f'./data/{files[0]}')
df.head()

Unnamed: 0,frame.frame.encap_type,frame.frame.time,frame.frame.time_utc,frame.frame.time_epoch,frame.frame.offset_shift,frame.frame.time_delta,frame.frame.time_delta_displayed,frame.frame.time_relative,frame.frame.number,frame.frame.len,frame.frame.cap_len,frame.frame.marked,frame.frame.ignored,frame.frame.protocols,frame.frame.coloring_rule.name,frame.frame.coloring_rule.string,eth.eth.dst,eth.eth.dst_tree.eth.dst_resolved,eth.eth.dst_tree.eth.dst.oui,eth.eth.dst_tree.eth.dst.oui_resolved,eth.eth.dst_tree.eth.dst.lg,eth.eth.dst_tree.eth.dst.ig,eth.eth.dst_tree.eth.addr,eth.eth.dst_tree.eth.addr_resolved,eth.eth.dst_tree.eth.addr.oui,eth.eth.dst_tree.eth.addr.oui_resolved,eth.eth.dst_tree.eth.lg,eth.eth.dst_tree.eth.ig,eth.eth.src,eth.eth.src_tree.eth.src_resolved,eth.eth.src_tree.eth.src.oui,eth.eth.src_tree.eth.src.oui_resolved,eth.eth.src_tree.eth.src.lg,eth.eth.src_tree.eth.src.ig,eth.eth.src_tree.eth.addr,eth.eth.src_tree.eth.addr_resolved,eth.eth.src_tree.eth.addr.oui,eth.eth.src_tree.eth.addr.oui_resolved,eth.eth.src_tree.eth.lg,eth.eth.src_tree.eth.ig,eth.eth.type,eth.eth.stream,ip.ip.version,ip.ip.hdr_len,ip.ip.dsfield,ip.ip.dsfield_tree.ip.dsfield.dscp,ip.ip.dsfield_tree.ip.dsfield.ecn,ip.ip.len,ip.ip.id,ip.ip.flags,ip.ip.flags_tree.ip.flags.rb,ip.ip.flags_tree.ip.flags.df,ip.ip.flags_tree.ip.flags.mf,ip.ip.frag_offset,ip.ip.ttl,ip.ip.proto,ip.ip.checksum,ip.ip.checksum.status,ip.ip.checksum_calculated,ip.ip.src,ip.ip.addr,ip.ip.src_host,ip.ip.host,ip.ip.dst,ip.ip.dst_host,ip.ip.stream,tcp.tcp.srcport,tcp.tcp.dstport,tcp.tcp.port,tcp.tcp.stream,tcp.tcp.stream.pnum,tcp.tcp.completeness,tcp.tcp.completeness_tree.tcp.completeness.rst,tcp.tcp.completeness_tree.tcp.completeness.fin,tcp.tcp.completeness_tree.tcp.completeness.data,tcp.tcp.completeness_tree.tcp.completeness.ack,tcp.tcp.completeness_tree.tcp.completeness.syn-ack,tcp.tcp.completeness_tree.tcp.completeness.syn,tcp.tcp.completeness_tree.tcp.completeness.str,tcp.tcp.len,tcp.tcp.seq,tcp.tcp.seq_raw,tcp.tcp.nxtseq,tcp.tcp.ack,tcp.tcp.ack_raw,tcp.tcp.hdr_len,tcp.tcp.flags,tcp.tcp.flags_tree.tcp.flags.res,tcp.tcp.flags_tree.tcp.flags.ae,tcp.tcp.flags_tree.tcp.flags.cwr,tcp.tcp.flags_tree.tcp.flags.ece,tcp.tcp.flags_tree.tcp.flags.urg,tcp.tcp.flags_tree.tcp.flags.ack,tcp.tcp.flags_tree.tcp.flags.push,tcp.tcp.flags_tree.tcp.flags.reset,tcp.tcp.flags_tree.tcp.flags.syn,tcp.tcp.flags_tree.tcp.flags.syn_tree._ws.expert.tcp.connection.syn,tcp.tcp.flags_tree.tcp.flags.syn_tree._ws.expert._ws.expert.message,tcp.tcp.flags_tree.tcp.flags.syn_tree._ws.expert._ws.expert.severity,tcp.tcp.flags_tree.tcp.flags.syn_tree._ws.expert._ws.expert.group,tcp.tcp.flags_tree.tcp.flags.fin,tcp.tcp.flags_tree.tcp.flags.str,tcp.tcp.window_size_value,tcp.tcp.window_size,tcp.tcp.checksum,tcp.tcp.checksum.status,tcp.tcp.urgent_pointer,tcp.tcp.options,tcp.tcp.options_tree.tcp.options.mss,tcp.tcp.options_tree.tcp.options.mss_tree.tcp.option_kind,tcp.tcp.options_tree.tcp.options.mss_tree.tcp.option_len,tcp.tcp.options_tree.tcp.options.mss_tree.tcp.options.mss_val,tcp.tcp.options_tree.tcp.options.sack_perm,tcp.tcp.options_tree.tcp.options.sack_perm_tree.tcp.option_kind,tcp.tcp.options_tree.tcp.options.sack_perm_tree.tcp.option_len,tcp.tcp.options_tree.tcp.options.timestamp,tcp.tcp.options_tree.tcp.options.timestamp_tree.tcp.option_kind,tcp.tcp.options_tree.tcp.options.timestamp_tree.tcp.option_len,tcp.tcp.options_tree.tcp.options.timestamp_tree.tcp.options.timestamp.tsval,tcp.tcp.options_tree.tcp.options.timestamp_tree.tcp.options.timestamp.tsecr,tcp.tcp.options_tree.tcp.options.nop,tcp.tcp.options_tree.tcp.options.nop_tree.tcp.option_kind,tcp.tcp.options_tree.tcp.options.wscale,tcp.tcp.options_tree.tcp.options.wscale_tree.tcp.option_kind,tcp.tcp.options_tree.tcp.options.wscale_tree.tcp.option_len,tcp.tcp.options_tree.tcp.options.wscale_tree.tcp.options.wscale.shift,tcp.tcp.options_tree.tcp.options.wscale_tree.tcp.options.wscale.multiplier,tcp.Timestamps.tcp.time_relative,tcp.Timestamps.tcp.time_delta,transum.transum.status,transum.transum.firstreq,transum.transum.lastreq,transum.transum.firstrsp,transum.transum.lastrsp,transum.transum.art,transum.transum.st,transum.transum.reqspread,transum.transum.rspspread,transum.transum.clip_filter,transum.transum.calculation
0,1,"Sep 18, 2024 11:45:56.048538000 W. Europe Dayl...","Sep 18, 2024 09:45:56.048538000 UTC",1726653000.0,0.0,0.0,0.0,0.0,1,74,74,0,0,eth:ethertype:ip:tcp,TCP SYN/FIN,tcp.flags & 0x02 || tcp.flags.fin == 1,00:00:00:00:00:00,00:00:00_00:00:00,0,"Officially Xerox, but 0:0:0:0:0:0 is more common",0,0,00:00:00:00:00:00,00:00:00_00:00:00,0,"Officially Xerox, but 0:0:0:0:0:0 is more common",0,0,00:00:00:00:00:00,00:00:00_00:00:00,0,"Officially Xerox, but 0:0:0:0:0:0 is more common",0,0,00:00:00:00:00:00,00:00:00_00:00:00,0,"Officially Xerox, but 0:0:0:0:0:0 is more common",0,0,0x0800,0,4,20,0x00,0,0,60,0x67bd,0x02,0,1,0,0,64,6,0xd4fc,1,0xd4fc,127.0.0.1,127.0.0.1,kubernetes.docker.internal,kubernetes.docker.internal,127.0.0.1,kubernetes.docker.internal,0,52488,13400,13400,0,1,31,0,1,1,1,1,1,·FDASS,0,1551520941,1551520941,1551520942,0,0,40,0x0002,0,0,0,0,0,0,0,0,1,,Connection establish request (SYN): server por...,2097152,33554432,0,··········S·,65495,65495,0xfe30,2,0,02:04:ff:d7:04:02:08:0a:dc:55:66:20:00:00:00:0...,02:04:ff:d7,2,4,65495,04:02,4,2,08:0a:dc:55:66:20:00:00:00:00,8,10,3696584224,0,1,1,03:03:07,3,3,7,128,0.0,0.0,OK,1,1,2,2,8e-06,8e-06,0.0,0.0,tcp.stream==0 && frame.number>=1 && frame.numb...,SYN and SYN/ACK


# Set up Chain

In [5]:
def diagnose(file: str, llm: ChatOpenAI, question: str):

    df_str = pd.read_csv(os.path.join('data', file)).to_string()  # pd.DataFrame as string
    
    template = """You are a helpful assistant that answers questions about the following CSV data:
    
    {csv_content}


    Please answer the question: {question}

    Use only the information provided in the CSV data to answer the question.
    """

    prompt = PromptTemplate(
        input_variables=["csv_content", "question"],
        template=template,
    )
    
    chain = prompt | llm

    # Generate the answer using the chain
    answer = chain.invoke({
        "csv_content": df_str,
        "question": question
    })

    print(answer.content)

# Invoke Chain

In [6]:
diagnose(file=files[1], llm=llm, question="Can you spot any errors or inconsistencies in the data?")

Yes, there are a few errors and inconsistencies in the data:

1. **Malformed Packet**: In row 15, there is a note indicating "Malformed Packet (Exception occurred)" in the `_ws.malformed._ws.expert._ws.malformed.expert.message` column. This suggests that there was an issue with the packet data for this entry.

2. **Missing Values**: Several columns contain `NaN` values, indicating missing data. For example, in rows 5, 15, and 16, various columns have `NaN` entries, which may indicate incomplete data capture.

3. **Inconsistent Source and Destination Addresses**: The source and destination MAC addresses are consistently shown as `00:00:00:00:00:00`, which is often used to represent an unspecified or uninitialized address. This could indicate a problem with how the data was captured or processed.

4. **Repeated Values**: The `frame.frame.number` column shows a constant value of `1` for all rows, which is inconsistent with the expectation that this should be a unique identifier for each f