# InsightForge AI â€” Corporate Insights Agent
Kaggle Submission Notebook

## Install (optional)
```python
# !pip install -q pandas numpy matplotlib duckdb openai google-generativeai
```

In [1]:
import os, json, re, time, warnings
from typing import List, Dict, Any
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import duckdb

# OpenAI compatibility
openai_new_sdk=False
client=None
try:
    from openai import OpenAI
    client=OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    openai_new_sdk=True
    print('Using new OpenAI SDK')
except:
    import openai
    openai.api_key=os.environ.get('OPENAI_API_KEY')
    client=openai
    print('Using old OpenAI SDK')

try:
    import google.generativeai as genai
    if os.environ.get('GEMINI_API_KEY'):
        genai.configure(api_key=os.environ['GEMINI_API_KEY'])
except:
    genai=None

OPENAI_MODEL='gpt-4o-mini'

Using new OpenAI SDK


In [2]:
def generate_synthetic_sales(n_days=180, n_customers=600):
    rng=np.random.default_rng(42)
    dates=pd.date_range(end=pd.Timestamp.today(),periods=n_days)
    products=['Alpha','Beta','Gamma','Delta']
    regions=['North','South','East','West']
    channels=['Online','Retail','Partner']
    rows=[]
    for d in dates:
        for _ in range(rng.integers(10,30)):
            product=rng.choice(products)
            region=rng.choice(regions)
            channel=rng.choice(channels)
            units=int(abs(rng.normal(5,2))+1)
            price=float(20+rng.normal(0,5))
            revenue=round(units*price,2)
            cid=f'C{rng.integers(1,n_customers):05d}'
            rows.append([d.date(),product,region,channel,units,price,revenue,cid])
    return pd.DataFrame(rows,columns=['date','product','region','channel','units','price','revenue','customer_id'])

GLOBAL_DF=generate_synthetic_sales()
GLOBAL_DF.head()

Unnamed: 0,date,product,region,channel,units,price,revenue,customer_id
0,2025-06-02,Delta,East,Retail,7,24.702824,172.92,C00121
1,2025-06-02,Alpha,East,Partner,6,18.418787,110.51,C00308
2,2025-06-02,Alpha,West,Retail,7,23.88896,167.22,C00469
3,2025-06-02,Gamma,South,Partner,6,15.703538,94.22,C00056
4,2025-06-02,Gamma,West,Online,7,19.75037,138.25,C00099


In [3]:
GLOBAL={'df':GLOBAL_DF}

def register_global_df(df): GLOBAL['df']=df.copy()
register_global_df(GLOBAL_DF)

import duckdb

def sql_tool(query):
    con=duckdb.connect()
    con.register('sales',GLOBAL['df'])
    df=con.execute(query).fetchdf()
    con.close()
    return df

def plot_tool(kind,df,x=None,y=None,title=None):
    plt.figure(figsize=(10,4))
    if y not in df.columns: y='revenue'
    if kind=='line':
        if x not in df.columns: x='date'
        plt.plot(pd.to_datetime(df[x]),df[y]); plt.xlabel(x); plt.ylabel(y)
    if kind=='bar':
        agg=df.groupby(x)[y].sum().sort_values(ascending=False)
        agg.plot.bar(); plt.xlabel(x); plt.ylabel(y)
    plt.title(title or ''); plt.tight_layout(); plt.show()


def gemini_summarize(text):
    if genai is None: return text[:400]
    try:
        gm=genai.GenerativeModel('gemini-2.0-flash')
        resp=gm.generate_content('Summarize succinctly:\n'+text)
        return resp.text
    except: return text[:400]

In [4]:
class SimpleMemory:
    def __init__(self): self.docs=[]
    def add(self,id,text,metadata=None): self.docs.append({'id':id,'text':text,'meta':metadata or {}})
    def query_recent(self,k=3): return self.docs[-k:][::-1]

In [5]:
COLUMNS=['date','product','region','channel','units','price','revenue','customer_id']
PROMPT=f'You are planner. Only use columns: {COLUMNS}. Output JSON only.'
import json, re

def extract_json(t):
    try: return json.loads(t)
    except: m=re.search(r"(\[[\s\S]*?\])",t)
    if m:
        try: return json.loads(m.group(1))
        except: return None
    return None

def deterministic(goal):
    return [
        {'name':'rev_by_date','action':'sql','args':{'query':'SELECT date, SUM(revenue) AS revenue FROM sales GROUP BY date ORDER BY date'}},
        {'name':'plot_rev','action':'plot','args':{'kind':'line','x':'date','y':'revenue','title':'Revenue Trends'}},
    ]

def plan(goal):
    prompt=PROMPT+"\nGoal:"+goal
    try:
        if openai_new_sdk:
            r=client.chat.completions.create(model=OPENAI_MODEL,messages=[{'role':'user','content':prompt}])
            text=r.choices[0].message.content
        else:
            r=client.ChatCompletion.create(model=OPENAI_MODEL,messages=[{'role':'user','content':prompt}])
            text=r.choices[0].message['content']
        parsed=extract_json(text)
        return parsed if parsed else deterministic(goal)
    except:
        return deterministic(goal)

In [6]:
class Agent:
    def __init__(self): self.memory=SimpleMemory()
    def run(self,goal):
        p=plan(goal)
        artifacts=[]; last=GLOBAL['df']; dfs={}
        for step in p:
            if step['action']=='sql':
                df=sql_tool(step['args']['query']); last=df; dfs[step['name']]=df
                artifacts.append({'name':step['name'],'type':'table','data':df})
            elif step['action']=='plot':
                x=step['args'].get('x'); y=step['args'].get('y')
                df=last if x in last.columns else next((d for d in dfs.values() if x in d.columns), last)
                plot_tool(step['args']['kind'],df,x,y,step['args'].get('title'))
                artifacts.append({'name':step['name'],'type':'plot','data':'(plot)'})
            elif step['action']=='nlp':
                s=gemini_summarize(step['args'].get('text',''))
                artifacts.append({'name':step['name'],'type':'text','data':s})
        summary = artifacts[-1]['data'] if artifacts else ''
        self.memory.add(str(time.time()),summary)
        return {'plan':p,'artifacts':artifacts,'summary':summary}

In [8]:
agent=Agent()
prompts=[
    'Show revenue trends over time and identify growth drivers.',
    'What are the top products and best regions?'
]
for p in prompts:
    print('\nPROMPT:',p)
    out=agent.run(p)
    print('PLAN:',json.dumps(out['plan'],indent=2))
    print('SUMMARY:',out['summary'][:800])


PROMPT: Show revenue trends over time and identify growth drivers.


KeyError: 'action'