In [8]:
from dotenv import load_dotenv
load_dotenv()

True

In [9]:
import openai
import pandas as pd
from openai import OpenAI

# Initialize OpenAI client (uses OPENAI_API_KEY from .env)
client = OpenAI()

In [10]:
# Sample dataset (you can load from CSV, JSON, or database)
data = pd.read_csv("LimitedData.csv")  
df = pd.DataFrame(data)
print("Dataset:")
print(df)

  data = pd.read_csv("LimitedData.csv")


Dataset:
             DESYNPUF_ID  BENE_BIRTH_DT  BENE_DEATH_DT  BENE_SEX_IDENT_CD  \
0       BAC397A752FA03A7     19330101.0            0.0                2.0   
1       BE4841C03C968C6B     19341201.0            0.0                2.0   
2       0DCFC21B16C7A18F     19300901.0            0.0                2.0   
3       5ED5E5248B290111     19220201.0            0.0                2.0   
4       7FA31DE58F7B26E1     19280801.0            0.0                2.0   
...                  ...            ...            ...                ...   
999895               NaN            NaN            NaN                NaN   
999896               NaN            NaN            NaN                NaN   
999897               NaN            NaN            NaN                NaN   
999898               NaN            NaN            NaN                NaN   
999899               NaN            NaN            NaN                NaN   

        BENE_RACE_CD BENE_ESRD_IND  SP_STATE_CODE  BENE_COUNTY_CD 

In [14]:
# Call OpenAI API to get insights from the dataset
def get_dataset_insights(dataframe):
    """
    Send dataset to OpenAI and get insights back.
    
    Args:
        dataframe: pandas DataFrame with the data
    
    Returns:
        str: Insights from OpenAI
    """
    # Convert DataFrame to CSV string for sending to API
    prompt = f"""

    """

    
    # Create the prompt
    prompt = f"""Analyze the following dataset and provide key insights, trends, and recommendations:

Dataset:
    Analyze this dataset structure and summary:

    Columns:
    {df.columns.tolist()}

    Data types:
    {df.dtypes}

    Summary statistics:
    {df.describe().to_string()}

    Sample rows:
    {df.head(20).to_string()}

Please provide:
1. Summary statistics
2. Key trends and patterns
3. Recommendations for improvement
4. Any anomalies or areas of concern"""
    
    # Call OpenAI API using chat completions
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # or use "gpt-4" for more advanced analysis
        messages=[
            {"role": "system", "content": "You are a data analyst assistant. Provide clear, actionable insights."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=1000
    )
    
    # Extract and return the insights
    insights = response.choices[0].message.content
    return insights

# Get insights from your dataset
print("Requesting insights from OpenAI...\n")
insights = get_dataset_insights(df)
print("Insights:")
print(insights)

Requesting insights from OpenAI...

Insights:
### 1. Summary Statistics

From the dataset, we can derive the following summary statistics:

- **Demographics**:
  - **BENE_BIRTH_DT**: Ranges from 1922 to 1961, indicating a diverse age group.
  - **BENE_SEX_IDENT_CD**: Majority (1.0) appear to be female (coded as 1), while 2.0 represents male.
  - **BENE_RACE_CD**: Majority of the sample is coded as 1 (possibly indicating a specific race category).
  
- **Health Indicators**:
  - **BENE_ESRD_IND**: A binary indicator where 'Y' indicates end-stage renal disease (ESRD) presence.
  - Health conditions such as **SP_ALZHDMTA, SP_CHF, SP_DEPRESSN, SP_DIABETES, SP_COPD** show a high incidence across the population, with values mostly at 1.0 (indicating presence).
  
- **Coverage and Claims**:
  - **BENE_HI_CVRAGE_TOT_MONS**: The average is approximately 11.75 months, indicating limited health coverage for many.
  - **PLAN_CVRG_MOS_NUM**: The average is around 9.12 months, which indicates that m

### Making a Powerpoint of Insights

In [15]:
insights.split("###")

['',
 " 1. Summary Statistics\n\nFrom the dataset, we can derive the following summary statistics:\n\n- **Demographics**:\n  - **BENE_BIRTH_DT**: Ranges from 1922 to 1961, indicating a diverse age group.\n  - **BENE_SEX_IDENT_CD**: Majority (1.0) appear to be female (coded as 1), while 2.0 represents male.\n  - **BENE_RACE_CD**: Majority of the sample is coded as 1 (possibly indicating a specific race category).\n  \n- **Health Indicators**:\n  - **BENE_ESRD_IND**: A binary indicator where 'Y' indicates end-stage renal disease (ESRD) presence.\n  - Health conditions such as **SP_ALZHDMTA, SP_CHF, SP_DEPRESSN, SP_DIABETES, SP_COPD** show a high incidence across the population, with values mostly at 1.0 (indicating presence).\n  \n- **Coverage and Claims**:\n  - **BENE_HI_CVRAGE_TOT_MONS**: The average is approximately 11.75 months, indicating limited health coverage for many.\n  - **PLAN_CVRG_MOS_NUM**: The average is around 9.12 months, which indicates that many beneficiaries may exper

In [16]:
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN
from pptx.dml.color import RGBColor
from pptx.enum.shapes import MSO_SHAPE


def create_powerpoint_from_insights(insights_text, output_file="dataset_insights.pptx"):
    """
    Create a PowerPoint presentation from OpenAI insights.

    - Lines starting with "###" become H3 headings (slide titles).
    - Adds a colored banner/header to each slide for a non-default look.
    """
    prs = Presentation()
    prs.slide_width = Inches(10)
    prs.slide_height = Inches(7.5)

    def apply_brand(slide):
        # light background
        slide.background.fill.solid()
        slide.background.fill.fore_color.rgb = RGBColor(250, 250, 250)
        # top banner
        banner = slide.shapes.add_shape(
            MSO_SHAPE.RECTANGLE, Inches(0), Inches(0), prs.slide_width, Inches(0.6)
        )
        banner.fill.solid()
        banner.fill.fore_color.rgb = RGBColor(0, 102, 153)  # teal
        try:
            banner.line.fill.background()
        except Exception:
            pass
        # banner title centered
        tf = banner.text_frame
        tf.text = "Dataset Insights"
        p = tf.paragraphs[0]
        p.alignment = PP_ALIGN.CENTER
        p.font.size = Pt(16)
        p.font.bold = True
        p.font.color.rgb = RGBColor(255, 255, 255)

    # Title Slide
    title_slide = prs.slides.add_slide(prs.slide_layouts[0])
    title = title_slide.shapes.title
    subtitle = title_slide.placeholders[1]
    title.text = "Dataset Analysis Insights"
    subtitle.text = "Generated by OpenAI Data Analysis"
    apply_brand(title_slide)

    # Parse insights into logical blocks
    lines = insights_text.splitlines()
    buffer = []
    current_title = None

    def flush_buffer_as_slide(title_text, items):
        if not items and not title_text:
            return
        slide = prs.slides.add_slide(prs.slide_layouts[5])
        apply_brand(slide)
        # Title box
        title_box = slide.shapes.add_textbox(Inches(0.5), Inches(0.8), Inches(9), Inches(0.7))
        tf = title_box.text_frame
        tf.clear()
        tf.text = title_text or "Details"
        p = tf.paragraphs[0]
        p.font.size = Pt(28)
        p.font.bold = True
        p.font.color.rgb = RGBColor(0, 51, 102)

        # Content box
        content_box = slide.shapes.add_textbox(Inches(0.5), Inches(1.6), Inches(9), Inches(5))
        cf = content_box.text_frame
        cf.word_wrap = True
        cf.clear()
        for it in items:
            par = cf.add_paragraph()
            par.text = it
            par.font.size = Pt(16)
            par.level = 0

    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith("###"):
            # new H3 heading -> flush previous buffer
            flush_buffer_as_slide(current_title, buffer)
            buffer = []
            current_title = stripped.lstrip('#').strip()
        elif any(stripped.startswith(f"{i}.") for i in range(1, 10)):
            # numbered item - keep
            buffer.append(stripped)
        else:
            # normal paragraph or bullet
            buffer.append(stripped)

    # flush remaining
    flush_buffer_as_slide(current_title, buffer)

    prs.save(output_file)
    print(f"PowerPoint presentation created: {output_file}")
    return output_file

# Generate the PowerPoint from insights (if `insights` exists in the kernel)
try:
    pptx_file = create_powerpoint_from_insights(insights)
    print(f"✅ Presentation ready: {pptx_file}")
except NameError:
    print("No `insights` variable found in the notebook kernel. Run the analysis cell first.")

PowerPoint presentation created: dataset_insights.pptx
✅ Presentation ready: dataset_insights.pptx
