# LIDA与ChatGPT API集成示例
使用OpenAI的API进行数据可视化

In [6]:
import os
import pandas as pd
from lida import Manager, TextGenerationConfig
from lida.components import llm

## 1. 配置 OpenAI API

In [7]:
from dotenv import load_dotenv

def load_token():
    """从.env文件加载OpenAI API令牌"""
    # 加载.env文件
    load_dotenv()
    
    # 获取环境变量中的API令牌
    token = os.getenv("OPENAI_API_KEY")
    
    if token:
        print("已从.env文件读取API令牌")
        return token
    else:
        raise ValueError("未在.env文件中找到OPENAI_API_KEY")

# 加载API令牌
OPENAI_API_TOKEN = load_token()
os.environ["OPENAI_API_KEY"] = OPENAI_API_TOKEN

已从.env文件读取API令牌


## 2. 初始化LIDA与ChatGPT后端

In [6]:
print("正在初始化LIDA与ChatGPT后端...")
# 使用OpenAI API作为后端
text_gen = llm(
    "openai",  # 使用OpenAI作为provider
    model="gpt-3.5-turbo"  # 使用GPT-3.5模型
)

# 初始化LIDA管理器
lida = Manager(text_gen=text_gen)

正在初始化LIDA与ChatGPT后端...


## 3. 准备数据集

In [7]:
print("正在准备数据集...")
# 方式一：使用内存中的示例数据
sample_data = {
    "年份": [2018, 2019, 2020, 2021, 2022],
    "销售额": [2500, 2800, 2200, 3100, 3500],
    "利润": [500, 550, 480, 600, 720],
    "客户数": [150, 170, 155, 190, 210]
}
df = pd.DataFrame(sample_data)
# 保存为CSV以便LIDA加载
df.to_csv("sales_data.csv", index=False)

# 显示数据预览
df.head()

正在准备数据集...


Unnamed: 0,年份,销售额,利润,客户数
0,2018,2500,500,150
1,2019,2800,550,170
2,2020,2200,480,155
3,2021,3100,600,190
4,2022,3500,720,210


## 4. 数据总结

In [8]:
print("正在分析数据...")
summary = lida.summarize(df)
print("\n===================== 数据总结 =====================")
print(summary)

正在分析数据...

{'name': '', 'file_name': '', 'dataset_description': '', 'fields': [{'column': '年份', 'properties': {'dtype': 'number', 'std': 1, 'min': 2018, 'max': 2022, 'samples': [2019, 2022, 2020], 'num_unique_values': 5, 'semantic_type': '', 'description': ''}}, {'column': '销售额', 'properties': {'dtype': 'number', 'std': 506, 'min': 2200, 'max': 3500, 'samples': [2800, 3500, 2200], 'num_unique_values': 5, 'semantic_type': '', 'description': ''}}, {'column': '利润', 'properties': {'dtype': 'number', 'std': 95, 'min': 480, 'max': 720, 'samples': [550, 720, 480], 'num_unique_values': 5, 'semantic_type': '', 'description': ''}}, {'column': '客户数', 'properties': {'dtype': 'number', 'std': 25, 'min': 150, 'max': 210, 'samples': [170, 210, 155], 'num_unique_values': 5, 'semantic_type': '', 'description': ''}}], 'field_names': ['年份', '销售额', '利润', '客户数']}


## 5. 生成可视化目标

In [14]:
print("正在生成可视化目标...")
# 创建配置，指定参数
config = TextGenerationConfig(
    n=2,  # 生成3个可视化目标
    temperature=0.7,  # 控制创意程度，0-1之间
    max_tokens=200  # 每个目标的最大token数
)

# 生成可视化目标
goals = lida.goals(
    summary=summary,
    textgen_config=config
)

print("\n===================== 可视化目标 =====================")
for goal in goals:
    display(goal)

正在生成可视化目标...




### Goal 0
---
**Question:** How does the sales revenue vary over the years?

**Visualization:** `Bar chart showing sales revenue over years`

**Rationale:** By plotting the sales revenue against the years, we can identify any trends or patterns in the revenue growth or decline over time.



### Goal 1
---
**Question:** What is the distribution of profits across different years?

**Visualization:** `Bar chart displaying profits by year`

**Rationale:** Analyzing the distribution of profits across different years can provide insights into the profitability trend and help in identifying the most profitable years.



### Goal 2
---
**Question:** How does the customer count change annually?

**Visualization:** `Bar chart illustrating customer count by year`

**Rationale:** Understanding the annual variation in customer count can help in assessing customer retention and acquisition strategies over time.



### Goal 3
---
**Question:** What is the relationship between sales revenue and profits?

**Visualization:** `Scatter plot of sales revenue vs. profits`

**Rationale:** Examining the correlation between sales revenue and profits can reveal the effectiveness of the sales strategy and pricing in generating profits.



### Goal 4
---
**Question:** Which year had the highest sales revenue and profits?

**Visualization:** `Bar chart comparing sales revenue and profits for each year`

**Rationale:** Comparing the sales revenue and profits for each year can help in identifying the most successful year in terms of revenue generation and profitability.


## 6. 为每个目标生成可视化方案

In [None]:
print("正在生成可视化方案...")
all_visualizations = []

for i, goal in enumerate(goals):
    print(f"\n为目标 {i+1} 生成可视化...")
    
    # 为当前目标生成可视化
    visualizations = lida.generate_viz(
        goal=goal.text,
        data=data,
        summary=summary,
        config=config
    )
    
    # 添加到总列表
    all_visualizations.extend(visualizations)
    
    # 输出生成的可视化数量
    print(f"已为目标 {i+1} 生成 {len(visualizations)} 个可视化方案")

## 7. 显示可视化结果

In [None]:
# 创建保存目录
if not os.path.exists("visualizations"):
    os.makedirs("visualizations")

# 显示每个可视化
for i, viz in enumerate(all_visualizations):
    print(f"\n可视化 {i+1}:")
    # 在Jupyter中直接显示可视化
    viz.visualize()
    
    # 保存为HTML文件
    file_path = f"visualizations/viz_{i+1}.html"
    viz.save(file_path)
    print(f"已保存到: {file_path}")

## 8. 生成数据洞察和可视化评论

In [None]:
print("正在生成数据洞察和评论...")

if len(all_visualizations) > 0:
    # 选择第一个可视化进行评论
    viz = all_visualizations[0]
    
    # 生成评论
    comments = lida.generate_data_comments(
        viz=viz,
        data=data,
        summary=summary
    )
    
    print("\n===================== 数据洞察 =====================")
    for i, comment in enumerate(comments):
        print(f"洞察 {i+1}: {comment.text}")
        print("-" * 50)