安装依赖库
pip install tiktoken openai pandas matplotlib plotly scikit-learn numpy python-dotenv

In [27]:
#导入相关依赖
import pandas as pd
import os
from openai import OpenAI
import tiktoken
import dotenv

# 1.数据处理

In [38]:
# Load the review dataset; the first CSV column is used as the row index.
reviews_path = 'data/评论集.csv'
df = pd.read_csv(reviews_path, index_col=0)
# Report how many reviews were loaded.
row_count = df.shape[0]
print("评价文件长度：", row_count)
# Preview the first three rows.
df.head(3)

评价文件长度： 1000


Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...
1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos..."
2,1351123200,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...


In [21]:
# Discard every row that has a missing value (NaN/NaT) in any column.
df = df.dropna(axis=0, how='any')
# Report how many reviews remain after the cleanup.
print("评价文件长度：", df.shape[0])

评价文件长度： 1000


In [22]:
# Merge the Summary and Text columns into one string per review, formatted as
# "Title:<summary>;Content:<text>", with surrounding whitespace stripped from each part.
title_part = "Title:" + df['Summary'].str.strip()
content_part = ";Content:" + df['Text'].str.strip()
df['combined'] = title_part + content_part
# Preview the first three rows.
df.head(3)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,combined
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title:where does one start...and stop... with...
1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title:Arrived in pieces;Content:Not pleased at...
2,1351123200,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...,"Title:It isn't blanc mange, but isn't bad . . ..."


# 2.生成向量

In [23]:
# Order the reviews by their Time value, then remove the Time column,
# which is not used again after the sort.
df = df.sort_values(by='Time').drop(columns=['Time'])
# Preview the first three rows.
df.head(3)

Unnamed: 0,ProductId,UserId,Score,Summary,Text,combined
0,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title:where does one start...and stop... with...
297,B003VXHGPK,A21VWSCGW7UUAR,4,"Good, but not Wolfgang Puck good","Honestly, I have to admit that I expected a li...","Title:Good, but not Wolfgang Puck good;Content..."
296,B008JKTTUA,A34XBAIFT02B60,1,Should advertise coconut as an ingredient more...,"First, these should be called Mac - Coconut ba...",Title:Should advertise coconut as an ingredien...


In [33]:
# Tokenizer used to count tokens so that over-long reviews can be dropped before embedding.
tokenizer = tiktoken.get_encoding(encoding_name='cl100k_base')
# Per-review token ceiling and the number of reviews to keep for embedding.
# NOTE(review): 8191 presumably matches the embedding model's input limit — confirm against the provider docs.
MAX_TOKENS = 8191
MAX_ROWS = 5
# Count the tokens of each combined review text.
df['count_token'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))
# Keep only reviews within the token ceiling, then take the last MAX_ROWS of them.
# .copy() detaches the result from the filtered frame, so that adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df = df[df.count_token <= MAX_TOKENS].tail(MAX_ROWS).copy()
print(len(df),"条待生成向量的数据")
# Preview the selected rows.
df[:5]

5 条待生成向量的数据


Unnamed: 0,ProductId,UserId,Score,Summary,Text,combined,count_token
623,B0000CFXYA,A3GS4GWPIBV0NT,1,Strange inflammation response,Truthfully wasn't crazy about the taste of the...,Title:Strange inflammation response;Content:Tr...,110
624,B0001BH5YM,A1BZ3HMAKK0NC,5,My favorite and only MUSTARD,You've just got to experience this mustard... ...,Title:My favorite and only MUSTARD;Content:Yo...,80
625,B0009ET7TC,A2FSDQY5AI6TNX,5,My furbabies LOVE these!,Shake the container and they come running. Eve...,Title:My furbabies LOVE these!;Content:Shake t...,48
619,B007PA32L2,A15FF2P7RPKH6G,5,got this for the daughter,all i have heard since she got a kuerig is why...,Title:got this for the daughter;Content:all i ...,50
999,B001EQ5GEO,A3VYU0VO6DYV6I,5,I love Maui Coffee!,My first experience with Maui Coffee was bring...,Title:I love Maui Coffee!;Content:My first exp...,117


In [35]:
# Load environment variables (python-dotenv reads them from a .env file when one is found).
dotenv.load_dotenv()
# API client; the endpoint and key come from the environment rather than being hard-coded.
client = OpenAI(
    base_url = os.getenv('OPENAI_API_BASE'),
    api_key = os.getenv('OPENAI_API_KEY')
)
def embedding_resp(text, model='text-embedding-3-small'):
    """Return the embedding vector for a single piece of text.

    Args:
        text: The text to embed.
        model: Name of the embedding model to request. Defaults to
            'text-embedding-3-small', the value previously hard-coded here.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    resp = client.embeddings.create(input=text, model=model)
    return resp.data[0].embedding
# Add the embedding column; this issues one API request per row.
df['embedding'] = df.combined.apply(embedding_resp)
# Preview the first three rows.
df[:3]

Unnamed: 0,ProductId,UserId,Score,Summary,Text,combined,count_token,embedding
623,B0000CFXYA,A3GS4GWPIBV0NT,1,Strange inflammation response,Truthfully wasn't crazy about the taste of the...,Title:Strange inflammation response;Content:Tr...,110,"[-0.048618898, 0.048367564, 0.0060389643, 0.01..."
624,B0001BH5YM,A1BZ3HMAKK0NC,5,My favorite and only MUSTARD,You've just got to experience this mustard... ...,Title:My favorite and only MUSTARD;Content:Yo...,80,"[0.027737668, -0.027710946, -0.045935504, 0.00..."
625,B0009ET7TC,A2FSDQY5AI6TNX,5,My furbabies LOVE these!,Shake the container and they come running. Eve...,Title:My furbabies LOVE these!;Content:Shake t...,48,"[-0.008169582, -0.030571591, -0.017105248, 0.0..."


In [37]:
# Persist the frame, including the embedding column, as a CSV file.
output_path = 'data/评论的向量画结果集.csv'
df.to_csv(output_path)