In [5]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
import matplotlib.pyplot as plt

## 连接数据库

In [6]:
# 数据库连接字符串
engine = create_engine('postgresql://postgres:1234@localhost:5432/tempdblp')

In [7]:
try:
    result = engine.execute("SELECT 1")
    print("Connection successful!")
except Exception as e:
    print("Connection failed:", e)

Connection failed: (psycopg2.OperationalError) FATAL:  database "tempdblp" does not exist

(Background on this error at: https://sqlalche.me/e/14/e3q8)


## 生成views

In [48]:
# 定义维度、度量和聚合函数
dimensions = ['workclass', 'education', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']
measurements = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
aggregate_functions = ['count', 'max', 'min', 'sum', 'avg']

# 初始化视图
views = {}
index = 0

for dimension, measurement, function in product(dimensions, measurements, aggregate_functions):
    views[index] = (dimension, measurement, function)
    index += 1

In [113]:
print(views[17])

('workclass', 'capital_gain', 'min')


## 生成sql queries

In [50]:
sql_queries = []
for key, (attribute,measurement,function) in views.items():
    sql_married = f"SELECT {attribute}, {function}({measurement}) AS {function}_{measurement} FROM married_data WHERE {attribute} IS NOT NULL GROUP BY {attribute};"
    sql_unmarried = f"SELECT {attribute}, {function}({measurement}) AS {function}_{measurement} FROM unmarried_data WHERE {attribute} IS NOT NULL GROUP BY {attribute};"
    sql_queries.append((sql_married, sql_unmarried))

# 可选：打印所有生成的查询
for married_query, unmarried_query in sql_queries:
    print("Married Data Query:", married_query)
    print("Unmarried Data Query:", unmarried_query)

# 查询数量
query_count = len(sql_queries)
print("Generated queries count:", query_count * 2)  # 每个视图两个查询


Married Data Query: SELECT workclass, count(age) AS count_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, count(age) AS count_age FROM unmarried_data WHERE workclass IS NOT NULL GROUP BY workclass;
Married Data Query: SELECT workclass, max(age) AS max_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, max(age) AS max_age FROM unmarried_data WHERE workclass IS NOT NULL GROUP BY workclass;
Married Data Query: SELECT workclass, min(age) AS min_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, min(age) AS min_age FROM unmarried_data WHERE workclass IS NOT NULL GROUP BY workclass;
Married Data Query: SELECT workclass, sum(age) AS sum_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, sum(age) AS sum_age FROM unmarried_data WHERE workclass IS NOT NULL GROU

In [85]:
results = []
for married_query, unmarried_query in sql_queries:
    # 执行已婚数据查询并直接读取到 DataFrame
    df_married = pd.read_sql_query(married_query, engine)

    # 执行未婚数据查询并直接读取到 DataFrame
    df_unmarried = pd.read_sql_query(unmarried_query, engine)

    # 将结果存储为元组
    results.append((df_married, df_unmarried))

In [139]:
print(results[93])

(     relationship  sum_age
0         Husband   578099
1       Own-child     8056
2            Wife    62479
3   Not-in-family    61035
4       Unmarried    43782
5  Other-relative    10835,      relationship  sum_age
0       Own-child   117772
1   Not-in-family   257429
2       Unmarried    95071
3  Other-relative    21699)


In [153]:
# 这一部分可能结果有问题，逻辑没问题(就是把结果里面每一个view查到的结果，求归一化后求K-L偏差，最后排序找到前5个，输出分数)

def kl_divergence(p, q):
    epsilon = 1e-10  # 避免数学错误的小常数
    p = np.where(p == 0, epsilon, p)
    q = np.where(q == 0, epsilon, q)
    return np.sum(p * np.log(p / q))

view_scores = {}  # 初始化空字典，将用来存储三元组和对应的K-L散度

# 修正并重新计算分数
for vid, (attribute, measurement, function) in views.items():
    df_married = results[vid][0]
    df_unmarried = results[vid][1]

    all_attribute_values = set(df_married[attribute].unique()).union(set(df_unmarried[attribute].unique()))


    grouped_married = df_married.groupby(attribute).agg({f'{function}_{measurement}': 'sum'}).reindex(all_attribute_values, fill_value=0)
    grouped_unmarried = df_unmarried.groupby(attribute).agg({f'{function}_{measurement}': 'sum'}).reindex(all_attribute_values, fill_value=0)

    total_married = grouped_married[f'{function}_{measurement}'].sum()
    total_unmarried = grouped_unmarried[f'{function}_{measurement}'].sum()

    # 归一化以获得概率分布
    p = (grouped_married / total_married).fillna(0).values.flatten()
    q = (grouped_unmarried / total_unmarried).fillna(0).values.flatten()

    # 计算 K-L 散度并保存
    score = kl_divergence(p, q)
    view_scores[(attribute, measurement, function)] = score

print(view_scores['relationship', 'age', 'sum'])
# 排序并打印top 5
top_scores = sorted(view_scores.items(), key=lambda x: x[1], reverse=True)[:5]
for key, score in top_scores:
    print(f"View: {key}, Score: {score}")



18.614196594087876
View: ('relationship', 'capital_gain', 'sum'), Score: 21.23297699554917
View: ('relationship', 'capital_loss', 'sum'), Score: 20.444890552177746
View: ('relationship', 'hours_per_week', 'sum'), Score: 19.31089150345611
View: ('relationship', 'education_num', 'sum'), Score: 19.1626653581741
View: ('relationship', 'age', 'count'), Score: 18.812311596760697
