In [1]:
import itertools
import os
import re
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine, text


In [2]:
# The connection URL can be supplied through the DATABASE_URL environment variable so that
# credentials do not have to be edited into the notebook. The original local URL is kept as
# the fallback, so an environment without that variable behaves exactly as before.
# NOTE(review): the fallback still embeds a password; drop it once every user sets DATABASE_URL.
engine = create_engine(
    os.environ.get('DATABASE_URL', 'postgresql://postgres:1234@localhost:5432/yimingyuan')
)

In [3]:
# Smoke-test the database connection with a trivial query.
try:
    # Engine.execute() no longer exists in SQLAlchemy 2.0. Running the statement on an
    # explicit connection works on both 1.x and 2.x, and the context manager hands the
    # connection back to the pool as soon as the check is done.
    with engine.connect() as connection:
        connection.execute(text("SELECT 1"))
    print("Connection successful!")
except Exception as e:
    # Deliberately broad: any failure here just means the database is not reachable.
    print("Connection failed:", e)

Connection successful!


In [29]:
# Sharing opt
# The following functions generate the combined queries

import re
from collections import defaultdict

# Aggregate function names that parseSelectItems looks for inside a SELECT list.
function_lst = [
    'SUM',
    'COUNT',
    'AVG',
    'MAX',
    'MIN',
]

# Regular expressions that pull one clause each out of a single-table
# "SELECT ... FROM ... [WHERE ...] [GROUP BY ...]" statement.
# Group 1 of every pattern is the clause body without its keyword.
select_pattern = r"SELECT\s+(.*?)\s+FROM"
from_pattern = r"FROM\s+(.*?)(?:\s+WHERE|\s+GROUP\s+BY|$)"
where_pattern = r"WHERE\s+(.*?)(?:\s+GROUP\s+BY|$)"
group_by_pattern = r"GROUP\s+BY\s+(.*)"


def parseQuery(query):
    """Split a simple SQL statement into its SELECT / FROM / WHERE / GROUP BY bodies.

    The clauses are located with the module-level ``select_pattern``,
    ``from_pattern``, ``where_pattern`` and ``group_by_pattern`` regular
    expressions, matched case-insensitively.

    Args:
        query: SQL text of the form
            ``SELECT ... FROM ... [WHERE ...] [GROUP BY ...]``. Surrounding
            whitespace and trailing ``;`` terminators are ignored.

    Returns:
        A 4-tuple ``(select_items, from_items, where_items, group_by_items)`` of
        strings. A clause that is not present is returned as ``""``.
    """
    # Without this normalisation a trailing ";" (or blank) ends up inside the
    # last clause, e.g. GROUP BY "workclass;", which then leaks into every
    # query rebuilt from these pieces.
    query = query.strip().rstrip("; \t\r\n")

    def _clause(pattern):
        # Body of one clause, or "" when the clause is absent.
        match = re.search(pattern, query, re.IGNORECASE)
        return match.group(1).strip() if match else ""

    return (
        _clause(select_pattern),
        _clause(from_pattern),
        _clause(where_pattern),
        _clause(group_by_pattern),
    )


def parseSelectItems(select_items, agg_functions=None):
    """Separate a SELECT list into its plain attribute and its aggregate calls.

    The list is split on commas. A component containing a call to one of the
    known aggregate functions contributes one ``(measure, function)`` pair; any
    other component is treated as the grouping attribute.

    Args:
        select_items: Body of a SELECT clause, e.g.
            ``"workclass, count(age) AS count_age"``.
        agg_functions: Optional iterable of aggregate function names to
            recognise. Defaults to the module-level ``function_lst``.

    Returns:
        ``(attribute, measures, functions)`` where ``attribute`` is the last
        non-aggregate component (``None`` if there is none) and ``measures`` /
        ``functions`` are parallel lists. Function names are reported with the
        spelling used in ``agg_functions``.
    """
    if agg_functions is None:
        agg_functions = function_lst
    agg_functions = list(agg_functions)

    # Report each function with the spelling given in agg_functions, whatever
    # case the query used.
    canonical = {name.upper(): name for name in agg_functions}

    # A single word-bounded pattern replaces the former "substring test, then
    # break" loop. That loop gave up on "max(summary)" because the text contains
    # "SUM", and silently dropped attributes such as "account" because they
    # contain "COUNT".
    call_pattern = None
    if agg_functions:
        alternatives = '|'.join(re.escape(name) for name in agg_functions)
        call_pattern = re.compile(rf"\b({alternatives})\s*\((.*?)\)", re.IGNORECASE)

    attribute = None
    measures, functions = [], []

    for component in select_items.split(','):
        component = component.strip()
        call = call_pattern.search(component) if call_pattern else None
        if call:
            functions.append(canonical[call.group(1).upper()])
            measures.append(call.group(2).strip())
        else:
            attribute = component

    return attribute, measures, functions

def combineAggregates(queries):
    """Merge aggregate queries that share FROM, WHERE and GROUP BY into one query each.

    Every input query is parsed with ``parseQuery`` / ``parseSelectItems``.
    Queries with identical FROM, WHERE and GROUP BY clauses are folded into a
    single statement whose SELECT list holds the GROUP BY expression followed
    by every aggregate, aliased as ``<FUNCTION>_<measure>``.

    Args:
        queries: Iterable of SQL strings of the form
            ``SELECT attr, func(measure) ... FROM ... [WHERE ...] [GROUP BY ...]``.

    Returns:
        List of combined SQL strings, each ending in ``;``. Output order follows
        the first appearance of each (FROM, WHERE) pair, then of each GROUP BY
        within that pair.
    """
    # (from, where) -> {group_by -> ordered list of "FUNC(m) AS FUNC_m" parts}
    grouped = defaultdict(dict)
    for query in queries:
        select_items, from_clause, where_clause, group_by_items = parseQuery(query)
        _attribute, measures, functions = parseSelectItems(select_items)
        select_parts = grouped[(from_clause, where_clause)].setdefault(group_by_items, [])
        for measure, function in zip(measures, functions):
            part = f"{function}({measure}) AS {function}_{measure}"
            # The same aggregate requested twice would produce a duplicate
            # column alias in the combined statement.
            if part not in select_parts:
                select_parts.append(part)

    combined_queries = []
    for (from_clause, where_clause), by_group in grouped.items():
        # Built once per (FROM, WHERE) group under its own name. Re-assigning
        # where_clause inside the GROUP BY loop used to yield "WHERE WHERE ..."
        # for the second and later GROUP BY variants of the same group.
        where_sql = f"WHERE {where_clause}" if where_clause else ""
        for group_by_items, select_parts in by_group.items():
            # Only lead with the GROUP BY expression when there is one;
            # otherwise the statement would start with "SELECT , ...".
            columns = ([group_by_items] if group_by_items else []) + select_parts
            if not columns:
                continue  # nothing to select, so no valid statement can be built
            group_by_sql = f"GROUP BY {group_by_items}" if group_by_items else ""
            pieces = [
                f"SELECT {', '.join(columns)}",
                f"FROM {from_clause}",
                where_sql,
                group_by_sql,
            ]
            combined_queries.append(' '.join(piece for piece in pieces if piece) + ';')

    return combined_queries


In [30]:
# Define the dimensions, measures, and aggregate functions
import itertools

dimensions = ['workclass', 'education', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']
measurements = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
aggregate_functions = ['count', 'max', 'min', 'sum', 'avg']

# Build the views: every (dimension, measurement, function) combination,
# keyed by a running integer id that starts at 0.
views = dict(enumerate(itertools.product(dimensions, measurements, aggregate_functions)))

# One past the last view id, i.e. where the running counter ends up.
index = len(views)

## Generate the query

In [34]:
# One (married, unmarried) pair of SQL statements per view, in view order.
sql_queries = []
for attribute, measurement, function in views.values():
    sql_married = f"SELECT {attribute}, {function}({measurement}) AS {function}_{measurement} FROM married_data WHERE {attribute} IS NOT NULL GROUP BY {attribute};"
    sql_unmarried = f"SELECT {attribute}, {function}({measurement}) AS {function}_{measurement} FROM unmarried_data WHERE {attribute} IS NOT NULL GROUP BY {attribute};"
    sql_queries.append((sql_married, sql_unmarried))

# Optional: echo every generated statement.
for married_query, unmarried_query in sql_queries:
    print("Married Data Query:", married_query)
    print("Unmarried Data Query:", unmarried_query)

# The same statements with the trailing ';' removed, one list per table.
married_query_list = [married.rstrip(';') for married, _ in sql_queries]
unmarried_query_list = [unmarried.rstrip(';') for _, unmarried in sql_queries]

# Number of generated statements: two per view.
query_count = len(sql_queries)
print("Generated queries count:", query_count * 2)



Married Data Query: SELECT workclass, count(age) AS count_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, count(age) AS count_age FROM unmarried_data WHERE workclass IS NOT NULL GROUP BY workclass;
Married Data Query: SELECT workclass, max(age) AS max_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, max(age) AS max_age FROM unmarried_data WHERE workclass IS NOT NULL GROUP BY workclass;
Married Data Query: SELECT workclass, min(age) AS min_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, min(age) AS min_age FROM unmarried_data WHERE workclass IS NOT NULL GROUP BY workclass;
Married Data Query: SELECT workclass, sum(age) AS sum_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, sum(age) AS sum_age FROM unmarried_data WHERE workclass IS NOT NULL GROU

In [39]:
# Merge the per-view married queries into combined aggregate queries with
# combineAggregates, then print each resulting SQL statement.

combined_queries_married = combineAggregates(married_query_list)
for query in combined_queries_married:
    print(query)

# print(married_query_list)


SELECT workclass, COUNT(age) AS COUNT_age, MAX(age) AS MAX_age, MIN(age) AS MIN_age, SUM(age) AS SUM_age, AVG(age) AS AVG_age, COUNT(fnlwgt) AS COUNT_fnlwgt, MAX(fnlwgt) AS MAX_fnlwgt, MIN(fnlwgt) AS MIN_fnlwgt, SUM(fnlwgt) AS SUM_fnlwgt, AVG(fnlwgt) AS AVG_fnlwgt, COUNT(education_num) AS COUNT_education_num, MAX(education_num) AS MAX_education_num, MIN(education_num) AS MIN_education_num, SUM(education_num) AS SUM_education_num, AVG(education_num) AS AVG_education_num, COUNT(capital_gain) AS COUNT_capital_gain, MAX(capital_gain) AS MAX_capital_gain, MIN(capital_gain) AS MIN_capital_gain, SUM(capital_gain) AS SUM_capital_gain, AVG(capital_gain) AS AVG_capital_gain, COUNT(capital_loss) AS COUNT_capital_loss, MAX(capital_loss) AS MAX_capital_loss, MIN(capital_loss) AS MIN_capital_loss, SUM(capital_loss) AS SUM_capital_loss, AVG(capital_loss) AS AVG_capital_loss, COUNT(hours_per_week) AS COUNT_hours_per_week, MAX(hours_per_week) AS MAX_hours_per_week, MIN(hours_per_week) AS MIN_hours_per_

In [40]:
# Merge the per-view unmarried queries into combined aggregate queries with
# combineAggregates, then print each resulting SQL statement.

combined_queries_unmarried = combineAggregates(unmarried_query_list)
for query in combined_queries_unmarried:
    print(query)

SELECT workclass, COUNT(age) AS COUNT_age, MAX(age) AS MAX_age, MIN(age) AS MIN_age, SUM(age) AS SUM_age, AVG(age) AS AVG_age, COUNT(fnlwgt) AS COUNT_fnlwgt, MAX(fnlwgt) AS MAX_fnlwgt, MIN(fnlwgt) AS MIN_fnlwgt, SUM(fnlwgt) AS SUM_fnlwgt, AVG(fnlwgt) AS AVG_fnlwgt, COUNT(education_num) AS COUNT_education_num, MAX(education_num) AS MAX_education_num, MIN(education_num) AS MIN_education_num, SUM(education_num) AS SUM_education_num, AVG(education_num) AS AVG_education_num, COUNT(capital_gain) AS COUNT_capital_gain, MAX(capital_gain) AS MAX_capital_gain, MIN(capital_gain) AS MIN_capital_gain, SUM(capital_gain) AS SUM_capital_gain, AVG(capital_gain) AS AVG_capital_gain, COUNT(capital_loss) AS COUNT_capital_loss, MAX(capital_loss) AS MAX_capital_loss, MIN(capital_loss) AS MIN_capital_loss, SUM(capital_loss) AS SUM_capital_loss, AVG(capital_loss) AS AVG_capital_loss, COUNT(hours_per_week) AS COUNT_hours_per_week, MAX(hours_per_week) AS MAX_hours_per_week, MIN(hours_per_week) AS MIN_hours_per_

In [60]:
# Test only: run the first combined married query against the database and
# display the resulting DataFrame as a sanity check.

x = pd.read_sql_query(combined_queries_married[0], engine)
x

Unnamed: 0,workclass,count_age,max_age,min_age,sum_age,avg_age,count_fnlwgt,max_fnlwgt,min_fnlwgt,sum_fnlwgt,...,count_capital_loss,max_capital_loss,min_capital_loss,sum_capital_loss,avg_capital_loss,count_hours_per_week,max_hours_per_week,min_hours_per_week,sum_hours_per_week,avg_hours_per_week
0,Without-pay,10,72,29,588,58.8,10,212588,27012,1515934,...,10,0,0,0,0.0,10,65,12,348,34.8
1,Never-worked,1,30,30,30,30.0,1,176673,176673,176673,...,1,0,0,0,0.0,1,40,40,40,40.0
2,Self-emp-not-inc,1840,90,18,87206,47.394565,1840,795830,20333,320253289,...,1840,2457,0,245783,133.577717,1840,99,1,83226,45.231522
3,State-gov,675,81,20,29514,43.724444,675,1033222,20057,122914520,...,675,2377,0,75848,112.367407,675,80,1,27611,40.905185
4,Private,11391,90,17,477251,41.8972,11391,1455435,13769,2173573195,...,11391,4356,0,1215756,106.729523,11391,99,2,484030,42.492318
5,Self-emp-inc,891,84,22,42762,47.993266,891,1097453,23813,156644896,...,891,2559,0,161334,181.070707,891,99,1,43581,48.912458
6,Federal-gov,547,90,19,24930,45.575868,547,930948,19914,101221295,...,547,2415,0,62286,113.868373,547,80,6,22940,41.937843
7,Local-gov,1194,90,20,53735,45.004188,1194,668319,14878,226155167,...,1194,2415,0,165383,138.511725,1194,99,4,49866,41.763819


In [53]:
# Execute every combined query and keep the results as (married, unmarried)
# DataFrame pairs. The two query lists are paired up by position.
agg_opt_res = []
for i, married_sql in enumerate(combined_queries_married):
    df_married_agg_opt = pd.read_sql_query(married_sql, engine)
    df_unmarried_agg_opt = pd.read_sql_query(combined_queries_unmarried[i], engine)
    agg_opt_res.append((df_married_agg_opt, df_unmarried_agg_opt))
    

In [98]:
# Test only: take the married DataFrame of the second combined result and
# show the name of its first column.
combined_df = agg_opt_res[1][0]
combined_df.columns[0]
# len(agg_opt_res)

'education'

## Decompose the aggregation table

In [123]:
def decomposeAggTable(dimensions_name, combined_dataframe, individual_list):
    """Split a combined aggregate table into one two-column table per aggregate.

    For every column of ``combined_dataframe`` other than ``dimensions_name``, a
    copy holding just ``[dimensions_name, column]`` is appended to
    ``individual_list``, in the frame's column order. Column names are kept
    exactly as they appear in the combined table.

    Args:
        dimensions_name: Name of the dimension column shared by all the
            resulting tables.
        combined_dataframe: DataFrame containing the dimension column plus one
            column per aggregate.
        individual_list: List that receives the per-aggregate DataFrames; it is
            extended in place.

    Returns:
        None. The results are delivered through ``individual_list``.
    """
    # The previous version split each column name on "_" and glued it back
    # together, which is a no-op for "func_measure" names. The name is now used
    # as-is. The unused ``measurements`` set and the in-place rename are gone.
    for column in combined_dataframe.columns:
        if column == dimensions_name:
            continue
        # .copy() keeps each small table independent of the combined frame.
        individual_list.append(combined_dataframe[[dimensions_name, column]].copy())
        
# Flat lists of per-aggregate tables, filled by decomposeAggTable below.
de_married_list, de_unmarried_list = [], []

for married_combined_df, unmarried_combined_df in agg_opt_res:
    # The dimension is the first column of the married frame; the same name is
    # used for the matching unmarried frame.
    dimensions_name = married_combined_df.columns[0]

    decomposeAggTable(dimensions_name, married_combined_df, de_married_list)
    decomposeAggTable(dimensions_name, unmarried_combined_df, de_unmarried_list)


In [124]:
# Test only: display the first decomposed married table.
de_married_list[0]

Unnamed: 0,workclass,count_age
0,Without-pay,10
1,Never-worked,1
2,Self-emp-not-inc,1840
3,State-gov,675
4,Private,11391
5,Self-emp-inc,891
6,Federal-gov,547
7,Local-gov,1194


In [125]:
def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence sum(p * log(p / q)).

    Entries that are exactly zero in either input are replaced by a small
    constant before the logarithm is taken, so the result stays finite.

    Args:
        p: Array-like of non-negative weights (the reference distribution).
        q: Array-like of non-negative weights, same length as ``p``.

    Returns:
        The divergence as a NumPy float.
    """
    epsilon = 1e-10  # small constant that avoids log(0) and division by zero

    # Coerce first: on a plain list ``p == 0`` is a single False rather than an
    # element-wise mask, so zeros were never replaced and the result was NaN.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    p = np.where(p == 0, epsilon, p)
    q = np.where(q == 0, epsilon, q)
    return np.sum(p * np.log(p / q))

# Maps each (attribute, measurement, function) triple to its K-L divergence.
view_scores = {}

# Score every view from the decomposed aggregate tables.
for vid, (attribute, measurement, function) in views.items():
    df_married, df_unmarried = de_married_list[vid], de_unmarried_list[vid]
    value_column = f'{function}_{measurement}'

    # Every attribute value that occurs in either group.
    all_attribute_values = set(df_married[attribute].unique()).union(set(df_unmarried[attribute].unique()))

    # Align both groups on the same attribute values, filling gaps with 0.
    grouped_married = (
        df_married.groupby(attribute)
        .agg({value_column: 'sum'})
        .reindex(all_attribute_values, fill_value=0)
    )
    grouped_unmarried = (
        df_unmarried.groupby(attribute)
        .agg({value_column: 'sum'})
        .reindex(all_attribute_values, fill_value=0)
    )

    total_married = grouped_married[value_column].sum()
    total_unmarried = grouped_unmarried[value_column].sum()

    # Normalise to obtain probability distributions.
    p = (grouped_married / total_married).fillna(0).values.flatten()
    q = (grouped_unmarried / total_unmarried).fillna(0).values.flatten()

    # Compute the K-L divergence and store it for this view.
    view_scores[(attribute, measurement, function)] = kl_divergence(p, q)

print(view_scores['relationship', 'age', 'sum'])

# Sort by score, highest first, and print the top 5.
top_scores = sorted(view_scores.items(), key=lambda item: item[1], reverse=True)[:5]
for key, score in top_scores:
    print(f"View: {key}, Score: {score}")

18.614196594087872
View: ('relationship', 'capital_gain', 'sum'), Score: 21.232976995549166
View: ('relationship', 'capital_loss', 'sum'), Score: 20.44489055217775
View: ('relationship', 'hours_per_week', 'sum'), Score: 19.310891503456112
View: ('relationship', 'education_num', 'sum'), Score: 19.1626653581741
View: ('relationship', 'age', 'count'), Score: 18.812311596760697


In [51]:
# Run each married / unmarried statement pair and keep the two result
# DataFrames together as a tuple, in view order.
results = [
    (
        pd.read_sql_query(married_query, engine),    # married data
        pd.read_sql_query(unmarried_query, engine),  # unmarried data
    )
    for married_query, unmarried_query in sql_queries
]



In [52]:
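# NOTE: the numbers produced by this part may be off, but the logic is sound: for every view, normalise the query results, compute the K-L divergence, then sort to find the top 5 and print their scores.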

def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence sum(p * log(p / q)).

    Entries that are exactly zero in either input are replaced by a small
    constant before the logarithm is taken, so the result stays finite.

    Args:
        p: Array-like of non-negative weights (the reference distribution).
        q: Array-like of non-negative weights, same length as ``p``.

    Returns:
        The divergence as a NumPy float.
    """
    epsilon = 1e-10  # small constant that avoids log(0) and division by zero

    # Coerce first: on a plain list ``p == 0`` is a single False rather than an
    # element-wise mask, so zeros were never replaced and the result was NaN.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    p = np.where(p == 0, epsilon, p)
    q = np.where(q == 0, epsilon, q)
    return np.sum(p * np.log(p / q))

# Maps each (attribute, measurement, function) triple to its K-L divergence.
view_scores = {}

# Score every view from the results of the individual (non-combined) queries.
for vid, (attribute, measurement, function) in views.items():
    df_married, df_unmarried = results[vid]
    value_column = f'{function}_{measurement}'

    # Every attribute value that occurs in either group.
    all_attribute_values = set(df_married[attribute].unique()).union(set(df_unmarried[attribute].unique()))

    # Align both groups on the same attribute values, filling gaps with 0.
    grouped_married = (
        df_married.groupby(attribute)
        .agg({value_column: 'sum'})
        .reindex(all_attribute_values, fill_value=0)
    )
    grouped_unmarried = (
        df_unmarried.groupby(attribute)
        .agg({value_column: 'sum'})
        .reindex(all_attribute_values, fill_value=0)
    )

    total_married = grouped_married[value_column].sum()
    total_unmarried = grouped_unmarried[value_column].sum()

    # Normalise to obtain probability distributions.
    p = (grouped_married / total_married).fillna(0).values.flatten()
    q = (grouped_unmarried / total_unmarried).fillna(0).values.flatten()

    # Compute the K-L divergence and store it for this view.
    view_scores[(attribute, measurement, function)] = kl_divergence(p, q)

print(view_scores['relationship', 'age', 'sum'])

# Sort by score, highest first, and print the top 5.
top_scores = sorted(view_scores.items(), key=lambda item: item[1], reverse=True)[:5]
for key, score in top_scores:
    print(f"View: {key}, Score: {score}")



18.614196594087872
View: ('relationship', 'capital_gain', 'sum'), Score: 21.232976995549166
View: ('relationship', 'capital_loss', 'sum'), Score: 20.44489055217775
View: ('relationship', 'hours_per_week', 'sum'), Score: 19.310891503456112
View: ('relationship', 'education_num', 'sum'), Score: 19.1626653581741
View: ('relationship', 'age', 'count'), Score: 18.812311596760697
