In [5]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2
import matplotlib.pyplot as plt
import itertools
from collections import defaultdict
import time
import re


In [6]:
engine = create_engine('postgresql://postgres:1234@localhost:5432/yimingyuan')

In [7]:
try:
    result = engine.execute("SELECT 1")
    print("Connection successful!")
except Exception as e:
    print("Connection failed:", e)

Connection successful!


In [8]:
def kl_divergence(p, q):
    epsilon = 1e-10 
    p = np.where(p == 0, epsilon, p)
    q = np.where(q == 0, epsilon, q)
    return np.sum(p * np.log(p / q))

In [9]:
# Sharing opt
# The following functions can generated the combined queries

import re
from collections import defaultdict

function_lst = ['SUM', 'COUNT', 'AVG', 'MAX', 'MIN']

select_pattern = r"SELECT\s+(.*?)\s+FROM"
from_pattern = r"FROM\s+(.*?)(?:\s+WHERE|\s+GROUP\s+BY|$)"
where_pattern = r"WHERE\s+(.*?)(?:\s+GROUP\s+BY|$)"
group_by_pattern = r"GROUP\s+BY\s+(.*)"


def parseQuery(query):
    select_match = re.search(select_pattern, query, re.IGNORECASE)
    from_match = re.search(from_pattern, query, re.IGNORECASE)
    where_match = re.search(where_pattern, query, re.IGNORECASE)
    group_by_match = re.search(group_by_pattern, query, re.IGNORECASE)

    select_items = select_match.group(1) if select_match else ""
    from_items = from_match.group(1) if from_match else ""
    where_items = where_match.group(1) if where_match else ""
    group_by_items = group_by_match.group(1) if group_by_match else ""

    return select_items, from_items, where_items, group_by_items


def parseSelectItems(select_items):
    components = select_items.split(',')
    attribute = None
    measures, functions = [], []

    for component in components:
        component = component.strip()
        component_upper = component.upper()
        for func in function_lst:
            if func.upper() in component_upper:
                measure_match = re.search(rf"{func}\((.*?)\)", component, re.IGNORECASE)
                if measure_match:
                    measure = measure_match.group(1)
                    measures.append(measure)
                    functions.append(func)
                break
        else:
            attribute = component

    return attribute, measures, functions

def combineAggregates(queries):
    # Group queries by FROM and WHERE clauses for potential combination
    query_groups = defaultdict(list)
    for query in queries:
        select_items, from_clause, where_clause, group_by_items = parseQuery(query)
        attribute, measures, functions = parseSelectItems(select_items)
        key = (from_clause, where_clause)
        query_groups[key].append((group_by_items, measures, functions, attribute))

    combined_queries = []
    for (from_clause, where_clause), group in query_groups.items():
        combined_by_group = defaultdict(list)
        for group_by_items, measures, functions, attribute in group:
            combined_by_group[group_by_items].append((measures, functions, attribute))

        for group_by_items, details in combined_by_group.items():
            select_parts = []
            for measures, functions, attribute in details:
                for measure, function in zip(measures, functions):
                    select_parts.append(f"{function}({measure}) AS {function}_{measure}")
            select_clause = ', '.join(select_parts)
            where_clause = f"WHERE {where_clause}" if where_clause else ""
            group_by_clause = f"GROUP BY {group_by_items}" if group_by_items else ""
            query = f"SELECT {group_by_items}, {select_clause} FROM {from_clause} {where_clause} {group_by_clause};"
            combined_queries.append(query)

    return combined_queries


def decomposeAggTable(dimensions_name, combined_dataframe, individual_list):
    measurements = set(col.split('_')[1] for col in combined_dataframe.columns if '_' in col)

    # Create a dictionary to store each separate DataFrame
    separated_tables = {}

    for column in combined_dataframe.columns:
        if column != dimensions_name:
            # Properly split the column name
            parts = column.split('_')
            func = parts[0]  # The function is always the first part
            measure = '_'.join(parts[1:])  # The rest is the measurement name

            table_name = f"{func}_{measure}"  # Create a unique table name

            # Create a new DataFrame for this specific measurement and function
            separated_tables[table_name] = combined_dataframe[[dimensions_name, column]].copy()
            separated_tables[table_name].rename(columns={column: table_name}, inplace=True)

    # Printing or exporting the separated tables
    for table_name, df in separated_tables.items():
        individual_list.append(df)
    
    return 


In [11]:
import itertools

dimensions = ['workclass', 'education', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']
measurements = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
aggregate_functions = ['count', 'max', 'min', 'sum', 'avg']

views = {}
index = 0

for dimension, measurement, function in itertools.product(dimensions, measurements, aggregate_functions):
    views[index] = (dimension, measurement, function)
    index += 1

## Generate the query

In [13]:
sql_queries = []
married_query_list = []
unmarried_query_list = []

for key, (attribute,measurement,function) in views.items():
    sql_married = f"SELECT {attribute}, {function}({measurement}) AS {function}_{measurement} FROM married_data WHERE {attribute} IS NOT NULL GROUP BY {attribute};"
    sql_unmarried = f"SELECT {attribute}, {function}({measurement}) AS {function}_{measurement} FROM unmarried_data WHERE {attribute} IS NOT NULL GROUP BY {attribute};"
    sql_queries.append((sql_married, sql_unmarried))

for married_query, unmarried_query in sql_queries:
    print("Married Data Query:", married_query)
    married_query_list.append(married_query.rstrip(';'))
    print("Unmarried Data Query:", unmarried_query)
    unmarried_query_list.append(unmarried_query.rstrip(';'))
    
query_count = len(sql_queries)
print("Generated queries count:", query_count * 2)

Married Data Query: SELECT workclass, count(age) AS count_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, count(age) AS count_age FROM unmarried_data WHERE workclass IS NOT NULL GROUP BY workclass;
Married Data Query: SELECT workclass, max(age) AS max_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, max(age) AS max_age FROM unmarried_data WHERE workclass IS NOT NULL GROUP BY workclass;
Married Data Query: SELECT workclass, min(age) AS min_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, min(age) AS min_age FROM unmarried_data WHERE workclass IS NOT NULL GROUP BY workclass;
Married Data Query: SELECT workclass, sum(age) AS sum_age FROM married_data WHERE workclass IS NOT NULL GROUP BY workclass;
Unmarried Data Query: SELECT workclass, sum(age) AS sum_age FROM unmarried_data WHERE workclass IS NOT NULL GROU

## Combine Group By Query -- Married/Unmarried Data

In [18]:
# function to decompose the group by query

def calculate_mean_from_sum_count(sum_val, count_val):
    return sum_val / count_val if count_val != 0 else 0

def build_aggregation_dict(df):
    # dummy column for avg
    agg_dict = {
        'count':'sum',
        'max':'max',
        'min':'min',
        'sum':'sum',
        'avg':'mean' # dummy 
    }

    agg_funcs = {}
    mean_col_list = []

    for col in df.columns:
        if '_' in col and col!='native_country':
            func_suffix, base_name = col.split('_', 1)
            agg_funcs[col] = agg_dict[func_suffix]
    return agg_funcs


def decomposeGroupBy(combined_group_by_df, dimension_name):
    agg_funcs = build_aggregation_dict(combined_group_by_df)
        
    non_avg_grouped_df = combined_group_by_df.groupby(dimension_name, as_index=False).agg(agg_funcs)
    
    for col in non_avg_grouped_df.columns:
        if '_' in col:
            if 'avg' in col:
                _, base_name = col.split('_', 1)

                non_avg_grouped_df['avg_'+ base_name ] = non_avg_grouped_df['sum_'+ base_name] / non_avg_grouped_df['count_'+base_name]
    
    return non_avg_grouped_df

## Generate and Decompose the aggregation table

In [19]:
def share_opt():
    # ---- combine aggregation    
    combined_queries_married = combineAggregates(married_query_list)
    combined_queries_unmarried = combineAggregates(unmarried_query_list)

    # ---- combine group by    
    # generate the combined query 1.
    select_items, from_clause, where_clause, group_by_items = parseQuery(combined_queries_married[0])

    remove_dimension = select_items[11:]

    combined_group_by_selection = remove_dimension

    for each_dimension in dimensions:
        combined_group_by_selection = each_dimension + ', ' + combined_group_by_selection


    combined_group_by_items = ''

    for each_dimension in dimensions:
        combined_group_by_items = each_dimension + ', ' + combined_group_by_items

    combined_group_by_items = combined_group_by_items.rstrip(', ')

    married_final_combined_query = 'SELECT ' + combined_group_by_selection + ' FROM ' + from_clause + ' WHERE ' + where_clause + ' GROUP BY ' + combined_group_by_items



    # generate the combined query 2.
    select_items, from_clause, where_clause, group_by_items = parseQuery(combined_queries_unmarried[0])

    remove_dimension = select_items[11:]

    combined_group_by_selection = remove_dimension

    for each_dimension in dimensions:
        combined_group_by_selection = each_dimension + ', ' + combined_group_by_selection


    combined_group_by_items = ''

    for each_dimension in dimensions:
        combined_group_by_items = each_dimension + ', ' + combined_group_by_items

    combined_group_by_items = combined_group_by_items.rstrip(', ')

    unmarried_final_combined_query = 'SELECT ' + combined_group_by_selection + ' FROM ' + from_clause + ' WHERE ' + where_clause + ' GROUP BY ' + combined_group_by_items

    # ------------------------

    agg_opt_res = []

    # fetch the data
    married_final_combined_data = pd.read_sql_query(married_final_combined_query, engine)
    unmarried_final_combined_data = pd.read_sql_query(unmarried_final_combined_query, engine)

    for i in range(len(dimensions)):

        df_married_agg_opt = decomposeGroupBy(married_final_combined_data,dimensions[i])
        df_unmarried_agg_opt = decomposeGroupBy(unmarried_final_combined_data,dimensions[i])
        agg_opt_res.append((df_married_agg_opt, df_unmarried_agg_opt))


    de_married_list = []
    de_unmarried_list = []

    for i in range(len(agg_opt_res)):
        dimensions_name = agg_opt_res[i][0].columns[0]
        married_combined_df = agg_opt_res[i][0]
        unmarried_combined_df = agg_opt_res[i][1]

        decomposeAggTable(dimensions_name, married_combined_df, de_married_list)
        decomposeAggTable(dimensions_name, unmarried_combined_df, de_unmarried_list)



    view_scores = {}  

    for vid, (attribute, measurement, function) in views.items():
        df_married = de_married_list[vid]
        df_unmarried = de_unmarried_list[vid]

        all_attribute_values = set(df_married[attribute].unique()).union(set(df_unmarried[attribute].unique()))


        grouped_married = df_married.groupby(attribute).agg({f'{function}_{measurement}': 'sum'}).reindex(all_attribute_values, fill_value=0)
        grouped_unmarried = df_unmarried.groupby(attribute).agg({f'{function}_{measurement}': 'sum'}).reindex(all_attribute_values, fill_value=0)

        total_married = grouped_married[f'{function}_{measurement}'].sum()
        total_unmarried = grouped_unmarried[f'{function}_{measurement}'].sum()

        p = (grouped_married / total_married).fillna(0).values.flatten()
        q = (grouped_unmarried / total_unmarried).fillna(0).values.flatten()

        score = kl_divergence(p, q)
        view_scores[(vid, attribute, measurement, function)] = score

    top_scores = sorted(view_scores.items(), key=lambda x: x[1], reverse=True)[:5]
    for key, score in top_scores:
        print(f"View: {key}, Score: {score}")

# Baseline non-opt

In [20]:
def baseline():
    results = []
    for married_query, unmarried_query in sql_queries:
        df_married = pd.read_sql_query(married_query, engine)

        df_unmarried = pd.read_sql_query(unmarried_query, engine)

        results.append((df_married, df_unmarried))


    view_scores = {}  

    for vid, (attribute, measurement, function) in views.items():
        df_married = results[vid][0]
        df_unmarried = results[vid][1]

        all_attribute_values = set(df_married[attribute].unique()).union(set(df_unmarried[attribute].unique()))


        grouped_married = df_married.groupby(attribute).agg({f'{function}_{measurement}': 'sum'}).reindex(all_attribute_values, fill_value=0)
        grouped_unmarried = df_unmarried.groupby(attribute).agg({f'{function}_{measurement}': 'sum'}).reindex(all_attribute_values, fill_value=0)

        total_married = grouped_married[f'{function}_{measurement}'].sum()
        total_unmarried = grouped_unmarried[f'{function}_{measurement}'].sum()

        p = (grouped_married / total_married).fillna(0).values.flatten()
        q = (grouped_unmarried / total_unmarried).fillna(0).values.flatten()

        score = kl_divergence(p, q)
        view_scores[(vid, attribute, measurement, function)] = score

    top_scores = sorted(view_scores.items(), key=lambda x: x[1], reverse=True)[:5]
    for key, score in top_scores:
        print(f"View: {key}, Score: {score}")



In [21]:
import time


# Measure execution time of approach_one
start_time = time.time()
result_one = baseline()
end_time = time.time()
duration_one = end_time - start_time
print(f"Baseline Time: {duration_one} seconds")

# Measure execution time of approach_two
start_time = time.time()
result_two = share_opt()
end_time = time.time()
duration_two = end_time - start_time
print(f"Approach Two Execution Time: {duration_two} seconds")


View: (108, 'relationship', 'capital_gain', 'sum'), Score: 21.23297699554917
View: (113, 'relationship', 'capital_loss', 'sum'), Score: 20.444890552177746
View: (118, 'relationship', 'hours_per_week', 'sum'), Score: 19.310891503456112
View: (103, 'relationship', 'education_num', 'sum'), Score: 19.1626653581741
View: (90, 'relationship', 'age', 'count'), Score: 18.812311596760697
Baseline Time: 2.734830141067505 seconds
View: (108, 'relationship', 'capital_gain', 'sum'), Score: 21.295728232615392
View: (113, 'relationship', 'capital_loss', 'sum'), Score: 20.615982942635775
View: (118, 'relationship', 'hours_per_week', 'sum'), Score: 19.449258775251344
View: (103, 'relationship', 'education_num', 'sum'), Score: 19.334507209588587
View: (90, 'relationship', 'age', 'count'), Score: 19.009414439539466
Approach Two Execution Time: 0.4519309997558594 seconds
