In [331]:
import psycopg2
import pandas as pd

def query(
    query_text, 
    host = "localhost",
    port = 5432,
    database = "postgres",
    user = "postgres",
    password = "postgres"):
    
    with psycopg2.connect(
                        host=host,
                        port=port,
                        database=database,
                        user=user,
                        password=password
                    ) as conn:
        cur = conn.cursor()
        cur.execute(query_text)
        results = cur.fetchall()
        column_names = [desc[0] for desc in cur.description]

    return_df = pd.DataFrame.from_records(results)
    return_df.columns = column_names
    return return_df


class PkScout:
    
    def __init__(self, table_name):
        self.table_name = table_name
        
        info_schema_query = f"""
            SELECT 
            column_name, 
            data_type, 
            is_nullable, 
            character_maximum_length, 
            numeric_precision, 
            numeric_scale
            FROM information_schema.columns
            WHERE table_schema = 'public'
            AND table_name = '{self.table_name}';
        """

        self.info_schema_df = query(info_schema_query)
        self.table_row_count = int(query(
            f"""
            Select count(1) as ct 
            from (Select distinct * from {self.table_name}) foo
            """
        )['ct'].iloc[0])

    def get_column_cardinalities(self, descending=True):
        column_cardinalities = []
        for column in self.info_schema_df['column_name']:
            column_cardinality = int(
                query(
                    f'Select count(distinct "{column}") as ct from {self.table_name}'
                )['ct'].iloc[0]
            )
            
            column_cardinalities.append((column, column_cardinality))

        return sorted(column_cardinalities, key=lambda x: x[1], reverse=descending)

    def compute_joint_cardinality(self, columns):
        columns = [f'"{name}"' for name in columns]

        return int(query(
                    f"""
                        Select count(1) as ct
                        from (
                            Select distinct {','.join(columns)}
                            from {self.table_name}
                        ) as foo
                    """
                )['ct'].iloc[0])

    def find_pk(self, descending=True):
        column_cardinalities = self.get_column_cardinalities(descending)

        joint_cardinalities = []
        for index, (column, cardinality) in enumerate(column_cardinalities):
            if cardinality == self.table_row_count:
                print(f"{column} is a candidate: Same cardinality as table row count = {self.table_row_count}")

            for i in range(1, len(column_cardinalities)-index-1):
                joint_columns = [name for name, _ in column_cardinalities[index:index+i+1]]
                joint_cardinality = self.compute_joint_cardinality(joint_columns)

                joint_cardinalities.append(
                    (
                        tuple([x.replace('"','') for x in joint_columns]),
                        joint_cardinality
                    )
                )
                
                if joint_cardinality == self.table_row_count:
                    print(f"{joint_columns} are a candidate: Same cardinality as table row count = {self.table_row_count}")
                    break


        return joint_cardinalities
                    

            

            

In [19]:
TABLES = [
    'rugs_usa_category_map',
    'rugs_usa_color_map',
    'rugs_usa_links',
    'rugs_usa_pads_upsell',
    'rugs_usa_parent',
    'rugs_usa_variant'
]

In [21]:
for table in TABLES:
    schema_query = f"""
    SELECT 
        column_name, 
        data_type, 
        is_nullable, 
        character_maximum_length, 
        numeric_precision, 
        numeric_scale
    FROM information_schema.columns
    WHERE table_schema = 'public'
    AND table_name = '{table}';
    """
    print(f"*****{table}*****")
    display(
        query(
            schema_query
        )
    )

*****rugs_usa_category_map*****


Unnamed: 0,column_name,data_type,is_nullable,character_maximum_length,numeric_precision,numeric_scale
0,pid,text,YES,,,
1,category_id,text,YES,,,
2,category_value,text,YES,,,
3,dw_insert_timestamp,text,YES,,,


*****rugs_usa_color_map*****


Unnamed: 0,column_name,data_type,is_nullable,character_maximum_length,numeric_precision,numeric_scale
0,pid,text,YES,,,
1,color_id,text,YES,,,
2,color_value,text,YES,,,
3,dw_insert_timestamp,text,YES,,,


*****rugs_usa_links*****


Unnamed: 0,column_name,data_type,is_nullable,character_maximum_length,numeric_precision,numeric_scale
0,base_url,text,YES,,,
1,page_links,text,YES,,,
2,card_links,text,YES,,,
3,request_time_stamp,text,YES,,,
4,dw_insert_timestamp,text,YES,,,


*****rugs_usa_pads_upsell*****


Unnamed: 0,column_name,data_type,is_nullable,character_maximum_length,numeric_precision,numeric_scale
0,size,text,YES,,,
1,shape,text,YES,,,
2,price,double precision,YES,,53.0,
3,width,double precision,YES,,53.0,
4,sqft,double precision,YES,,53.0,
5,type,text,YES,,,
6,stock,integer,YES,,32.0,0.0
7,height,double precision,YES,,53.0,
8,variant,text,YES,,,
9,p_id,text,YES,,,


*****rugs_usa_parent*****


Unnamed: 0,column_name,data_type,is_nullable,character_maximum_length,numeric_precision,numeric_scale
0,pid,text,YES,,,
1,product_type_id,text,YES,,,
2,name,text,YES,,,
3,url,text,YES,,,
4,origin,text,YES,,,
5,thickness,text,YES,,,
6,material,text,YES,,,
7,weave,integer,YES,,32.0,0.0
8,weave_feature,text,YES,,,
9,color,text,YES,,,


*****rugs_usa_variant*****


Unnamed: 0,column_name,data_type,is_nullable,character_maximum_length,numeric_precision,numeric_scale
0,pid,text,YES,,,
1,variant,text,YES,,,
2,actual_size,text,YES,,,
3,weave_feature,text,YES,,,
4,weave_cat,text,YES,,,
5,size_grp,text,YES,,,
6,shipping_size,text,YES,,,
7,shape,text,YES,,,
8,weight,double precision,YES,,53.0,
9,price,double precision,YES,,53.0,


# DW Insert Timestamp?

In [548]:
for table in TABLES:
    print(table)
    display(query(f"Select distinct dw_insert_timestamp from {table}"))
    display(query(f"Select distinct dw_insert_timestamp::timestamptz from {table}"))


rugs_usa_category_map


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 19:33:41.514826


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 19:33:41.514826+00:00


rugs_usa_color_map


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 19:33:41.514414


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 19:33:41.514414+00:00


rugs_usa_links


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 19:33:41.512911


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 19:33:41.512911+00:00


rugs_usa_pads_upsell


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 20:44:54.227965


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 20:44:54.227965+00:00


rugs_usa_parent


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 19:33:41.513542


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 19:33:41.513542+00:00


rugs_usa_variant


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 19:33:41.513970


Unnamed: 0,dw_insert_timestamp
0,2022-01-13 19:33:41.513970+00:00


# Rugs usa variant

In [402]:
scout = PkScout('rugs_usa_variant')
joint = scout.find_pk()

['msrp', 'pid', 'price', 'stock_level', 'weight', 'shipping_size', 'variant'] are a candidate: Same cardinality as table row count = 38569
['pid', 'price', 'stock_level', 'weight', 'shipping_size', 'variant'] are a candidate: Same cardinality as table row count = 38569


In [403]:
scout.compute_joint_cardinality(['pid', 'variant'])

38569

In [404]:
scout.compute_joint_cardinality(['variant'])

587

In [473]:
scout.compute_joint_cardinality(['actual_size', 'shape'])

536

In [457]:
df = query(
    """
    Select variant, count(distinct pid),
    count(distinct actual_size), count(distinct shape), count(distinct weight)
    from rugs_usa_variant group by 1 order by 2 desc""")

In [464]:
df = query(
    """
    Select variant, pid,
    count(distinct actual_size), count(distinct shape), count(distinct weight)
    from rugs_usa_variant group by 1,2 order by 5 desc""")

In [465]:
df

Unnamed: 0,variant,pid,count,count.1,count.2
0,10,108BT19,1,1,1
1,10,108BT29,1,1,1
2,10,108CA09,1,1,1
3,10,108CA99,1,1,1
4,10,108CC29,1,1,1
...,...,...,...,...,...
38564,SAMPLE,108SM501,1,1,1
38565,SAMPLE,108SM601,1,1,1
38566,SAMPLE,108SM801,1,1,1
38567,SAMPLE,108TL50,1,1,1


In [550]:
for col in ['actual_size', 'size_grp', 'weave_cat', 'shape', 'origin']:
    display(query(f"Select coalesce(cast({col} as varchar), 'missing') as {col}, count(1) from rugs_usa_variant group by 1 order by 2 desc").head(50))

Unnamed: 0,actual_size,count
0,5_ft x 8_ft,2741
1,4_ft x 6_ft,2329
2,8_ft x 10_ft,2204
3,9_ft x 12_ft,1476
4,3_ft x 5_ft,1369
5,6_ft 7_in x 9_ft,1298
6,2_ft x 3_ft,1059
7,6_ft x 9_ft,1048
8,8_ft,962
9,6_ft,924


Unnamed: 0,size_grp,count
0,5x8,6767
1,Runner,6164
2,8x10,5688
3,6x9,5132
4,3x5,3270
5,4x6,3198
6,2x3,2944
7,9x12,2908
8,10x14,1485
9,Oversize,1079


Unnamed: 0,weave_cat,count
0,Machine Made,23855
1,Braided,8225
2,Hand Tufted,4314
3,Flatweave,1949
4,Hand Hooked,341
5,Hand Knotted,109
6,missing,7


Unnamed: 0,shape,count
0,Rectangle,24199
1,Runner,5923
2,Round,3075
3,Oval,3005
4,Square,2294
5,Octagon,108
6,Stair Tread,62
7,Shaped,51
8,Oval Stair Tread,43
9,Hearth,28


Unnamed: 0,origin,count
0,missing,38189
1,China,371
2,Turkey,132
3,SAU,108


In [554]:
query("Select status, max(stock_level), min(stock_level), max(depletion_level), min(depletion_level) from rugs_usa_variant group by 1")

Unnamed: 0,status,max,min,max.1,min.1
0,In_stock,2147483647,1,1008.0,0.0
1,Back_ordered,0,-251,266.0,0.0
2,Pre_order,0,-48,,
3,Out_of_stock,6,-73,42.0,0.0


In [558]:
query("Select round(stock_level,-2), round(depletion_level,-2), status, count(1) from rugs_usa_variant group by 1,2,3 order by 4 desc").head(20)

Unnamed: 0,round,round.1,status,count
0,0,,In_stock,8680
1,0,0.0,In_stock,7819
2,100,,In_stock,6321
3,0,,Out_of_stock,4080
4,200,,In_stock,3096
5,100,0.0,In_stock,2709
6,0,0.0,Out_of_stock,1540
7,2147483600,,In_stock,985
8,0,0.0,Back_ordered,829
9,200,0.0,In_stock,681


In [564]:
query("Select distinct coalesce(new_arrival, 'foo') from rugs_usa_variant")

Unnamed: 0,coalesce
0,Y
1,N


# Rugs usa parent

In [547]:
query("Select * from rugs_usa_parent")

Unnamed: 0,pid,product_type_id,name,url,origin,thickness,material,weave,weave_feature,color,brand,imageName,imageType,internalName,category,min_price,max_price,availability,aggregate,clearance,long_description,shopbyroom,dw_insert_timestamp
0,200TAJT03,RUG,Jute Braided,/rugsusa/rugs/rugs-usa-jute-braided/Natural/20...,India,1/4 inch,100% Jute,,Hand Woven,Natural,Rugs USA,200TAJT03,roomImage,rugs-usa-jute-braided,Maui,0,0,48 Hours,True,N,This handmade 100% jute rug is a stylish and e...,"Bedroom, Dining Room, Living Room, Hallway, Of...",2022-01-13 19:33:41.513542
1,200CB01,RUG,Veronica Wool Braided,/rugsusa/rugs/rugs-usa-veronica-wool-braided/O...,India,1/2 inch,"80% Wool, 20% Cotton",,Braided,Off White,Rugs USA,200CB01,roomImage,rugs-usa-veronica-wool-braided,Textures,0,0,48 Hours,True,N,Handcrafted in the style of a chunky knit swea...,"Bedroom, Living Room",2022-01-13 19:33:41.513542
2,200HMMT01A,RUG,Handwoven Chaste,/rugsusa/rugs/rugs-usa-handwoven-chaste/Natura...,India,1/4 inch,"60% Jute, 40% Cotton",,Hand Loomed,Natural,Rugs USA,200HMMT01A,roomImage,rugs-usa-handwoven-chaste,Fawna,0,0,48 Hours,True,N,Handmade with luxuriously soft jute and cotton...,"Bedroom, Dining Room, Living Room, Hallway, Of...",2022-01-13 19:33:41.513542
3,200RZBD16A,RUG,Moroccan Trellis,/rugsusa/rugs/rugs-usa-moroccan-trellis/Gray/2...,Turkey,1/2 inch,100% Polypropylene,,Machine Made,Gray,Rugs USA,200RZBD16A,roomImage,rugs-usa-moroccan-trellis,Bosphorus,0,0,48 Hours,True,N,"<p class=""h4"">Named ""The Rug"" by The New York ...","Bedroom, Dining Room, Living Room, Hallway, Of...",2022-01-13 19:33:41.513542
4,200TAJT03,RUG,Jute Braided,/rugsusa/rugs/rugs-usa-jute-braided/Off-White/...,India,1/4 inch,100% Jute,,Hand Woven,Natural,Rugs USA,200TAJT03,roomImage,rugs-usa-jute-braided,Maui,0,0,48 Hours,True,N,This handmade 100% jute rug is a stylish and e...,"Bedroom, Dining Room, Living Room, Hallway, Of...",2022-01-13 19:33:41.513542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5557,108BN19,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,Brownstone,Colonial Mills,108BN19,s,colonial-mills-braided-indooroutdoor,Brooklyn,0,0,5 Days,True,N,A combination of alternating braid constructio...,,2022-01-13 19:33:41.513542
5558,108BN19,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,Brownstone,Colonial Mills,108BN19,s,colonial-mills-braided-indooroutdoor,Brooklyn,0,0,5 Days,True,N,A combination of alternating braid constructio...,,2022-01-13 19:33:41.513542
5559,108BN19,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,Brownstone,Colonial Mills,108BN19,s,colonial-mills-braided-indooroutdoor,Brooklyn,0,0,5 Days,True,N,A combination of alternating braid constructio...,,2022-01-13 19:33:41.513542
5560,108BN19,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,Brownstone,Colonial Mills,108BN19,s,colonial-mills-braided-indooroutdoor,Brooklyn,0,0,5 Days,True,N,A combination of alternating braid constructio...,,2022-01-13 19:33:41.513542


In [513]:
scout = PkScout('rugs_usa_parent')
joint = scout.find_pk()

url is a candidate: Same cardinality as table row count = 5547
['url', 'pid'] are a candidate: Same cardinality as table row count = 5547


In [540]:
scout.compute_joint_cardinality(['pid', 'name', 'color'])

3404

In [543]:
query(
    "Select distinct pid, split_part(url, '/', 5) from rugs_usa_parent" 
)

Unnamed: 0,pid,split_part
0,1328007A,Thyme
1,200ZHSS01A,Black
2,1520301350,Red-Black
3,132332C,Blue
4,139182803,Green
...,...,...
5542,200GBCB15A,Blue
5543,200HJAM02A,Gray
5544,287FA170B,Dark-Gray
5545,200KKEL01A,Blue


In [483]:
for col in ['product_type_id']:
    display(query(f"Select {col}, count(1) from rugs_usa_parent group by 1 order by 2 desc"))

Unnamed: 0,product_type_id,count
0,RUG,5562


In [565]:
for col in ['origin', 'thickness', 'material', 'weave', 'weave_feature', 'availability']:
    display(query(f"Select coalesce(cast({col} as varchar), 'missing') as {col}, count(1) from rugs_usa_parent group by 1 order by 2 desc"))

Unnamed: 0,origin,count
0,Turkey,2329
1,India,1369
2,USA,864
3,China,559
4,Egypt,271
5,Belgium,116
6,Saudi Arabia,22
7,China/Turkey,13
8,China/India,7
9,Bangladesh,6


Unnamed: 0,thickness,count
0,missing,2553
1,1/2 inch,1329
2,1/4 inch,1171
3,1 inch,137
4,3/4 inch,134
5,1 1/4 inch,128
6,2 inch,45
7,1 3/4 inch,29
8,1 1/4,22
9,1 1/2 inch,14


Unnamed: 0,material,count
0,100% Polypropylene,1911
1,100% Polyester,760
2,100% Wool,431
3,100% Nylon,303
4,100% Olefin,177
...,...,...
268,"Polypropylene, Polyester",1
269,"50% Jute, 50% Leather",1
270,Olefin,1
271,100% Recycled Polyester Flock Face,1


Unnamed: 0,weave,count
0,missing,5562


Unnamed: 0,weave_feature,count
0,Machine Made,3545
1,Hand Tufted,439
2,Braided,298
3,Machine Woven,249
4,Hand Woven,236
5,Flatweave,189
6,Handmade,163
7,Hand Loomed,112
8,Power Loomed,61
9,Hand Hooked,48


Unnamed: 0,availability,count
0,48 Hours,3626
1,3 - 5 Days,841
2,1-3 Days,309
3,14 - 16 Days,267
4,5 Days,215
5,3-5 Days,155
6,12 - 14 Days,58
7,7 - 10 Days,40
8,3 - 5 Business Days,18
9,2-3 Days,12


## Url, pid, color

In [512]:
query("Select url, color, split_part(url, '/', 5) from rugs_usa_parent")

Unnamed: 0,url,color,split_part
0,/rugsusa/rugs/rugs-usa-jute-braided/Natural/20...,Natural,Natural
1,/rugsusa/rugs/rugs-usa-veronica-wool-braided/O...,Off White,Off-White
2,/rugsusa/rugs/rugs-usa-handwoven-chaste/Natura...,Natural,Natural
3,/rugsusa/rugs/rugs-usa-moroccan-trellis/Gray/2...,Gray,Gray
4,/rugsusa/rugs/rugs-usa-jute-braided/Off-White/...,Natural,Off-White
...,...,...,...
5557,/rugsusa/rugs/colonial-mills-braided-indoorout...,Brownstone,Natural
5558,/rugsusa/rugs/colonial-mills-braided-indoorout...,Brownstone,Terracotta
5559,/rugsusa/rugs/colonial-mills-braided-indoorout...,Brownstone,Blue-Haze
5560,/rugsusa/rugs/colonial-mills-braided-indoorout...,Brownstone,Slate


In [363]:
df['color2'] = df['url'].str.split("/").apply(lambda x: x[-2])

In [364]:
df[['pid', 'color2']].drop_duplicates().shape

(5547, 2)

## min price, max price, availability

In [399]:
df[['min_price', 'max_price']].mean()

min_price    0.0
max_price    0.0
dtype: float64

In [400]:
df['availability'].value_counts()

availability
48 Hours               3617
3 - 5 Days              841
1-3 Days                308
14 - 16 Days            267
5 Days                  210
3-5 Days                155
12 - 14 Days             58
7 - 10 Days              40
3 - 5 Business Days      18
2-3 Days                 12
10-14 Days                8
14 - 21 Days              8
1 - 2 Days                3
5 - 7 Days                2
Name: count, dtype: int64

In [202]:
pd.set_option('display.max_columns', None)
query("Select * from rugs_usa_parent where pid = '108BR12'")

Unnamed: 0,pid,product_type_id,name,url,origin,thickness,material,weave,weave_feature,color,brand,imageName,imageType,internalName,category,min_price,max_price,availability,aggregate,clearance,long_description,shopbyroom,dw_insert_timestamp
0,108BR12,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,White,Colonial Mills,108BR12,roomImage,colonial-mills-braided-indooroutdoor,Boca Raton,0,0,5 Days,True,N,Just pick a color…any color…they are all here!...,,2022-01-13 19:33:41.513542
1,108BR12,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,White,Colonial Mills,108BR12,roomImage,colonial-mills-braided-indooroutdoor,Boca Raton,0,0,5 Days,True,N,Just pick a color…any color…they are all here!...,,2022-01-13 19:33:41.513542
2,108BR12,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,White,Colonial Mills,108BR12,roomImage,colonial-mills-braided-indooroutdoor,Boca Raton,0,0,5 Days,True,N,Just pick a color…any color…they are all here!...,,2022-01-13 19:33:41.513542
3,108BR12,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,White,Colonial Mills,108BR12,roomImage,colonial-mills-braided-indooroutdoor,Boca Raton,0,0,5 Days,True,N,Just pick a color…any color…they are all here!...,,2022-01-13 19:33:41.513542
4,108BR12,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,White,Colonial Mills,108BR12,roomImage,colonial-mills-braided-indooroutdoor,Boca Raton,0,0,5 Days,True,N,Just pick a color…any color…they are all here!...,,2022-01-13 19:33:41.513542
5,108BR12,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,White,Colonial Mills,108BR12,roomImage,colonial-mills-braided-indooroutdoor,Boca Raton,0,0,5 Days,True,N,Just pick a color…any color…they are all here!...,,2022-01-13 19:33:41.513542
6,108BR12,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,White,Colonial Mills,108BR12,roomImage,colonial-mills-braided-indooroutdoor,Boca Raton,0,0,5 Days,True,N,Just pick a color…any color…they are all here!...,,2022-01-13 19:33:41.513542
7,108BR12,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,White,Colonial Mills,108BR12,roomImage,colonial-mills-braided-indooroutdoor,Boca Raton,0,0,5 Days,True,N,Just pick a color…any color…they are all here!...,,2022-01-13 19:33:41.513542
8,108BR12,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,White,Colonial Mills,108BR12,roomImage,colonial-mills-braided-indooroutdoor,Boca Raton,0,0,5 Days,True,N,Just pick a color…any color…they are all here!...,,2022-01-13 19:33:41.513542
9,108BR12,RUG,Braided Indoor/Outdoor,/rugsusa/rugs/colonial-mills-braided-indoorout...,USA,,100% Polypropylene,,Braided,White,Colonial Mills,108BR12,roomImage,colonial-mills-braided-indooroutdoor,Boca Raton,0,0,5 Days,True,N,Just pick a color…any color…they are all here!...,,2022-01-13 19:33:41.513542


In [500]:
df = query(
    """
    with deduped_parent as (Select distinct * from rugs_usa_parent),
    deduped_variant as (Select distinct * from rugs_usa_variant)

    Select p.weave_feature, v.weave_cat, count(1)
    from deduped_parent p 
    full outer join deduped_variant v
    on p.pid = v.pid
    group by 1,2
    order by 2,3 desc
    """
)

In [501]:
df.head(50)

Unnamed: 0,weave_feature,weave_cat,count
0,,Braided,7107
1,Braided,Braided,5626
2,Hand Braided,Braided,888
3,Hand Woven,Braided,179
4,Flat Braid,Braided,95
5,Machine Braided,Braided,88
6,Hand Loomed,Braided,6
7,Handmade,Braided,5
8,Hand Woven,Flatweave,924
9,Flatweave,Flatweave,734


# Pads upsell

In [475]:
scout = PkScout('rugs_usa_pads_upsell')
joint = scout.find_pk()

['pad_id', 'p_id', 'price', 'variant', 'size', 'stock'] are a candidate: Same cardinality as table row count = 96703
['p_id', 'price', 'variant', 'size', 'stock'] are a candidate: Same cardinality as table row count = 96703


In [476]:
scout.compute_joint_cardinality(['p_id', 'pad_id', 'variant', 'stock'])

96703

In [478]:
scout.compute_joint_cardinality(['p_id', 'variant'])

33022

In [249]:
query("Select p_id, pad_id, variant, count(distinct stock) from rugs_usa_pads_upsell group by 1,2,3 order by 4 desc")

Unnamed: 0,p_id,pad_id,variant,count
0,108SM301,200AFPD01A-609,709O,2
1,108SM501,200JAPD1A-58088,709O,2
2,108SM301,200JAPD1A-58088,709O,2
3,108SM501,200AFPD01A-609,709O,2
4,158M342TEL,200AFPD01A-208,2608,2
...,...,...,...,...
96685,200MTVS176B,200MTVS176B-508,508,0
96686,200MTVS176B,200MTVS176B-609,609,0
96687,200MTVS176B,200MTVS176B-76096,76096,0
96688,200MTVS176B,200MTVS176B-R404,R404,0


In [479]:
query("Select * from rugs_usa_pads_upsell where p_id = '108SM301' and variant = '709O' and pad_id='200AFPD01A-609' ")

Unnamed: 0,size,shape,price,width,sqft,type,stock,height,variant,p_id,pad_id,dw_insert_timestamp
0,6' x 9',Rectangle,65.99,6.0,54.0,premium,4669,9.0,709O,108SM301,200AFPD01A-609,2022-01-13 20:44:54.227965
1,6' x 9',Rectangle,65.99,6.0,54.0,premium,4668,9.0,709O,108SM301,200AFPD01A-609,2022-01-13 20:44:54.227965


In [481]:
query("Select distinct * from rugs_usa_variant where pid='108SM301' and variant='709O' ")

Unnamed: 0,pid,variant,actual_size,weave_feature,weave_cat,size_grp,shipping_size,shape,weight,price,msrp,stock_level,depletion_level,low_stock,estimated_delivery_date,this_isd_range,status,origin,new_arrival,stockMsg,stockEddMsg,other_stock_core,other_stock_compass,dw_insert_timestamp
0,108SM301,709O,7_ft x 9_ft,,Braided,6x9,W ? x L ? x H ?,Oval,65.0,724.0,646.8,100,,False,,,In_stock,,N,,,0,0,2022-01-13 19:33:41.513970


# Links

In [274]:
scout = PkScout('rugs_usa_links')
joint = scout.find_pk()

request_time_stamp is a candidate: Same cardinality as table row count = 5565
['request_time_stamp', 'card_links'] are a candidate: Same cardinality as table row count = 5565


In [277]:
query("Select base_url, count(distinct page_links), count(distinct card_links) from rugs_usa_links group by 1")

Unnamed: 0,base_url,count,count.1
0,https://www.rugsusa.com/rugsusa/control/search...,93,5550


In [283]:
query("Select distinct base_url from rugs_usa_links").iloc[0].values

array(['https://www.rugsusa.com/rugsusa/control/search-rugs'],
      dtype=object)

In [286]:
query("Select distinct page_links from rugs_usa_links").iloc[0:50].values

array([['https://www.rugsusa.com/rugsusa/control/search-rugs?p=23'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=11'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=58'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=14'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=76'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=63'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=8'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=24'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=68'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=18'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=56'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=17'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=36'],
       ['https://www.rugsusa.com/rugsusa/control/search-rugs?p=21'],
       ['https://www.rugsusa.com/ru

In [299]:
query("Select page_links, count(distinct card_links) from rugs_usa_links group by 1")['count'].value_counts()

count
60    90
58     1
55     1
45     1
Name: count, dtype: int64

In [302]:
df = query("Select page_links, count(distinct card_links) from rugs_usa_links group by 1")
df[df['count'] == 45].page_links.values

array(['https://www.rugsusa.com/rugsusa/control/search-rugs?p=93'],
      dtype=object)

In [303]:
df[df['count'] == 55].page_links.values

array(['https://www.rugsusa.com/rugsusa/control/search-rugs?p=91'],
      dtype=object)

In [304]:
df[df['count'] == 58].page_links.values

array(['https://www.rugsusa.com/rugsusa/control/search-rugs?p=84'],
      dtype=object)

In [282]:
query("Select distinct card_links from rugs_usa_links").iloc[1].values

array(['https://www.rugsusa.com//rugsusa/rugs/rugs-usa-retro-aztec-indooroutdoor/Rust/200HJOA07A-P.html'],
      dtype=object)

# Color map

In [310]:
scout = PkScout('rugs_usa_color_map')
joint = scout.find_pk()

color_id is a candidate: Same cardinality as table row count = 5640
['color_id', 'pid'] are a candidate: Same cardinality as table row count = 5640
['pid', 'color_value'] are a candidate: Same cardinality as table row count = 5640


In [312]:
scout.compute_joint_cardinality(['pid'])

3404

In [340]:
query("Select color_value, count(distinct color_id), count(distinct pid), count(1) from rugs_usa_color_map group by 1 order by 2").head(50)

Unnamed: 0,color_value,count,count.1,count.2
0,Red Black,1,1,8
1,Adobe,1,1,5
2,Aero,1,1,6
3,African Plain,1,1,7
4,Gunmetal,1,1,9
5,Harbor Gray,1,1,1
6,Harvest Ebony,1,1,5
7,Hazy Blue,1,1,7
8,Hazy Forest,1,1,5
9,Hearthstone,1,1,9


# Category map

In [544]:
scout = PkScout('rugs_usa_category_map')
joint = scout.find_pk()

['pid', 'category_id'] are a candidate: Same cardinality as table row count = 31758


In [545]:
scout.compute_joint_cardinality(['pid', 'category_id'])

31758

In [546]:
query("Select * from rugs_usa_category_map")

Unnamed: 0,pid,category_id,category_value,dw_insert_timestamp
0,200TAJT03,11265,Maui,2022-01-13 19:33:41.514826
1,200TAJT03,12199,Solid & Striped,2022-01-13 19:33:41.514826
2,200TAJT03,12327,Serendipity,2022-01-13 19:33:41.514826
3,200TAJT03,17951,Casuals,2022-01-13 19:33:41.514826
4,200TAJT03,7001,Casuals,2022-01-13 19:33:41.514826
...,...,...,...,...
51260,108BN19,13030,Brooklyn,2022-01-13 19:33:41.514826
51261,108BN19,7104,Outdoor,2022-01-13 19:33:41.514826
51262,108BN19,9966,Casuals,2022-01-13 19:33:41.514826
51263,108BN19,COLONIAL_MILLS_PROMO_CAT,Colonial Mills Promo Cat,2022-01-13 19:33:41.514826


# What is PID?

In [343]:
df = query(
    """
    Select pid, 'rugs_usa_parent' as tbl
    from rugs_usa_parent
    union
    Select pid, 'rugs_usa_variant' as tbl
    from rugs_usa_variant
    union
    Select p_id as pid, 'rugs_usa_pads_upsell' as tbl
    from rugs_usa_pads_upsell
    """
)

In [347]:
df.groupby('pid').apply(lambda x: "|".join(sorted(x['tbl']))).value_counts()

  df.groupby('pid').apply(lambda x: "|".join(sorted(x['tbl']))).value_counts()


rugs_usa_pads_upsell|rugs_usa_parent|rugs_usa_variant    3222
rugs_usa_pads_upsell|rugs_usa_variant                    2313
rugs_usa_parent                                           165
rugs_usa_parent|rugs_usa_variant                           17
rugs_usa_variant                                            3
Name: count, dtype: int64

# What is variant

In [353]:
df = query(
    """
    Select variant, 'rugs_usa_variant' as tbl
    from rugs_usa_variant
    union
    Select variant, 'rugs_usa_pads_upsell' as tbl
    from rugs_usa_pads_upsell
    """
)

In [355]:
df.groupby('variant').apply(lambda x: "|".join(sorted(x['tbl']))).value_counts()

  df.groupby('variant').apply(lambda x: "|".join(sorted(x['tbl']))).value_counts()


rugs_usa_pads_upsell|rugs_usa_variant    557
rugs_usa_variant                          30
Name: count, dtype: int64