In [1]:
import re
import sys
import pandas as pd
# Lift the column display cap so wide frames are shown without truncation.
pd.options.display.max_columns = None

In [2]:
# Add the directory two levels up to the import search path.
# NOTE: '../..' is relative, so it resolves against the kernel's current
# working directory rather than this notebook's location.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if '../..' not in sys.path:
    sys.path.append('../..')

In [3]:
# Project-local import; it sits after the sys.path.append('../..') cell,
# presumably because the `ingestion` package is only importable once that
# path has been added — TODO confirm.
from ingestion.ingestion_mediator import IngestionMediator
# Single mediator instance reused by later cells to query manifest rows and
# resolve their file paths.
mediator = IngestionMediator()

  """)


In [4]:
# Get current raw data files for source
# SOURCE is the value passed as the `sources` filter when the mediator is
# queried for manifest rows in get_csv_file_path.
SOURCE = 'backcountry'
def get_csv_file_path(specs=True):
    """Return the raw-data file path for one of SOURCE's tables.

    Args:
        specs: When truthy, look up the 'product_specs' table; otherwise
            look up the 'products' table.

    Returns:
        Whatever ``mediator.get_filepath_for_manifest_row`` returns for the
        first manifest row matching SOURCE and the chosen table name.
    """
    tablename = 'product_specs' if specs else 'products'
    matching_rows = mediator.get_rows_matching(sources=[SOURCE], tablenames=[tablename])
    # Only the first matching manifest row is used.
    first_row = matching_rows[0]
    return mediator.get_filepath_for_manifest_row(row=first_row)

# Resolve and echo the products path first, then the specs path.
csv_prods = get_csv_file_path(specs=False)
print('Prods csv path:', csv_prods)
csv_specs = get_csv_file_path(specs=True)
print('Specs csv path:', csv_specs)

# Load both CSVs into DataFrames (products first, specs second).
prods_df, specs_df = (pd.read_csv(csv_path) for csv_path in (csv_prods, csv_specs))

Prods csv path: /home/johnny/jk-apps/bike-price-predictor/data/raw_data/01112020/backcountry_prods_all.csv
Specs csv path: /home/johnny/jk-apps/bike-price-predictor/data/raw_data/01112020/backcountry_specs_all.csv


In [5]:
# Sanity checks before merging prods_df with specs_df: compare row counts and
# list the columns both frames share (the merge needs at least 'product_id').
row_counts_match = len(prods_df) == len(specs_df)
print('Same number of rows:', row_counts_match)

prods_columns, specs_columns = set(prods_df.columns), set(specs_df.columns)
common_cols = prods_columns & specs_columns
print('Intersecting columns:', common_cols)

# Product ids present in only one of the two frames; an empty set means both
# frames cover exactly the same products.
prods_ids = set(prods_df.product_id.unique())
specs_ids = set(specs_df.product_id.unique())
prods_ids.symmetric_difference(specs_ids)

Same number of rows: True
Intersecting columns: {'product_id', 'site'}


set()

In [6]:
# Merge on common columns.
# common_cols is a set, so list(common_cols) can come out in a different order
# from one kernel session to the next (string hashing is randomized). An outer
# merge sorts rows lexicographically by the join keys in the order given, so
# sorted() pins the key order and makes the merged row order reproducible.
merged_df = prods_df.merge(right=specs_df, how='outer', on=sorted(common_cols))
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 403 entries, 0 to 402
Data columns (total 75 columns):
site                      403 non-null object
bike_type                 403 non-null object
subtype                   403 non-null object
product_id                403 non-null object
href                      403 non-null object
brand                     403 non-null object
description               403 non-null object
price                     403 non-null float64
msrp                      403 non-null float64
tires                     330 non-null object
bar_tape                  46 non-null object
battery_type              6 non-null object
computer                  4 non-null object
handlebar_drop            36 non-null object
brakeset                  331 non-null object
claimed_weight            80 non-null object
iscg_tabs                 233 non-null object
pedals                    202 non-null object
crank_arm_length          208 non-null object
details                   

In [8]:
# Compare bike_type against subtype: count the non-null entries of every
# column within each (bike_type, subtype) pair.
grouping_keys = ['bike_type', 'subtype']
grouped = merged_df.groupby(by=grouping_keys)
grouped.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,site,product_id,href,brand,description,price,msrp,tires,bar_tape,battery_type,computer,handlebar_drop,brakeset,claimed_weight,iscg_tabs,pedals,crank_arm_length,details,crankset,handlebar_width,handlebar_sweep,stem_length,front_derailleur,fork,seatpost,cable_routing,seatpost_diameter,wheelset,headset_included,rotors,fork_travel,saddle,head_tube_diameter,modes,rear_shock,chainring_sizes,skewers,tire_size,hubs,cassette,front_axle,seat_collar,dropouts,rear_derailleur,rear_axle,cassette_range,wheel_size,derailleur_pull,max_speed,grips,stem,drive_system,battery_energy,manufacturer_warranty,front_travel,headset,rear_travel,suspension,shifters,bottom_bracket,brake_type,recommended_use,extras,front_derailleur_mount,motor_power,max_torque,compatible_components,frame_material,max_range,chain,handlebar_rise,bottom_bracket_type,handlebar
bike_type,subtype,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1
ebikes,emountain_bikes,2,2,2,2,2,2,2,2,0,2,2,0,2,0,0,2,0,2,2,2,0,2,0,2,2,0,0,2,0,2,0,2,0,2,2,0,0,2,2,2,2,2,0,2,2,2,0,0,2,2,2,2,2,2,2,2,2,2,2,0,2,2,1,0,2,2,0,2,2,2,1,0,2
ebikes,eroad_bikes,3,3,3,3,3,3,3,3,1,3,1,0,3,2,0,2,1,3,3,1,0,1,3,2,3,0,0,3,0,2,0,2,0,1,0,3,0,2,1,3,3,0,0,3,3,3,0,0,3,0,3,3,1,3,0,3,0,0,3,3,3,3,0,0,3,1,0,3,1,3,0,2,3
ebikes,eutility_bikes,1,1,1,1,1,1,1,1,0,1,1,0,1,0,0,1,0,1,1,1,0,0,0,1,1,0,0,1,0,1,0,1,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,1,1,1,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,1,1,1,0,0,1
gravel_cyclocross_bikes,complete_gravel_cyclocross_bikes,47,47,47,47,47,47,47,47,25,0,0,17,47,4,0,29,12,47,47,12,0,11,20,47,47,0,0,46,0,37,0,47,0,0,0,35,0,47,19,47,42,8,0,46,42,35,0,0,0,0,47,0,0,47,0,46,0,0,47,37,47,47,0,0,0,0,0,47,0,47,0,34,47
mountain_bikes,downhill_bikes,3,3,3,3,3,3,3,3,0,0,0,0,3,2,3,2,3,3,3,2,0,1,0,3,3,0,0,3,0,3,0,3,0,0,3,3,0,3,3,3,3,2,0,3,3,3,0,0,0,3,3,0,0,3,3,3,3,3,3,3,3,3,2,0,0,0,0,3,0,3,0,2,3
mountain_bikes,enduro_full_suspension_bikes,81,81,81,81,81,81,81,56,0,0,0,0,56,19,81,43,40,81,56,41,10,34,0,56,55,23,24,56,9,54,10,55,24,0,81,45,0,56,40,56,56,42,0,56,81,44,23,0,0,56,55,0,0,81,57,56,81,81,56,55,81,80,28,3,0,0,0,81,0,53,13,66,56
mountain_bikes,hardtail_bikes,25,25,25,25,25,25,25,16,0,0,0,0,16,5,8,9,12,25,16,9,2,6,0,16,16,8,7,16,2,16,3,16,8,0,0,17,0,17,16,16,16,12,4,16,25,16,8,3,0,16,16,0,0,24,17,17,0,3,16,16,22,25,0,4,0,0,1,25,0,16,2,23,16
mountain_bikes,trail_full_suspension_bikes,146,146,146,146,146,146,146,117,0,0,0,0,117,38,126,58,92,146,117,70,4,77,0,117,120,27,30,118,11,112,10,117,27,0,146,105,0,118,88,117,117,62,0,116,145,106,27,0,0,117,117,0,0,146,116,117,146,146,117,113,138,145,11,8,0,0,1,146,0,110,13,130,117
mountain_bikes,xc_full_suspension_bikes,37,37,37,37,37,37,37,28,0,0,0,0,28,8,15,11,22,37,28,15,0,17,0,29,28,8,6,28,4,25,6,28,8,0,32,23,0,28,20,27,28,15,0,28,37,22,9,2,0,28,28,0,0,37,28,27,33,35,28,26,34,37,0,4,0,0,3,37,0,26,0,30,28
road_bikes,complete_road_bikes,54,54,54,54,54,54,54,53,20,0,0,19,54,2,0,42,26,54,54,28,0,28,53,54,52,0,0,53,0,27,0,53,0,0,0,45,2,48,4,54,49,8,0,54,49,45,0,0,0,0,53,0,0,54,0,51,0,0,54,50,53,53,0,0,0,0,0,54,0,53,0,40,53
