In [114]:
import pandas as pd

In [150]:
df_sets_brickset = pd.read_csv('sets-brickset.csv')

# change number to bricklink format
df_sets_brickset['Number'] = [
    f'{number}-{variant}' for number, variant 
    in zip(df_sets_brickset['Number'], df_sets_brickset['Variant'])
]

# change column names, filter columns
df_sets_brickset = df_sets_brickset.rename(
    columns={col: col.lower() for col in list(df_sets_brickset)}
)[['number', 'theme', 'year', 'name', 'minifigs', 'pieces', 'usprice']]

# filter out sets with fewer than 25 pieces
df_sets_brickset = df_sets_brickset[df_sets_brickset['pieces'] > 25]

# filter out sets with NaN prices
df_sets_brickset = df_sets_brickset.dropna(subset=['usprice'])

# make NaN minifigs 0.0
df_sets_brickset = df_sets_brickset.fillna(value={'minifigs': 0.0})

# set types
df_sets_brickset = df_sets_brickset.astype({
    'number': 'str',
    'year': 'str'
})

df_sets_brickset

Unnamed: 0,number,theme,year,name,minifigs,pieces,usprice
0,497-1,Space,1979,Galaxy Explorer,4.0,338.0,32.00
1,1072-1,Dacta,1979,Supplementary LEGO Set,0.0,304.0,16.30
6,341-2,Fabuland,1979,Catherine Cat's House and Mortimer Mouse,2.0,123.0,3.00
91,8858-2,Technic,1980,Auto Engines,0.0,460.0,69.00
92,744-1,Basic,1980,"Universal Building Set with Motor, 7+",0.0,537.0,59.00
...,...,...,...,...,...,...,...
14386,41387-1,Friends,2019,Olivia's Summer Heart Box,2.0,93.0,7.99
14387,41388-1,Friends,2019,Mia's Summer Heart Box,1.0,85.0,7.99
14414,853906-1,Seasonal,2019,LEGO Greeting Card,0.0,40.0,4.99
14430,30362-1,City,2019,Sky Police Jetpack,2.0,33.0,3.99


In [151]:
df_sets_bricklink = pd.read_csv('sets-bricklink.tsv', sep='\t')

# rename and filter columns
df_sets_bricklink = df_sets_bricklink.rename(
    columns={
        'Number': 'number',
        'Weight (in Grams)': 'set_weight',
    }
)[['number', 'set_weight']]

# filter out sets without weight data
df_sets_bricklink = df_sets_bricklink[df_sets_bricklink['set_weight'] != '?']

# set types
df_sets_bricklink = df_sets_bricklink.astype({
    'number': 'str',
    'set_weight': 'float'
})

df_sets_bricklink

Unnamed: 0,number,set_weight
8,041-2,569.0
11,101-1,74.0
15,1029-1,142.0
17,1030-1,1189.0
18,1031-1,277.0
...,...,...
15568,75964-21,13.0
15569,75964-22,7.0
15570,75964-23,12.0
15571,75964-24,5.0


In [152]:
df_boxes = pd.read_csv('boxes-bricklink.tsv', sep='\t')

# rename and filter columns
df_boxes = df_boxes.rename(
    columns={
        'Number': 'number',
        'Weight (in Grams)': 'box_weight'
    }
)[['number', 'box_weight']]

# filter out sets without weight data
df_boxes = df_boxes[df_boxes['box_weight'] != '?']

# set types
df_boxes = df_boxes.astype({
    'number': 'str',
    'box_weight': 'float'
})

df_boxes

Unnamed: 0,number,box_weight
3,367-1,134.00
5,2164-1,0.45
8,217-2,41.00
9,603-2,3.00
11,293-1,30.00
...,...,...
15420,911951-1,1.50
15422,BIL01-1,14.00
15441,11920-1,2.00
15442,111903-2,1.50


In [153]:
df_instructions = pd.read_csv('instructions-bricklink.tsv', sep='\t')

# filter out instructions without weight
df_instructions = df_instructions[df_instructions['Weight (in Grams)'] != '?']

# rename and filter columns
df_instructions = df_instructions.rename(
    columns={
        'Number': 'number',
        'Weight (in Grams)': 'instruction_weight'
    }
)[['number', 'instruction_weight']]

# set types
df_instructions = df_instructions.astype({
    'number': 'str',
    'instruction_weight': 'float'
})

df_instructions

Unnamed: 0,number,instruction_weight
0,8470-1,102.00
2,691-1,3.40
3,367-1,20.00
8,293-1,3.40
9,645-2,3.40
...,...,...
9231,col19-14,2.38
9232,col19-15,2.38
9233,col19-16,2.38
9235,GA11NoDk-99,9.00


In [155]:
# make a dictionary to compute inflation-adjusted prices

df_inflation = pd.read_csv('inflation.csv')
dollar_2019 = df_inflation['amount'].tolist()[-1]
dict_inflation = {
    str(year): dollar_2019 / amount for year, amount in zip(df_inflation['year'], df_inflation['amount'])
}

dict_inflation

{'1979': 3.5338567493112953,
 '1980': 3.113567961165048,
 '1981': 2.8224202420242026,
 '1982': 2.658632124352332,
 '1983': 2.5758835341365462,
 '1984': 2.469278152069298,
 '1985': 2.384368029739777,
 '1986': 2.340857664233577,
 '1987': 2.2584330985915497,
 '1988': 2.1687066779374473,
 '1989': 2.069016129032258,
 '1990': 1.9629533282325937,
 '1991': 1.8836857562408222,
 '1992': 1.8286386315039205,
 '1993': 1.7754878892733565,
 '1994': 1.731160593792173,
 '1995': 1.683451443569554,
 '1996': 1.63516889738687,
 '1997': 1.598492211838006,
 '1998': 1.5739754601226996,
 '1999': 1.539963985594238,
 '2000': 1.4898838559814174,
 '2001': 1.4486617730095992,
 '2002': 1.4261145080600333,
 '2003': 1.3943369565217392,
 '2004': 1.3581683430386449,
 '2005': 1.3136610343061954,
 '2006': 1.272609126984127,
 '2007': 1.2373662837244745,
 '2008': 1.1916136793263448,
 '2009': 1.195868311759743,
 '2010': 1.176569321642147,
 '2011': 1.1405669981639468,
 '2012': 1.1174420934344977,
 '2013': 1.1013105422889202,


In [163]:
# join it all together!
df_merged = (
    df_sets_brickset
        .merge(df_sets_bricklink, how='left', on='number')
        .merge(df_boxes, how='left', on='number')
        .merge(df_instructions, how='left', on='number')
)

# add inflation-adjusted price
df_merged['adjusted_price'] = [
    price * dict_inflation[year] for price, year in zip(df_merged['usprice'], df_merged['year'])
]

df_merged

Unnamed: 0,number,theme,year,name,minifigs,pieces,usprice,set_weight,box_weight,instruction_weight,adjusted_price
0,497-1,Space,1979,Galaxy Explorer,4.0,338.0,32.00,,,46.3,113.083416
1,1072-1,Dacta,1979,Supplementary LEGO Set,0.0,304.0,16.30,,,,57.601865
2,341-2,Fabuland,1979,Catherine Cat's House and Mortimer Mouse,2.0,123.0,3.00,,,35.0,10.601570
3,8858-2,Technic,1980,Auto Engines,0.0,460.0,69.00,513.0,133.00,100.0,214.836189
4,744-1,Basic,1980,"Universal Building Set with Motor, 7+",0.0,537.0,59.00,1338.0,316.00,90.0,183.700510
...,...,...,...,...,...,...,...,...,...,...,...
5978,41387-1,Friends,2019,Olivia's Summer Heart Box,2.0,93.0,7.99,132.0,,,7.990000
5979,41388-1,Friends,2019,Mia's Summer Heart Box,1.0,85.0,7.99,128.0,,,7.990000
5980,853906-1,Seasonal,2019,LEGO Greeting Card,0.0,40.0,4.99,,,,4.990000
5981,30362-1,City,2019,Sky Police Jetpack,2.0,33.0,3.99,24.4,2.00,3.7,3.990000


In [175]:
# take a peek at what we're working with

with pd.option_context('display.max_rows', None):
    display(df_merged[df_merged['year'] == '1991'])

Unnamed: 0,number,theme,year,name,minifigs,pieces,usprice,set_weight,box_weight,instruction_weight,adjusted_price
221,4558-1,Trains,1991,Metroliner,11.0,784.0,149.0,2800.0,,127.0,280.669178
222,4563-1,Trains,1991,Load 'N Haul Railroad,3.0,476.0,120.0,2178.0,,82.1,226.042291
223,6988-1,Space,1991,Alpha Centauri Outpost,5.0,406.0,80.0,1411.0,435.0,62.0,150.69486
224,4554-1,Trains,1991,Metro Station,8.0,600.0,72.0,1571.0,445.0,60.3,135.625374
225,6273-1,Pirates,1991,Rock Island Refuge,7.0,381.0,66.0,1249.0,457.0,50.0,124.32326
226,6541-1,Town,1991,Intercoastal Seaport,5.0,545.0,63.75,1362.0,,42.0,120.084967
227,4031-1,Boats,1991,Firefighter,4.0,361.0,52.0,1081.0,,39.8,97.951659
228,5550-1,Model Team,1991,Custom Rally Van,0.0,525.0,49.5,919.0,349.0,52.0,93.242445
229,6540-1,Town,1991,Pier Police,4.0,352.0,44.0,910.0,,42.0,82.882173
230,6347-1,Town,1991,Monorail Accessory Track,0.0,54.0,35.0,927.0,299.0,20.0,65.929001


In [245]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [525]:
with pd.option_context('display.max_rows', None):
    display(df_merged[df_merged['year'] == '2019'])

Unnamed: 0,number,theme,year,name,minifigs,pieces,usprice,set_weight,box_weight,instruction_weight,adjusted_price
5686,75252-1,Star Wars,2019,Imperial Star Destroyer,0.0,4784.0,699.99,,,,699.99
5687,71044-1,Disney,2019,Disney Train and Station,0.0,2925.0,329.99,4243.0,,,329.99
5688,70840-1,The Lego Movie 2: The Second Part,2019,Welcome to Apocalypseburg!,13.0,3178.0,299.99,5329.0,633.0,867.0,299.99
5689,42099-1,Technic,2019,4x4 X-Treme Off-Roader,0.0,958.0,249.99,2156.0,,,249.99
5690,75936-1,Jurassic World,2019,Jurassic Park: T. rex Rampage,0.0,3120.0,249.99,4906.0,557.0,789.0,249.99
5691,10264-1,Creator Expert,2019,Corner Garage,6.0,2569.0,199.99,3035.0,466.0,406.0,199.99
5692,21318-1,Ideas,2019,Tree House,4.0,3036.0,199.99,3870.0,,,199.99
5693,75244-1,Star Wars,2019,Tantive IV,6.0,1768.0,199.99,2520.0,,,199.99
5694,75253-1,Star Wars,2019,Droid Commander,0.0,1177.0,199.99,1669.0,,,199.99
5695,75810-1,Stranger Things,2019,The Upside Down,8.0,2287.0,199.99,3323.0,732.0,544.0,199.99


In [714]:
def process_set_data(
    df_sets, features, target, start_year='1979', end_year='2020',
    excluded_numbers=[], excluded_themes=[]
):

    df = df_sets.copy()

    df = df[
        (df['year'] <= end_year) &
        (df['year'] >= start_year) &
        (~df['number'].isin(excluded_numbers)) &
        (~df['theme'].isin(excluded_themes))
    ]

    # remove sets with missing column data
    df = df.dropna(subset=features)

    # deal with theme if included in model
    if 'theme' in features:
        # remove themes with fewer than 2 sets
        # remove duplo/quatro
        # some education sets are duplo
        theme_counts = df.groupby('theme').count()
        small_themes = list(theme_counts[theme_counts['number'] < 2].index)
        df = df[~df['theme'].isin(small_themes)]

        # one-hot encode theme
        col_theme = df['theme']
        df = pd.get_dummies(df, columns=['theme'])
        df['theme'] = col_theme
        
        # include one-hot columns in model
        model_features = [f for f in features if f != 'theme'] + [
            f for f in list(df) if 'theme_' in f 
        ]
    else:
        model_features = features
    
    return df, model_features



def train_test_split_(df_sets, target):
    df = df_sets.copy()
    
    if 'theme' in list(df_sets):
        stratify = df['theme']
    else:
        stratify = None
    
    df_X = df[[f for f in list(df) if f != target]]
    df_y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        df_X, df_y, train_size=0.8, stratify=None
    )
    
    return X_train, X_test, y_train, y_test



# only use sets since strategic re-alignment (2005)
# excluding sets because their minifig counts are wrong
target = 'adjusted_price'

df, features = process_set_data(
    df_sets=df_merged[df_merged['adjusted_price'] != 0.0],
    features=['theme', 'minifigs', 'pieces', 'set_weight'],
    target=target,
    start_year='2005',
    excluded_numbers=[
        '75252-1',
        '71044-1',
        '75936-1',
        '70425-1',
        '10267-1',
        '76122-1',
        '21155-1',
    ],
    excluded_themes=[
        'Clickits'
        'Dimensions',
        'Duplo',
        'Education',
        'Quatro',
    ]
)

X_train, X_test, y_train, y_test = train_test_split_(df, target)

In [715]:
# get piece weight
df_adj = df_merged.dropna(subset=['set_weight', 'box_weight', 'instruction_weight'])
df_adj['piece_weight'] = df_adj['set_weight'] - (df_adj['box_weight'] + df_adj['instruction_weight'])

target = 'adjusted_price'

# only use sets since strategic re-alignment (2005)
# excluding sets because their minifig counts are wrong
df, features = process_set_data(
    df_sets=df_adj[df_adj['adjusted_price'] != 0.0],
    features=['theme', 'minifigs', 'pieces', 'piece_weight'],
    target=target,
    start_year='2005',
    excluded_numbers=[
        '75252-1',
        '71044-1',
        '75936-1',
        '70425-1',
        '10267-1',
        '76122-1',
        '21155-1',
    ],
    excluded_themes=[
        'Clickits'
        'Dimensions',
        'Duplo',
        'Education',
        'Quatro',
    ]
)

X_train, X_test, y_train, y_test = train_test_split_(df, target)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [716]:
# train model
model = GradientBoostingRegressor(
    n_estimators=500,
    loss='huber',
)
model.fit(X_train[features], y_train)
y_pred = model.predict(X_test[features])

print(f'R-squared train {model.score(X_train[features], y_train)}')
print(f'R-squared test {model.score(X_test[features], y_test)}')

imps = model.feature_importances_

print('\nFeature importances:')
display(sorted(list(zip(features, imps.tolist())), key=lambda elem: elem[1], reverse=True))

R-squared train 0.9864853529010742
R-squared test 0.9287742752758972

Feature importances:


[('piece_weight', 0.6193385291808694),
 ('pieces', 0.3475148754561666),
 ('minifigs', 0.010914089365081594),
 ('theme_Make and Create', 0.003969718454029558),
 ('theme_Dimensions', 0.0028449808394272295),
 ('theme_Star Wars', 0.002421788467597898),
 ('theme_Classic', 0.0015477616771950456),
 ('theme_Architecture', 0.0014392180276208014),
 ('theme_Bricks and More', 0.0013519336030154779),
 ('theme_Creator', 0.0012311178859238706),
 ('theme_Mindstorms', 0.0009233923936513737),
 ('theme_Technic', 0.0008593658613612492),
 ('theme_Racers', 0.0006930267212656627),
 ('theme_City', 0.0006166304851604291),
 ('theme_DC Comics Super Heroes', 0.00043385869526782776),
 ('theme_Disney', 0.0002702180401230855),
 ('theme_Avatar The Last Airbender', 0.0002675039573261629),
 ('theme_Exo-Force', 0.00022586872174453214),
 ('theme_Ideas', 0.00021504794222001257),
 ('theme_Sports', 0.00020106848246801432),
 ('theme_Master Builder Academy', 0.00019108675534246556),
 ('theme_Seasonal', 0.0001674092446474489),

In [712]:
df_test = X_test.copy()

# re-adjust price for inflation
df_test['prediction'] = [
    round(float(price) / dict_inflation[year], 2)
    for price, year in zip(y_pred, df_test['year'])
]
df_test['price'] = [
    float(price) / dict_inflation[year]
    for price, year in zip(y_test, df_test['year'])
]
df_test['raw_deviation'] = df_test['prediction'] - df_test['price']

print(df_test['raw_deviation'].abs().median())

with pd.option_context('display.max_rows', None):
    display(df_test[['name', 'theme', 'price', 'prediction', 'raw_deviation', 'year', 'minifigs', 'pieces', 'piece_weight']])

2.16


Unnamed: 0,name,theme,price,prediction,raw_deviation,year,minifigs,pieces,piece_weight
3627,Equila's Ultra Striker,Legends of Chima,39.99,39.06,-0.93,2013,3.0,339.0,400.0
2122,Mistlands Tower,Castle,50.0,51.86,1.86,2006,6.0,431.0,712.0
3556,Town Square,City,119.99,108.68,-11.31,2013,9.0,914.0,1269.0
5299,Battle Suit Macy,Nexo Knights,9.99,9.69,-0.3,2017,1.0,66.0,54.0
5057,Adventure Time,Ideas,49.99,40.53,-9.46,2017,0.0,495.0,220.0
3181,Jetbug,HERO Factory,12.99,12.63,-0.36,2011,0.0,63.0,128.4
3494,Finn McMissile,Cars,6.99,7.75,0.76,2012,0.0,52.0,38.9
2837,Off-Road Power,Creator,89.99,75.14,-14.85,2010,0.0,1061.0,954.0
4420,Blaster Bike,Ninjago,19.99,21.17,1.18,2015,2.0,212.0,154.9
2576,Police Helicopter,City,9.99,11.71,1.72,2008,1.0,94.0,100.0


In [713]:
# todo: add a function to guess a set price given some features

set_features={
    'minifigs': 2.0,
    'piece_weight': 6100.0,
    'pieces': 4784.0,
    'theme': 'Star Wars'
}

def estimate_price(model, model_features, set_features):
    if 'theme' in set_features:
        set_features['theme_' + set_features['theme']] = 1
    features = {f: [set_features[f]] if f in set_features else [0] for f in model_features}
    df = pd.DataFrame(features)
    return model.predict(df[model_features])

estimate_price(model, features, set_features)

array([455.49540459])

In [669]:
df_merged[df_merged['number'] == '75192-1']

Unnamed: 0,number,theme,year,name,minifigs,pieces,usprice,set_weight,box_weight,instruction_weight,adjusted_price
4966,75192-1,Star Wars,2017,Millennium Falcon,8.0,7541.0,799.99,13150.0,2950.0,2900.0,837.319821


appendix: merge brickset year csvs (not necessary to run analysis)

In [37]:
import os

brickset_path = 'sets-brickset'
list_dfs_brickset = []

for filename in os.listdir(brickset_path):
    list_dfs_brickset.append(
        pd.read_csv(os.path.join(brickset_path, filename))
    )

df_sets_brickset = pd.concat(list_dfs_brickset, ignore_index=True)

In [32]:
df_sets_brickset.to_csv('sets-brickset.csv', index=False)