In [76]:
import time
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [None]:
import gensim
from gensim import corpora, models, similarities

sofa_df = pd.read_json('wayfair/sofa.json')
sofa_bed_df = pd.read_json('wayfair/sofa_bed.json')
futon_df = pd.read_json('wayfair/futon.json')
loveseat_df = pd.read_json('wayfair/loveseat.json')
coffee_table_df = pd.read_json('wayfair/coffee_table.json')
desk_df = pd.read_json('wayfair/desk.json')
office_chair_df = pd.read_json('wayfair/office_chair.json')
bookcase_df = pd.read_json('wayfair/bookcase.json')
dining_table_df = pd.read_json('wayfair/dining_table.json')
dining_chair_df = pd.read_json('wayfair/dining_chair.json')
bed_df = pd.read_json('wayfair/bed.json')
nightstand_df = pd.read_json('wayfair/nightstand.json')
dresser_df = pd.read_json('wayfair/dresser.json')

# clean text data
* furniture_id: strip & str()
* price: string with $ to float
* features from list to string
* add description & features to one string


In [67]:
sofa_df = pd.read_json('wayfair/sofa.json')

In [68]:
sofa_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 480 entries, 0.0 to 99.0
Data columns (total 15 columns):
category                480 non-null object
colors                  480 non-null object
description             359 non-null object
features                480 non-null object
image_links_all         480 non-null object
image_links_by_color    480 non-null object
manufacturer            480 non-null object
price                   480 non-null object
product_id              480 non-null object
rating_avg              420 non-null float64
rating_count            420 non-null float64
specs                   470 non-null object
title                   480 non-null object
url                     480 non-null object
website                 480 non-null object
dtypes: float64(2), object(13)
memory usage: 60.0+ KB


In [69]:
sofa_df.head().T

Unnamed: 0,0.0,1.0,10.0,100.0,101.0
category,sofa,sofa,sofa,sofa,sofa
colors,"[Cobblestone, Café, Mocha, Sage, Salsa, Stone]","[Blue Linen / Chevron, Blue Linen / Greek Key,...","[Purple, Red, Brown, Dark Grey, Grey]","[Ash, Graphite, Natural]","[Greenwich Ivory, Greenwich Pecan, Greenwich L..."
description,Add a touch of comfort and style to your home ...,Sleeping on the sofa has never been so chic. S...,This luxurious sofa merging with style and ele...,tion’s mid-century modern design is masterfull...,"Casual and comfy, the Smith Loveseat offers an..."
features,[\nFeatures\n\nHarvest collection\nMaterial: 1...,[\nFeatures\n\nMaterial: 100% Polyester Linen\...,[\nFeatures\n\nCarthusia collection\nRubberwoo...,"[\nFeatures\n\nCovered in durable, easy-to-cle...",[\nFeatures\n\nLoose back and seat cushions\nH...
image_links_all,[https://secure.img2.wfrcdn.com/lf/49/hash/266...,[https://secure.img1.wfrcdn.com/lf/49/hash/338...,[https://secure.img2.wfrcdn.com/lf/49/hash/296...,[https://secure.img1.wfrcdn.com/lf/49/hash/384...,[https://secure.img1.wfrcdn.com/lf/49/hash/308...
image_links_by_color,{u'Stone': [u'https://secure.img2.wfrcdn.com/l...,{u'Blue Linen / Chevron': [u'https://secure.im...,{u'Purple': [u'https://secure.img2.wfrcdn.com/...,{u'Graphite': [u'https://secure.img1.wfrcdn.co...,{u'Greenwich Pecan': [u'https://secure.img1.wf...
manufacturer,Signature Design by Ashley,Mercury Row,Kingstown Home,Carolina Accents,Andover Mills
price,$357.99,$529.99,$829.99,$189.99,$289.99
product_id,GNT3224,MCRR1650,KMDS1431,ENT1245,ANDO1550
rating_avg,4.4,4.1,4.5,4.5,4.5


In [70]:
def clean_text_data(df):
    df['product_id'] = df['product_id'].apply(lambda x: x.strip())
    df['price'] = df['price'].apply(lambda x:float(x.strip('$').replace(',','').split()[0]))
    df['features'] = df['features'].apply(lambda x: '\n'.join(x) if type(x)==list else x)
    df['description'] = df['description'].apply(lambda x: '\n'.join(x) if type(x)==list else x)
    df['description_all'] = df['description'] + '\n' + df['features']
    return df

In [72]:
start = time.time()
sofa_df = clean_text_data(sofa_df)
print time.time() - start

0.0136740207672


In [73]:
sofa_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 480 entries, 0.0 to 99.0
Data columns (total 16 columns):
category                480 non-null object
colors                  480 non-null object
description             359 non-null object
features                480 non-null object
image_links_all         480 non-null object
image_links_by_color    480 non-null object
manufacturer            480 non-null object
price                   480 non-null float64
product_id              480 non-null object
rating_avg              420 non-null float64
rating_count            420 non-null float64
specs                   470 non-null object
title                   480 non-null object
url                     480 non-null object
website                 480 non-null object
description_all         359 non-null object
dtypes: float64(3), object(13)
memory usage: 63.8+ KB


In [74]:
sofa_df.head().T

Unnamed: 0,0.0,1.0,10.0,100.0,101.0
category,sofa,sofa,sofa,sofa,sofa
colors,"[Cobblestone, Café, Mocha, Sage, Salsa, Stone]","[Blue Linen / Chevron, Blue Linen / Greek Key,...","[Purple, Red, Brown, Dark Grey, Grey]","[Ash, Graphite, Natural]","[Greenwich Ivory, Greenwich Pecan, Greenwich L..."
description,Add a touch of comfort and style to your home ...,Sleeping on the sofa has never been so chic. S...,This luxurious sofa merging with style and ele...,tion’s mid-century modern design is masterfull...,"Casual and comfy, the Smith Loveseat offers an..."
features,\nFeatures\n\nHarvest collection\nMaterial: 10...,\nFeatures\n\nMaterial: 100% Polyester Linen\n...,\nFeatures\n\nCarthusia collection\nRubberwood...,"\nFeatures\n\nCovered in durable, easy-to-clea...",\nFeatures\n\nLoose back and seat cushions\nHi...
image_links_all,[https://secure.img2.wfrcdn.com/lf/49/hash/266...,[https://secure.img1.wfrcdn.com/lf/49/hash/338...,[https://secure.img2.wfrcdn.com/lf/49/hash/296...,[https://secure.img1.wfrcdn.com/lf/49/hash/384...,[https://secure.img1.wfrcdn.com/lf/49/hash/308...
image_links_by_color,{u'Stone': [u'https://secure.img2.wfrcdn.com/l...,{u'Blue Linen / Chevron': [u'https://secure.im...,{u'Purple': [u'https://secure.img2.wfrcdn.com/...,{u'Graphite': [u'https://secure.img1.wfrcdn.co...,{u'Greenwich Pecan': [u'https://secure.img1.wf...
manufacturer,Signature Design by Ashley,Mercury Row,Kingstown Home,Carolina Accents,Andover Mills
price,357.99,529.99,829.99,189.99,289.99
product_id,GNT3224,MCRR1650,KMDS1431,ENT1245,ANDO1550
rating_avg,4.4,4.1,4.5,4.5,4.5


In [75]:
sofa_df.to_json('wayfair/sofa_clean.json')

# TFIDF: description + features
#### Then cluster

In [88]:
import collections
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [85]:
X = sofa_df['description_all'][sofa_df['description_all'].notnull()]
tfidf = TfidfVectorizer(strip_accents='unicode', stop_words='english', max_features=5000, ngram_range = (1,3))
tfidf_matrix = tfidf.fit_transform(X).todense()

In [86]:
km_model = KMeans(n_clusters=10)
km_model.fit(tfidf_matrix)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [89]:
clustering = collections.defaultdict(list)

for i, label in enumerate(km_model.labels_):
    clustering[label].append(i)

In [100]:
for i in clustering:
    print i, [int(X.index[j]) for j in clustering[i]]

0 [159, 173, 185, 191, 225, 265, 297, 305, 325, 326, 329, 354, 359, 376, 385, 411, 428, 458, 473]
1 [117, 122, 134, 176, 177, 184, 199, 2, 201, 219, 22, 234, 246, 249, 258, 260, 269, 272, 304, 306, 307, 319, 331, 343, 357, 362, 368, 37, 393, 401, 41, 416, 42, 433, 453, 472, 48, 52, 60, 94]
2 [12, 18, 197, 20, 220, 288, 336, 356, 381, 387, 449, 468, 92]
3 [109, 115, 125, 135, 139, 247, 251, 271, 339, 346, 360, 373, 377, 391, 392, 476, 56, 66, 87, 96]
4 [128, 146, 170, 179, 202, 206, 223, 267, 268, 274, 285, 295, 298, 378, 39, 86]
5 [100, 106, 126, 130, 132, 15, 150, 157, 162, 193, 203, 211, 216, 217, 250, 255, 26, 261, 281, 290, 292, 309, 334, 35, 413, 435, 444, 46, 477, 54, 65, 74, 81, 9]
6 [112, 127, 143, 149, 155, 163, 167, 187, 226, 25, 264, 278, 315, 316, 330, 342, 38, 400, 403, 419, 464]
7 [10, 102, 105, 148, 164, 169, 180, 186, 190, 192, 198, 200, 208, 218, 231, 240, 244, 277, 279, 283, 284, 286, 29, 300, 303, 308, 314, 335, 358, 367, 372, 379, 382, 383, 389, 398, 402, 404, 405, 