In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [None]:
import gensim
from gensim import corpora, models, similarities

sofa_df = pd.read_json('wayfair/sofa.json')
sofa_bed_df = pd.read_json('wayfair/sofa_bed.json')
futon_df = pd.read_json('wayfair/futon.json')
loveseat_df = pd.read_json('wayfair/loveseat.json')
coffee_table_df = pd.read_json('wayfair/coffee_table.json')
desk_df = pd.read_json('wayfair/desk.json')
office_chair_df = pd.read_json('wayfair/office_chair.json')
bookcase_df = pd.read_json('wayfair/bookcase.json')
dining_table_df = pd.read_json('wayfair/dining_table.json')
dining_chair_df = pd.read_json('wayfair/dining_chair.json')
bed_df = pd.read_json('wayfair/bed.json')
nightstand_df = pd.read_json('wayfair/nightstand.json')
dresser_df = pd.read_json('wayfair/dresser.json')

# clean text data
* furniture_id: strip & str()
* price: string with $ to float
* features from list to string
* add description & features to one string


In [11]:
sofa_df = pd.read_json('wayfair/sofa.json')

In [12]:
sofa_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 480 entries, 0.0 to 99.0
Data columns (total 15 columns):
category                480 non-null object
colors                  480 non-null object
description             359 non-null object
features                480 non-null object
image_links_all         480 non-null object
image_links_by_color    480 non-null object
manufacturer            480 non-null object
price                   480 non-null object
product_id              480 non-null object
rating_avg              420 non-null float64
rating_count            420 non-null float64
specs                   470 non-null object
title                   480 non-null object
url                     480 non-null object
website                 480 non-null object
dtypes: float64(2), object(13)
memory usage: 60.0+ KB


In [13]:
sofa_df.head().T

Unnamed: 0,0.0,1.0,10.0,100.0,101.0
category,sofa,sofa,sofa,sofa,sofa
colors,"[Cobblestone, Café, Mocha, Sage, Salsa, Stone]","[Blue Linen / Chevron, Blue Linen / Greek Key,...","[Purple, Red, Brown, Dark Grey, Grey]","[Ash, Graphite, Natural]","[Greenwich Ivory, Greenwich Pecan, Greenwich L..."
description,Add a touch of comfort and style to your home ...,Sleeping on the sofa has never been so chic. S...,This luxurious sofa merging with style and ele...,tion’s mid-century modern design is masterfull...,"Casual and comfy, the Smith Loveseat offers an..."
features,[\nFeatures\n\nHarvest collection\nMaterial: 1...,[\nFeatures\n\nMaterial: 100% Polyester Linen\...,[\nFeatures\n\nCarthusia collection\nRubberwoo...,"[\nFeatures\n\nCovered in durable, easy-to-cle...",[\nFeatures\n\nLoose back and seat cushions\nH...
image_links_all,[https://secure.img2.wfrcdn.com/lf/49/hash/266...,[https://secure.img1.wfrcdn.com/lf/49/hash/338...,[https://secure.img2.wfrcdn.com/lf/49/hash/296...,[https://secure.img1.wfrcdn.com/lf/49/hash/384...,[https://secure.img1.wfrcdn.com/lf/49/hash/308...
image_links_by_color,{u'Stone': [u'https://secure.img2.wfrcdn.com/l...,{u'Blue Linen / Chevron': [u'https://secure.im...,{u'Purple': [u'https://secure.img2.wfrcdn.com/...,{u'Graphite': [u'https://secure.img1.wfrcdn.co...,{u'Greenwich Pecan': [u'https://secure.img1.wf...
manufacturer,Signature Design by Ashley,Mercury Row,Kingstown Home,Carolina Accents,Andover Mills
price,$357.99,$529.99,$829.99,$189.99,$289.99
product_id,GNT3224,MCRR1650,KMDS1431,ENT1245,ANDO1550
rating_avg,4.4,4.1,4.5,4.5,4.5


In [14]:
def clean_text_data(df):
    df['product_id'] = df['product_id'].apply(lambda x: x.strip())
    df['price'] = df['price'].apply(lambda x:float(x.strip('$').replace(',','').split()[0]))
    df['features'] = df['features'].apply(lambda x: '\n'.join(x) if type(x)==list else x)
    df['description'] = df['description'].apply(lambda x: '\n'.join(x) if type(x)==list else x)
    df['description'][df['description'].isnull()] = ''
    df['description_all'] = df['description'] + '\n' + df['features']
    return df 

In [15]:
start = time.time()
sofa_df = clean_text_data(sofa_df)
print time.time() - start

0.0638449192047


A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
sofa_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 480 entries, 0.0 to 99.0
Data columns (total 16 columns):
category                480 non-null object
colors                  480 non-null object
description             480 non-null object
features                480 non-null object
image_links_all         480 non-null object
image_links_by_color    480 non-null object
manufacturer            480 non-null object
price                   480 non-null float64
product_id              480 non-null object
rating_avg              420 non-null float64
rating_count            420 non-null float64
specs                   470 non-null object
title                   480 non-null object
url                     480 non-null object
website                 480 non-null object
description_all         480 non-null object
dtypes: float64(3), object(13)
memory usage: 63.8+ KB


In [74]:
sofa_df.head().T

Unnamed: 0,0.0,1.0,10.0,100.0,101.0
category,sofa,sofa,sofa,sofa,sofa
colors,"[Cobblestone, Café, Mocha, Sage, Salsa, Stone]","[Blue Linen / Chevron, Blue Linen / Greek Key,...","[Purple, Red, Brown, Dark Grey, Grey]","[Ash, Graphite, Natural]","[Greenwich Ivory, Greenwich Pecan, Greenwich L..."
description,Add a touch of comfort and style to your home ...,Sleeping on the sofa has never been so chic. S...,This luxurious sofa merging with style and ele...,tion’s mid-century modern design is masterfull...,"Casual and comfy, the Smith Loveseat offers an..."
features,\nFeatures\n\nHarvest collection\nMaterial: 10...,\nFeatures\n\nMaterial: 100% Polyester Linen\n...,\nFeatures\n\nCarthusia collection\nRubberwood...,"\nFeatures\n\nCovered in durable, easy-to-clea...",\nFeatures\n\nLoose back and seat cushions\nHi...
image_links_all,[https://secure.img2.wfrcdn.com/lf/49/hash/266...,[https://secure.img1.wfrcdn.com/lf/49/hash/338...,[https://secure.img2.wfrcdn.com/lf/49/hash/296...,[https://secure.img1.wfrcdn.com/lf/49/hash/384...,[https://secure.img1.wfrcdn.com/lf/49/hash/308...
image_links_by_color,{u'Stone': [u'https://secure.img2.wfrcdn.com/l...,{u'Blue Linen / Chevron': [u'https://secure.im...,{u'Purple': [u'https://secure.img2.wfrcdn.com/...,{u'Graphite': [u'https://secure.img1.wfrcdn.co...,{u'Greenwich Pecan': [u'https://secure.img1.wf...
manufacturer,Signature Design by Ashley,Mercury Row,Kingstown Home,Carolina Accents,Andover Mills
price,357.99,529.99,829.99,189.99,289.99
product_id,GNT3224,MCRR1650,KMDS1431,ENT1245,ANDO1550
rating_avg,4.4,4.1,4.5,4.5,4.5


In [18]:
sofa_df.to_json('wayfair/sofa_clean.json')

# TFIDF: description + features
#### Then cluster

In [19]:
import collections
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [20]:
X = sofa_df['description_all'][sofa_df['description_all'].notnull()]
tfidf = TfidfVectorizer(strip_accents='unicode', stop_words='english', max_features=5000, ngram_range = (1,3))
tfidf_matrix = tfidf.fit_transform(X).todense()

In [24]:
X.shape

(480,)

In [21]:
km_model = KMeans(n_clusters=10)
km_model.fit(tfidf_matrix)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [22]:
clustering = collections.defaultdict(list)

for i, label in enumerate(km_model.labels_):
    clustering[label].append(i)

In [23]:
for i in clustering:
    print i, [int(X.index[j]) for j in clustering[i]]

0 [106, 150, 153, 154, 182, 205, 21, 236, 238, 242, 257, 26, 282, 31, 341, 35, 351, 410, 470, 67, 73, 9, 97]
1 [0, 112, 127, 13, 131, 143, 149, 155, 163, 167, 174, 175, 187, 226, 237, 25, 256, 264, 278, 287, 313, 315, 316, 330, 342, 366, 375, 38, 400, 403, 419, 437, 447, 457, 464, 5, 50, 71]
2 [101, 113, 118, 120, 123, 141, 148, 156, 16, 161, 166, 17, 171, 19, 194, 198, 210, 214, 23, 230, 235, 239, 24, 27, 275, 276, 299, 3, 303, 32, 322, 333, 34, 36, 455, 459, 63, 70, 75, 79, 80, 88]
3 [180, 190, 208, 231, 28, 29, 30, 55, 68]
4 [100, 126, 130, 132, 15, 157, 162, 193, 203, 211, 216, 217, 250, 254, 255, 261, 281, 290, 292, 309, 317, 318, 323, 324, 334, 348, 396, 399, 413, 423, 435, 436, 444, 46, 477, 54, 65, 74, 81]
5 [117, 122, 124, 134, 144, 160, 176, 177, 184, 2, 201, 219, 22, 234, 246, 249, 258, 260, 272, 296, 304, 306, 319, 331, 343, 358, 362, 363, 368, 37, 393, 401, 41, 415, 416, 42, 433, 453, 48, 60, 94]
6 [1, 10, 102, 105, 108, 111, 116, 121, 129, 133, 136, 137, 138, 140, 142, 14