# Description Analysis

## Setup

In [69]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder

plt.style.use('ggplot')

Adding some useful columns

In [70]:
listings_df = pd.read_csv('scraper/create_dataset/listings.csv')
listings_df['created_at'] = pd.to_datetime(listings_df['created_at'])
listings_df['sold_at'] = pd.to_datetime(listings_df['sold_at'])
listings_df['sold_at_day_of_week'] = listings_df['sold_at'].dt.dayofweek
listings_df['sold_at_hour_of_day'] = listings_df['sold_at'].dt.hour
listings_df['sold_at_date'] = listings_df['sold_at'].dt.date

Adding time-to-sell column (seconds)

In [3]:
delta_to_seconds = lambda x: x.total_seconds()
listings_df['sale_delta'] = listings_df['sold_at'] - listings_df['created_at']
listings_df['sale_delta_seconds'] = listings_df['sale_delta'].apply(delta_to_seconds)
listings_df['sale_delta_days'] = listings_df['sale_delta_seconds']/86400

Added encoded designer_name as a column

In [5]:
designer_encoder = LabelEncoder()
designer_names_encoded = designer_encoder.fit_transform(listings_df['designer_name'])
listings_df['designer_name_encoded'] = designer_names_encoded

Other

In [38]:
designer_counts = listings_df.groupby('designer_name').count()

In [131]:
significant_designers = list(designer_counts[designer_counts['id'] > 25].sort_values(by='id', ascending=False).index)
top_designers = significant_designers[:50]

In [39]:
sold_listings_df = listings_df[listings_df['sold'] == 1]

## Word Counts for Various Designers (if this is promising, let's do word maps!)

In [138]:
designers_joined_desc = listings_df.groupby('designer_name')['description'].apply(sum)
significant_designer_df = designers_joined_desc[significant_designers]

In [140]:
designer_index_map = {designer: index for index, designer in enumerate(significant_designer_df.index)}

In [157]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_features = vectorizer.fit_transform(significant_designer_df)

In [161]:
def print_top_tfidf_words(designer_name, num_words, feature_names):
    print 'DESIGNER: {}'.format(designer_name)
    ordered_word_inidices = tfidf_features[designer_index_map[designer_name]].toarray()[0].argsort()[::-1]
    
    printed = 0
    index = 0
    while printed < num_words:
        word = feature_names[ordered_word_inidices[index]]
        index += 1
        if word in designer_name.lower() or designer_name.lower() in word:
            continue
        print word
        printed += 1
    print
        
feature_names = vectorizer.get_feature_names()
for designer_name in top_designers[:5]:
        print_top_tfidf_words(designer_name, 20, feature_names)

DESIGNER: Supreme
logo
tee
bag
box
camp
sticker
stickers
ds
hat
hoodie
washed
cracking
cap
deadstock
trades
store
priority
online
navy
red

DESIGNER: Nike
box
flyknit
air
shoes
og
shoe
max
jordan
pair
force
laces
deadstock
boxed
roshe
racer
colorway
11
ds
tech
sole

DESIGNER: Bape
bathing
japan
camo
baggage
tokyo
bag
trips
shark
carry
book
essentials
release
activity
tracking
upgrades
flair
worldwide
packaging
mail
bonus

DESIGNER: Rick Owens
drkshdw
box
nbsp
font
ramones
geobaskets
span
italy
mainline
ro
milk
strong
dust
sole
shoulder
48
geobasket
silk
vicious
laces

DESIGNER: Jordan Brand
box
air
retro
og
creasing
shoes
shoe
nike
bred
receipt
11
release
deadstock
ds
cement
boxed
pair
replacement
yellowing
toe

