## <span style='color:navy'> Prepping </span>

In [329]:
import pandas as pd

In [330]:
data = pd.read_csv('../data/perfumes_df_ready.csv')

In [331]:
data.shape

(4478, 105)

In [332]:
pd.set_option('max_columns', 110)

In [333]:
def ready_df(df):
    """
    Specific to the 'perfumes_df_ready.csv'. 
    Cleans up the 'gender' variable.
    Adds dummies of 'designer', 'group', 'gender' to the dataframe, and drops original ones.
    Adds 'perfume_id' instead of 'perfume_name' to make all numeric dataframe
    """
    # cleaning up the gender column in data
    gender = []
    for k in df['perfume_name']:
        gender.append(k.split('for')[1].lower().strip())


    for n, k in enumerate(gender):
        if ('her' in k[:4]) or ('women' in k[:]) and ('women and men' not in k):
            gender[n]='women'
        elif ('him' in k[:4]) or ('men' in k[:4]):
            gender[n] = 'men'
        elif ('women and men' in k):
            gender[n] = 'women and men'
        else:
            gender[n] = 'unknown'
            
    
    # replace the old with the new
    df['gender'] = gender
    
    # get dummies 
    dummies = pd.get_dummies(df[['designer', 'group', 'gender']])
    
    # add them to the dataframe
    new_df = pd.concat([df, dummies], axis = 1, sort = False)
    
    # add a quick perfume_id instead of perfume_name
    new_df['perfume_id'] = df.index 
    new_df.drop(['all_notes','top_notes', 'middle_notes','base_notes', 'synopsis', 'main_accords', 'perfume_name',
                'designer', 'group', 'gender'], axis = 1, inplace = True)
    
    return new_df

In [334]:
# making sure we don't have any nulls in the main data frame
data.isnull().sum().to_frame().T

Unnamed: 0,perfume_name,designer,group,all_notes,top_notes,middle_notes,base_notes,synopsis,overall_rating,total_num_voters,gender,longvity_poor,longvity_weak,longvity_moderate,longvity_long_lasting,longvity_very_long_lasting,sillage_soft,sillage_moderate,sillage_heavy,sillage_enormous,have_it,had_it,want_it,my_signature,love_it,like_it,dislike_it,spring,summer,fall,day,night,main_accords,salty,smoky,Unnamed: 35,coconut,white floral,fresh spicy,white wine,vanilla,warm spicy,whiskey,ginger,conifer,caramel,woody,vinyl,tropical,musk,fruity,coca cola,powdery,floral,aromatic,milky,yellow floral,nutty,cacao,aquatic,camphor,red fruits,sweet,oud,almond,animalic,gourmand,rum,sand,citrus,beeswax,musky,watery,soft spicy,vodka,tobacco,fresh,tonka coumarin,savory,violet,aldehydic,rose,terpenic,cinnamon,tuberose,sour,cherry,lactonic,herbal,narcotic,ozonic,patchouli,leather,woodsy notes,not found,honey,soapy,metallic,green,bitter,balsamic,marine,earthy,amber,coffee
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [342]:
data = ready_df(data)

In [343]:
# make test and train dataframes:
test_df = data.loc[data['overall_rating'] == -1, :]
test_df.drop(['overall_rating'], axis = 1, inplace = True)

df = data.loc[data['overall_rating']!=-1, :]
# don't mind the warning, it applied anyway

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [344]:
print(test_df.shape, df.shape)

(34, 178) (4444, 179)


***
## <span style='color:navy'> Clustering </span>

In [345]:
from sklearn.model_selection import train_test_split

X = df.drop(['overall_rating'], axis = 1)
y = df['overall_rating']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 62019)

In [346]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [347]:
ss = StandardScaler()

In [348]:
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [349]:
print(X_train.shape, X_test.shape)

(3555, 178) (889, 178)


In [403]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 2000)

In [351]:
kmeans.fit_transform(X_train)

array([[13.26364597, 11.32105223, 12.90463025, ..., 13.27460774,
        12.05249811, 16.29990663],
       [16.69909683, 15.82272259, 15.22292111, ..., 13.92330271,
        16.04654378, 19.55591795],
       [14.87013689, 10.9570257 , 14.5132377 , ..., 15.27028863,
        13.90731408, 17.43117182],
       ...,
       [15.19170524, 13.2023881 , 13.18531546, ..., 15.19490241,
        14.87670737, 15.79048035],
       [14.52568849, 12.69070907, 12.45052474, ..., 14.54400647,
        13.96038508, 15.2170328 ],
       [19.11109872, 18.51470753, 18.56937016, ..., 19.00781222,
        18.43268331, 22.01333182]])

In [352]:
labels = kmeans.labels_

In [353]:
from sklearn.metrics import silhouette_score

In [354]:
print("silhouette coefficient: %.3f" % silhouette_score(X_train, labels))

silhouette coefficient: 0.113


<span style='color:navy'> **What if we changed the problem into classification problem instead? Would that better our clustering?** </span>

In [355]:
def classify_target(df):
    
    # make sure format is unified
    ratings_continuous = df['overall_rating'].map(lambda x: round(float(x), 2))
    
    # make target into bins of 0.5 width
    new_y = []
    for y in ratings_continuous:
        if (y > int(y)) & (y <= int(y) + .5):
            new_y.append(int(y) + .5)

        elif (y > int(y) + .5) & (y < int(y) + 1):
            new_y.append(int(y) + 1)

        elif (y == int(y)):
            new_y.append(y)
            
    # add the new ratings to the data frame; and remove old ones
    df['ratings_classes'] = new_y
    df.drop('overall_rating', axis = 1, inplace = True)
    
    return df

In [356]:
# starting over
data = pd.read_csv('../data/perfumes_df_ready.csv')
data = ready_df(data)

In [358]:
# the new data frame
data = classify_target(data)

In [361]:
data.head(2)

Unnamed: 0,total_num_voters,longvity_poor,longvity_weak,longvity_moderate,longvity_long_lasting,longvity_very_long_lasting,sillage_soft,sillage_moderate,sillage_heavy,sillage_enormous,have_it,had_it,want_it,my_signature,love_it,like_it,dislike_it,spring,summer,fall,day,night,salty,smoky,Unnamed: 35,coconut,white floral,fresh spicy,white wine,vanilla,warm spicy,whiskey,ginger,conifer,caramel,woody,vinyl,tropical,musk,fruity,coca cola,powdery,floral,aromatic,milky,yellow floral,nutty,cacao,aquatic,camphor,red fruits,sweet,oud,almond,animalic,...,designer_Lancome,designer_Marc Jacobs,designer_Narciso Rodriguez,designer_Nina Ricci,designer_Paco Rabanne,designer_Prada,designer_Ralph Lauren,designer_Salvador Dali,designer_Serge Lutens,designer_Tom Ford,designer_Valentino,designer_Van Cleef & Arpels,designer_Versace,designer_Viktor&Rolf,designer_Yves Rocher,designer_Yves Saint Laurent,group_Aromatic,group_Aromatic Aquatic,group_Aromatic Fougere,group_Aromatic Fruity,group_Aromatic Green,group_Aromatic Spicy,group_Chypre,group_Chypre Floral,group_Chypre Fruity,group_Citrus,group_Citrus Aromatic,group_Citrus Gourmand,group_Floral,group_Floral Aldehyde,group_Floral Aquatic,group_Floral Fruity,group_Floral Fruity Gourmand,group_Floral Green,group_Floral Woody Musk,group_Leather,group_Oriental,group_Oriental Floral,group_Oriental Fougere,group_Oriental Spicy,group_Oriental Vanilla,group_Oriental Woody,group_Woody,group_Woody Aquatic,group_Woody Aromatic,group_Woody Chypre,group_Woody Floral Musk,group_Woody Spicy,group_not found,gender_men,gender_unknown,gender_women,gender_women and men,perfume_id,ratings_classes
0,410.0,14,25,35,14,12,47,34,20,31,279,40,272,3,100,74,26,3,20,54,5,56,0.0,0.0,0.0,80.0,119.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,78.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,4.5
1,2311.0,33,39,173,402,97,87,285,366,111,2021,638,1252,64,100,70,37,55,41,12,58,41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,4.0


In [362]:
# make training and testing data frames again
test_df = data.loc[data['ratings_classes'] == -1, :]
test_df.drop(['ratings_classes'], axis = 1, inplace = True)

df = data.loc[data['ratings_classes']!=-1, :]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [363]:
# splitting and standarizing again. All models already instantiated above
X = df.drop(['ratings_classes'], axis = 1)
y = df['ratings_classes']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 62019)

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [402]:
# running the clustering model again, and checking for score
kmeans.fit_transform(X_train)

labels = kmeans.labels_

print("silhouette coefficient: %.3f" % silhouette_score(X_train, labels))

silhouette coefficient: 0.115


<span style='color:navy'> Since that didn't better our clustering. I'll try another model on the now a multi-class classification problem </span>

In [367]:
from sklearn.cluster import DBSCAN

In [374]:
dbs = DBSCAN(eps = 1, min_samples = 50)

dbs.fit(X_train)

DBSCAN(algorithm='auto', eps=1, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=50, n_jobs=None, p=None)

In [378]:
labels = dbs.labels_

In [379]:
labels

array([-1, -1, -1, ..., -1, -1, -1])

<span style='color:navy'> There is only one label, -1. This will give an error if we tried to compute the silhouette score. The DBSCAN didn't work. </span>

In [382]:
print('silhouette coefficient: %.3f' % silhouette_score(X_train, labels))

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [386]:
# trying hierarchical model
from sklearn.cluster import AgglomerativeClustering

In [387]:
hir = AgglomerativeClustering(n_clusters = 2)

In [388]:
hir.fit(X_train)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=2,
            pooling_func='deprecated')

In [389]:
hir.labels_

array([0, 0, 0, ..., 0, 0, 0])

<span style='color:navy'> Again, there's only one label. Maybe this dataset is too scattered to be clustered, or just one big loose cluster. </span>

In [419]:
# just to cut the doubt, I will grid search over the KMeans
from sklearn.model_selection import GridSearchCV

In [420]:
kmeans = KMeans()

params = {'n_clusters' : [10, 200, 500]}

gs = GridSearchCV(estimator = kmeans, param_grid = params, cv = 5)

In [421]:
# fitting the entire dataset, before standarizing
gs.fit(X)

gs.best_params_



{'n_clusters': 500}

In [423]:
kmeans = KMeans(n_clusters = 500)
kmeans.fit(X)
labels = kmeans.labels_
print('silhouette coefficient: %.3f' % silhouette_score(X, labels))

silhouette coefficient: 0.076
