In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from sklearn.metrics import silhouette_score, rand_score
from sklearn.cluster import KMeans

In [3]:
# Basic preprocessing
def preprocess(df):
    """Return a cleaned copy of the raw review frame; ``df`` itself is not modified.

    Steps:
      * drop the image, style, identifier and timestamp columns (names that
        are absent from ``df`` are simply ignored);
      * add one 0/1 indicator column per distinct ``category`` value, then
        drop ``category``;
      * turn ``vote`` into a 0/1 flag (1 when the vote count is above zero,
        a missing vote counts as zero);
      * replace a missing ``summary`` with the empty string;
      * convert every boolean-dtype column to 0/1 integers.

    The kept columns come back in sorted name order, followed by the
    category indicator columns in order of first appearance.
    """
    # Discard identifiers, style information, timestamps
    discarded = ['image', 'style', 'reviewTime',
                 'reviewerID', 'asin', 'reviewerName', 'unixReviewTime']
    # .copy() gives an independent frame, so the column assignments below
    # neither touch the caller's data nor trigger SettingWithCopy warnings.
    df_new = df[df.columns.difference(discarded)].copy()

    # Turn category into binary features
    for cat in df_new['category'].unique():
        df_new[cat] = df_new['category'] == cat

    # Drop category column
    df_new = df_new.drop(columns=['category'])

    # NaN vote is 0 users found helpful; turn vote into a binary feature.
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column accessor: that form works on a temporary under Copy-on-Write.
    df_new['vote'] = df_new['vote'].fillna(0) > 0

    # NaN summary is empty summary
    df_new['summary'] = df_new['summary'].fillna('')

    # Turn Booleans into binary variables. An explicit cast of the boolean
    # columns replaces a frame-wide replace({False: 0, True: 1}), which scans
    # the text columns too and depends on deprecated silent downcasting.
    bool_cols = df_new.select_dtypes(include='bool').columns
    df_new = df_new.astype({col: 'int64' for col in bool_cols})

    return df_new

In [4]:
def apply_tfidf(df, review_vectorizer, summary_vectorizer):
    """Replace the two text columns of ``df`` by their TF-IDF term columns.

    Parameters
    ----------
    df : DataFrame with a ``reviewText`` and a ``summary`` column.
    review_vectorizer, summary_vectorizer : already fitted vectorizers that
        provide ``transform`` (whose result has ``toarray``) and
        ``get_feature_names_out``.

    Returns
    -------
    A new DataFrame holding the remaining columns of ``df`` followed by the
    review terms (prefixed ``R_``) and the summary terms (prefixed ``S_``).
    ``df`` is not modified.
    """
    review_matrix = pd.DataFrame(
        data=review_vectorizer.transform(df['reviewText']).toarray(),
        columns=[f'R_{term}' for term in review_vectorizer.get_feature_names_out()],
        # Reuse the caller's index: concat(axis=1) aligns on index labels, so
        # a fresh RangeIndex would misalign rows (and add NaN rows) whenever
        # df does not carry the default 0..n-1 index.
        index=df.index,
    )
    summary_matrix = pd.DataFrame(
        data=summary_vectorizer.transform(df['summary']).toarray(),
        columns=[f'S_{term}' for term in summary_vectorizer.get_feature_names_out()],
        index=df.index,
    )
    df_new = pd.concat([df, review_matrix, summary_matrix], axis=1)
    return df_new.drop(columns=['summary', 'reviewText'])

In [5]:
# Load Test.csv and record the distinct values of its 'category' column
test_df = pd.read_csv('Test.csv')
cats = test_df['category'].unique()

# Task 3

In [6]:
# Clean the raw frame with preprocess(); test_df itself is left untouched
# because preprocess() works on a copy.
proc_df = preprocess(test_df)

In [7]:
# Fit one TF-IDF vectorizer per text column ('reviewText' and 'summary').
# Each vectorizer keeps at most 12000 features built from 1- to 3-word
# n-grams with English stop words removed.
tfidf_params = dict(max_features=12000, stop_words='english', ngram_range=(1, 3))
r_vectorizer = TfidfVectorizer(**tfidf_params)
s_vectorizer = TfidfVectorizer(**tfidf_params)
r_vectorizer.fit(proc_df.reviewText)
s_vectorizer.fit(proc_df.summary)

TfidfVectorizer(max_features=12000, ngram_range=(1, 3), stop_words='english')

In [8]:
# Apply TF-IDF vectorization: append the prefixed R_*/S_* term columns and
# drop the raw 'reviewText' and 'summary' columns.
# NOTE(review): proc_df is overwritten with the result, so this cell cannot be
# re-run on its own -- apply_tfidf() reads the text columns it has just dropped.
proc_df = apply_tfidf(proc_df, r_vectorizer, s_vectorizer)

In [9]:
# Inspect the vectorized frame
proc_df

Unnamed: 0,verified,vote,automotive,CDs,grocery,cell_phones,sports,toys,R_00,R_000,...,S_zionist,S_zionist israeli,S_zionist israeli terrorists,S_zipper,S_zipper pull,S_zipper small,S_zipper small pocket,S_zombie,S_zombie rarities,S_zombie rarities classy
0,1,0,1,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,1,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,1,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,1,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,1,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4495,1,0,0,0,0,0,0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4496,1,0,0,0,0,0,0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4497,1,0,0,0,0,0,0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4498,1,0,0,0,0,0,0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Build the feature matrix X: every column of proc_df except the category
# indicator columns. (Index.difference also returns the remaining column
# names in sorted order.)
feature_cols = proc_df.columns.difference(cats)
X = proc_df[feature_cols]

In [11]:
# Rescale every feature with RobustScaler, keeping X's row and column labels
scaler = preprocessing.RobustScaler()
X = pd.DataFrame(
    data=scaler.fit_transform(X),
    index=X.index,
    columns=X.columns,
)

In [12]:
# Optimization: find correlation threshold that yields a set of features
# with good silhouette and rand index scores.
# A feature is kept when its absolute correlation with at least one category
# indicator column exceeds the threshold.
# NOTE(review): the features are selected using the category labels, so the
# Rand index reported here is not a label-free evaluation of the clustering.

# The correlations do not depend on the threshold: compute them once instead
# of once per loop iteration.
abs_corr_by_cat = {cat: X.corrwith(proc_df[cat]).abs() for cat in cats}

for i in range(90, 121, 2):
    threshold = i / 1000
    # Flatten the per-category selections explicitly. Summing an object array
    # of lists concatenates the names into one string when the lists happen
    # to have equal length (or when there is only one category).
    relcols = np.unique([
        col
        for corr in abs_corr_by_cat.values()
        for col in corr[corr > threshold].index
    ])
    kmeans = KMeans(n_clusters=6, random_state=42)
    kmeans.fit(X[relcols])
    print(f'i={threshold}, #={len(relcols)}, sil={silhouette_score(X[relcols], kmeans.labels_)}')
    print(f"    rand={rand_score(test_df['category'], kmeans.labels_)}")

i=0.09, #=460, sil=0.5602397154714417
    rand=0.6320184732409672
i=0.092, #=427, sil=0.3714821694673595
    rand=0.6493811464276011
i=0.094, #=399, sil=0.571704023745826
    rand=0.6313660813514115
i=0.096, #=374, sil=0.5835301653854685
    rand=0.6318043021906102
i=0.098, #=356, sil=0.5874560094536463
    rand=0.6318043021906102
i=0.1, #=331, sil=0.5671146897833251
    rand=0.63323612654664
i=0.102, #=313, sil=0.5996783614107795
    rand=0.6320184732409672
i=0.104, #=294, sil=0.6051897070254941
    rand=0.6318043021906102
i=0.106, #=275, sil=0.6185538197893687
    rand=0.6320184732409672
i=0.108, #=258, sil=0.39551254827016946
    rand=0.5536157664666222
i=0.11, #=247, sil=0.5694457127192949
    rand=0.5308383591415377
i=0.112, #=233, sil=0.4620679780019511
    rand=0.5581230396878318
i=0.114, #=220, sil=0.41516404325945927
    rand=0.5574951470697191
i=0.116, #=211, sil=0.42601043059787236
    rand=0.5664448889876763
i=0.118, #=204, sil=0.5747631956375296
    rand=0.5313253809488528

In [13]:
# Final feature selection: keep every feature whose absolute correlation with
# at least one category indicator exceeds the threshold. 0.106 is the value
# with the highest silhouette score in the threshold sweep.
CORR_THRESHOLD = 0.106
# Flatten with a comprehension: summing an object array of lists breaks when
# the per-category lists happen to have equal length.
relcols = np.unique([
    col
    for cat in cats
    for col in X.columns[(X.corrwith(proc_df[cat]).abs() > CORR_THRESHOLD).to_numpy()]
])

In [14]:
# Show the names of the selected features
relcols

array(['R_acoustic', 'R_album', 'R_album just', 'R_albums',
       'R_arrangements', 'R_artist', 'R_artists', 'R_audience', 'R_baby',
       'R_ballad', 'R_ballads', 'R_band', 'R_bands', 'R_bass',
       'R_battery', 'R_beats', 'R_best', 'R_bike', 'R_birthday',
       'R_bitter', 'R_blues', 'R_boring', 'R_bubbles', 'R_bulbs',
       'R_butter', 'R_buttons', 'R_camera', 'R_car', 'R_career', 'R_case',
       'R_cases', 'R_catchy', 'R_cd', 'R_charge', 'R_charger',
       'R_charging', 'R_child', 'R_chips', 'R_chocolate', 'R_chorus',
       'R_christmas', 'R_classic', 'R_coffee', 'R_collection',
       'R_commercial', 'R_compilation', 'R_compositions', 'R_concert',
       'R_country', 'R_cups', 'R_cute', 'R_dance', 'R_daughter',
       'R_debut', 'R_delicious', 'R_disc', 'R_doll', 'R_drink',
       'R_dropped', 'R_drum', 'R_drums', 'R_dvd', 'R_eat', 'R_eating',
       'R_edges', 'R_era', 'R_example', 'R_fan', 'R_fans', 'R_filter',
       'R_final', 'R_fit', 'R_flavor', 'R_flavors', 'R_food

In [15]:
# Number of selected features that the clustering below is run on
len(relcols)

275

In [16]:
# Keep only the selected feature columns in X
X = X.loc[:, relcols]

In [17]:
# Final model: K-Means with 6 clusters and a fixed seed, fitted on the
# selected features; fit() returns the estimator, which is displayed below.
kmeans = KMeans(n_clusters=6, random_state=42).fit(X)
kmeans

KMeans(n_clusters=6, random_state=42)

In [18]:
# Report cluster cohesion (silhouette) and agreement with the true categories
final_silhouette = silhouette_score(X, kmeans.labels_)
print(f'Silhouette score: {final_silhouette}')
final_rand = rand_score(test_df['category'], kmeans.labels_)
print(f"RAND index score (ground: 'category'): {final_rand}")

Silhouette score: 0.6185538197893687
RAND index score (ground: 'category'): 0.6320184732409672
