# Shout Optimizer data processing

The aim of this tool is to create a price estimator for new products on an online auction site.
* Data is downloaded from Huuto.net API
* Items are clustered together using machine learning
* Same clusters have only same item
* An average price for cluster is calculated

In [201]:
import pandas as pd
import numpy as np

import string

import re

import csv

from gensim.models import phrases, Word2Vec

from sklearn.cluster import KMeans
from sklearn import cluster
from sklearn import metrics

from sklearn.cluster import AgglomerativeClustering

from ast import literal_eval


from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser

import gensim
import gensim.downloader as api

In [202]:
# Assess sample size for test purposes
sample = 1500

# Import data that has been downloaded from huuto.net API
df = pd.read_csv('ps2_data.csv')

# Print columns to inspect data
print(df.columns)

# Apply eval function to links because they are a list, not String
df['links']=df['links'].apply(literal_eval)

# Subset data with sample variable
#df = df[:sample].copy()

Index(['Unnamed: 0', 'index', 'links', 'id', 'title', 'category', 'seller',
       'sellerId', 'currentPrice', 'buyNowPrice', 'saleMethod', 'listTime',
       'postalCode', 'location', 'closingTime', 'bidderCount', 'offerCount',
       'hasReservePrice', 'hasReservePriceExceeded', 'upgrades', 'images'],
      dtype='object')


{'self': 'https://api.huuto.net/1.1/items/520206177',
 'category': 'https://api.huuto.net/1.1/categories/459',
 'alternative': 'https://www.huuto.net/kohteet/the-scorpion-king---rise-of-the-akkadian-ps2/520206177',
 'images': 'https://api.huuto.net/1.1/items/520206177/images'}

### Preprocessing

In [203]:
# Take only wanted columns from dataset
tdf = df[['title','links','currentPrice']].copy()

# Lower case all titles
tdf['title'] = tdf['title'].str.lower()
tdf['title'][100]

'transformers - the game ps2'

##### Remove punctuation:

In [270]:
tdf['title'] = tdf['title'].str.replace('[{}]'.format(string.punctuation),' ')
tdf['title'][100]

'transformers game'

##### Remove stopwords and extra whitespace:

In [205]:
with open('stop-word-list_huuto.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        stopwords = row
stopwords = list(map(str.strip, stopwords))
#remove empties



In [206]:
def RemoveSpace(word):
    regex = re.compile(r'[\n\r\t\xa0]')
    word = regex.sub(" ", word)
    _RE_COMBINE_WHITESPACE = re.compile(r"(?a:\s+)")
    _RE_STRIP_WHITESPACE = re.compile(r"(?a:^\s+|\s+$)")
    
    word = _RE_COMBINE_WHITESPACE.sub(" ", word)
    word = _RE_STRIP_WHITESPACE.sub("", word) 
    return word

In [207]:
tdf['title'] = tdf['title'].str.replace(f'\\b({"|".join(stopwords)})\\b','')        
tdf['title'] = tdf['title'].apply(lambda x: (RemoveSpace(x)))
titles2 = tdf
titles2['title'][1440:1450]

1440                                              jak x
1441                         crash bandicoot nitro kart
1442                                       sonic heroes
1443    tom clancy s splinter cell pandora tomorrow 467
1444                                        deus ex 460
1445                                           singstar
1446                                                   
1447                 star wars episode iii revenge sith
1448                                       sonic heroes
1449                           alone dark new nightmare
Name: title, dtype: object

In [208]:
# Remove empty titles
titles2 = titles2[titles2['title'] != ''].copy()
titles2= titles2.reset_index(drop=True)
titles2[1440:1450]

Unnamed: 0,title,links,currentPrice
1440,jak x,{'self': 'https://api.huuto.net/1.1/items/5201...,10.0
1441,crash bandicoot nitro kart,{'self': 'https://api.huuto.net/1.1/items/5206...,20.0
1442,sonic heroes,{'self': 'https://api.huuto.net/1.1/items/5201...,9.0
1443,tom clancy s splinter cell pandora tomorrow 467,{'self': 'https://api.huuto.net/1.1/items/5243...,4.0
1444,deus ex 460,{'self': 'https://api.huuto.net/1.1/items/5243...,8.0
1445,singstar,{'self': 'https://api.huuto.net/1.1/items/5246...,6.0
1446,star wars episode iii revenge sith,{'self': 'https://api.huuto.net/1.1/items/5245...,4.15
1447,sonic heroes,{'self': 'https://api.huuto.net/1.1/items/5229...,5.0
1448,alone dark new nightmare,{'self': 'https://api.huuto.net/1.1/items/5222...,16.0
1449,nhl 2k8 playstation 2,{'self': 'https://api.huuto.net/1.1/items/5238...,5.0


#### Create lists of words

In [210]:
titles2['title'] = titles2['title'].str.split()
dataset = titles2
dataset[:10]

Unnamed: 0,title,links,currentPrice
0,"[racing, simulation, 3, 10244]",{'self': 'https://api.huuto.net/1.1/items/5199...,4.5
1,"[sopranos, road, respect]",{'self': 'https://api.huuto.net/1.1/items/5199...,12.0
2,"[red, faction, ii]",{'self': 'https://api.huuto.net/1.1/items/5199...,8.0
3,"[reservoir, dogs]",{'self': 'https://api.huuto.net/1.1/items/5200...,12.0
4,"[lego, batman, video, game]",{'self': 'https://api.huuto.net/1.1/items/5198...,10.0
5,"[ratchet, gladiator]",{'self': 'https://api.huuto.net/1.1/items/5201...,10.0
6,"[star, wars, force, unleashed]",{'self': 'https://api.huuto.net/1.1/items/5201...,11.0
7,"[star, trek, conquest]",{'self': 'https://api.huuto.net/1.1/items/5201...,10.0
8,"[mercenaries, 2, world, flames]",{'self': 'https://api.huuto.net/1.1/items/5201...,8.0
9,"[men, black, ii, alien, escape]",{'self': 'https://api.huuto.net/1.1/items/5201...,9.0


## Bigrams with gensim Phrases

In [211]:
sentences = dataset.copy()

sentences

Unnamed: 0,title,links,currentPrice
0,"[racing, simulation, 3, 10244]",{'self': 'https://api.huuto.net/1.1/items/5199...,4.5
1,"[sopranos, road, respect]",{'self': 'https://api.huuto.net/1.1/items/5199...,12.0
2,"[red, faction, ii]",{'self': 'https://api.huuto.net/1.1/items/5199...,8.0
3,"[reservoir, dogs]",{'self': 'https://api.huuto.net/1.1/items/5200...,12.0
4,"[lego, batman, video, game]",{'self': 'https://api.huuto.net/1.1/items/5198...,10.0
...,...,...,...
6040,"[sims, 2, autiosaari]",{'self': 'https://api.huuto.net/1.1/items/5261...,1.0
6041,"[v, rally, 3, käytetty]",{'self': 'https://api.huuto.net/1.1/items/5260...,8.5
6042,"[forgotten, realms, demon, stone]",{'self': 'https://api.huuto.net/1.1/items/5255...,3.8
6043,"[nhl, 2001, playstation, 2]",{'self': 'https://api.huuto.net/1.1/items/5238...,3.0


In [221]:
# Create bigrams with gensim phrases.
# Bigrams are identified naturally from the data, for example 'star', 'wars' -> 'star_wars'
# This will help distinguish for example Star Wars and Star Trek from each other.
# It also works as dimensionality reduction.

phrases = Phrases(sentences['title'], min_count=1, threshold=1)

bigram = Phraser(phrases)

In [None]:
bigrams = phrases.Phrases(sentences['title'])
bigrams[sentences['title']]

In [222]:
# Initialize the bigrams column in sentences DataFrame

sentences['bigrams']=''

# Create a list of bigrams for each row

for index, row in sentences.iterrows():
    i  = bigram[row['title']]
    sentences.at[index,'bigrams']=i

['lego_batman', 'video_game']


In [None]:
# Make a Word2Vec vectorization with bigrams
# Note: this is not used, but it is left here for future development purposes

m = Word2Vec(sentences['bigrams'], size =50, min_count=1, sg=1, iter=20)

# Test to see results of the model

m.wv.most_similar('force_unleashed')

In [224]:
# Turn Word2Vec vectorization into a more usable form
# Note: this is not used, but it is left here for future development purposes

def vectorizer(sent,m):
    vec =[]
    numw =0
    for w in sent:
        try:
            if numw == 0:
                vec = m[w]
            else: vec = np.add(vec, m[w])
            numw +=1
        except:
            pass
        
    return np.asarray(vec) / numw

  
  if __name__ == '__main__':


In [None]:
# Run the vectorizer function
# Note: this is not used, but it is left here for future development purposes

l=[]
for i in sentences['bigrams']:
    l.append(vectorizer(i,m))
X = np.array(l)

In [225]:
# Check X
# Note: this is not used, but it is left here for future development purposes

X

array([[-7.7660196e-05,  1.8645324e-03, -2.9357316e-03, ...,
         3.6388212e-03, -2.2470416e-03,  4.2536426e-03],
       [-6.4247204e-03,  6.3266405e-03,  5.3309620e-04, ...,
        -3.7191394e-03, -3.3003383e-04, -2.2671768e-03],
       [-3.8311707e-03,  1.5928482e-03, -2.9937066e-03, ...,
         1.9909623e-03, -3.2473728e-03,  1.8449014e-03],
       ...,
       [-3.4147839e-04, -1.3880475e-03, -6.1621917e-03, ...,
        -1.0889105e-04, -1.1411640e-03,  9.5788436e-03],
       [-7.1901525e-04, -1.7206230e-03, -4.3801884e-03, ...,
         1.1538871e-03,  4.3616248e-03, -7.0206830e-03],
       [-7.5207609e-03, -2.0042148e-03, -1.1678798e-03, ...,
         6.0614287e-03,  6.2229461e-03, -3.8640955e-03]], dtype=float32)

In [226]:
# Get corpus, or vocabulary of different words and bigrams in the data

# bag of words

corpusW = []
for row in sentences['bigrams']:
    for word in row:
        if word not in corpusW:
            corpusW.append(word)

bagofwords= set(corpusW)
corpusW

[['ps_2', 283],
 ['pal', 228],
 ['2', 194],
 ['playstation_2', 173],
 ['need_speed', 105],
 ['medal_honor', 105],
 ['3', 92],
 ['platinum', 65],
 ['star_wars', 60],
 ['grand_theft', 59],
 ['final_fantasy', 59],
 ['crash_bandicoot', 58],
 ['tom_clancy', 56],
 ['rising_sun', 50],
 ['harry_potter', 50],
 ['metal_gear', 50],
 ['racing', 50],
 ['4', 49],
 ['pro_evolution', 48],
 ['singstar_legendat', 42],
 ['b_pal', 40],
 ['gran_turismo', 39],
 ['lord_rings', 39],
 ['k', 39],
 ['ratchet_clank', 38],
 ['city', 37],
 ['agent_hugo', 37],
 ['s_splinter', 36],
 ['ii', 36],
 ['guitar_hero', 35],
 ['james_bond', 34],
 ['fight_night', 33],
 ['lego_star', 33],
 ['–', 32],
 ['hits', 31],
 ['rollercoaster_world', 31],
 ['eyetoy_play', 31],
 ['b', 31],
 ['prostreet', 31],
 ['solid_2', 30],
 ['bakugan_battle', 29],
 ['max_payne', 29],
 ['sims_2', 29],
 ['eye_toy', 28],
 ['singstar_abba', 28],
 ['revenge_sith', 28],
 ['prince_persia', 27],
 ['sons_liberty', 27],
 ['brawlers', 27],
 ['promo', 26],
 ['lego

In [None]:
# Initialize new columns for each word/bigram in dataframe

km_df3 = sentences.copy()
for item in bagofwords:
    km_df3[item] = 0


In [None]:
# VECTORIZE Bag of words dataframe
# The number of occurrences of a gram does not matter, only whether the words exists or not
# For example a title saying 'Star Wars 1, Part of Star Wars trilogy' does not make the title more "Star Wars-like"
# Than just "Star Wars 1".

for index,row in km_df3.iterrows():
    for word in row['bigrams']:
        km_df3.at[index, word] = 1

In [None]:
# Find counts of grams in the data

countsW = []
dropdf1 = km_df3.drop(['title','bigrams','links','currentPrice'],axis=1)
for key, item in dropdf1.iteritems():
    countsW.append([item.name,item.sum()])
    


In [None]:
# Sort the list to find the most common grams in data

countsW.sort(key = lambda i: i[1],reverse=True) 
countsW

In [227]:
# Drop columns that create noise in the data. These columns should not be basis for a cluster.

dropdf1 = dropdf1.drop(['ps_2','pal','playstation_2','platinum','b_pal','–','pal_sinetöity','play_station','ps_ii','video_game','game','trilogy','2_pelilevy','nro','pelit','hyvä','pelejä','muoveissa','harvinainen','suomiversio'],axis=1)

## Clusterize the data with K-Means algorithm

In [228]:
# Number of clusters has been decided based on trying different amounts of clusters

n_clusters = 1650

hc = KMeans(n_clusters = n_clusters, max_iter =300, init = 'k-means++', n_init=1)
y_hc = hc.fit_predict(dropdf1)

# Test to see one cluster
for index, sentence in sentences.iterrows():
    if(y_hc[index] == 25):
#T        print(str(y_hc[index])+ ":" + str(sentence['title']))

25:['singstar', 'r', 'b']
25:['shrek', 'third', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['ps', '2', 'singstar', 'r', 'b', 'pal']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b', '9405']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']
25:['singstar', 'r', 'b']


In [229]:
# Test to see first x clusters in the data

for index, sentence in sentences.iterrows():
    for jndex, jentence in sentences.iterrows():
        if(y_hc[jndex]==index):
                print(str(y_hc[jndex])+ ":" + str(jentence['title']))
    
    if index > 10:
        break

0:['crash', 'nitro', 'kart', 'postikulun']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
1:['lego', 'batman', 'video', 'game']
2:['fight', 'night', 'round', '3']
2:['fight', 'night', 'round', '3']
2:['fight', 'night', 'round', '3']
2:['fight', 'night', 'round', '3']
2:['fight', 'night', 'round', '3']
2:['fight', 'night', 'round', '3']
2:['fight', 'night', 

## Enhance the clustering

In [230]:
# Find the most common label in clusters. The biggest class contains noise data: 
# items that are hard to clusterize for k-means algorithm.
# For example, items that contain multiple games, typos in title, rare games that only occur once in data.

from statistics import mode
noiseClass= mode(y_hc)
noiseClass

62

In [231]:
# Add the labels to the dataframe

dropdf1['label']=y_hc

In [232]:
# Create a new DataFrame just for the noise data
# Save the old index, and reindex the dataframe for easier access

noises = dropdf1.loc[dropdf1['label']==noiseClass].copy()
noises['index1'] = noises.index
noises =noises.reset_index(drop=True)
noises['index1']

0        80
1       142
2       151
3       202
4       224
       ... 
349    5991
350    6022
351    6024
352    6033
353    6037
Name: index1, Length: 354, dtype: int64

In [233]:
# Try clusterizing the noise data again.
# This time, hierarchical clustering is used because data is divided into small clusters

n_clusters2 = int(len(noises)/1.25)

hc2 = AgglomerativeClustering(n_clusters = n_clusters2, affinity = 'euclidean', linkage = 'ward')

y_hc2 = hc2.fit_predict(noises.drop('index1',axis=1))

In [234]:
# Print the clusters in noise data for inspection.

i = 0
for index, sentence in noises.iterrows():
    for jndex, jentence in noises.iterrows():
        if(y_hc2[jndex]==index):
                print(str(y_hc2[jndex])+ ":" + str(sentences.at[jentence['index1'],'title']))
    
    if index > n_clusters2:
        break

0:['pinball']
0:['powershot', 'pinball']
1:['007', 'nightfire']
1:['james', 'bond', '007', 'nightfire']
2:['gta', 'iii']
2:['gta', 'iii']
3:['just', 'cause']
3:['just', 'cause', '465']
4:['iridium', 'runners', '8591']
4:['iridium', 'runners']
5:['8kpl', 'pelejä']
6:['ico', 'pal']
6:['ico', 'pal']
7:['sky', 'odyssey']
7:['sky', 'odyssey', '11548']
8:['buzz', 'megavisa']
8:['buzz', 'megavisa']
9:['speed', 'racer']
9:['speed', 'racer']
10:['spytoy']
10:['spytoy', 'eye', 'toy']
11:['desert', 'storm', 'ps', '2', 'pell']
12:['celebrity', 'deathmarsh']
13:['vampire', 'night']
13:['vampire', 'night', 'ps', '2']
14:['forbidden', 'siren']
14:['forbidden', 'siren']
15:['makai', 'kingdom']
15:['makai', 'kingdom']
16:['singstar', 'take']
16:['singstar', 'take', 'harvinainen']
17:['iron', 'man']
17:['iron', 'man']
18:['powerdrome']
18:['powerdrome']
19:['dodgeball']
19:['dodgeball', 'pal']
20:['celebrity', 'deathmatch']
20:['celebrity', 'deathmatch']
21:['kultainen', 'kompassi']
21:['kultainen', 'ko

236:['top', 'gun', 'ps', '2']
237:['frendit', 'ps', '2']
238:['play', 'sports']
239:['metropolismania', 'ps', 'ii']
240:['bouncer']
241:['area', '51']
242:['play', 'station', '2', 'pelilevy', 'nro', '45']
243:['shox']
244:['play', 'station', '2', 'pelilevy', 'nro', '16']
245:['play', 'station', '2', 'pelilevy', 'nro', '60']
246:['kessen', '2']
247:['shadow', 'zorro']
248:['nemoa', 'etsimässä']
249:['play', 'station', '2', 'pelilevy', 'nro', '19']
250:['play', 'station', '2', 'pelilevy', 'nro', '39']
251:['play', 'station', '2', 'pelilevy', 'nro', '68']
252:['hitman', '2']
253:['xiii']
254:['psi', 'ops']
255:['ssx']
256:['f1', '06']
257:['ps', '2', 'motogp', 'pal']
258:['kinetic', 'combat']
259:['red', 'ninja']
260:['fantavision', 'pal']
261:['play', 'station', '2', 'pelilevy', 'nro', '22']
262:['nhl', '2k8', 'playstation', '2']
263:['robocop']
264:['speed', 'challenge']
265:['king', 'arthur']
266:['captain', 'scarlett']
267:['mojo', 'pal']
268:['play', 'station', '2', 'pelilevy', 'nro'

In [235]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

# The noise class actually contained items that belong in a cluster that was already made.
# Create a regression model to try to fit these items in the clusters
# After trial, logistic regression seems to work the best.

logReg = LogisticRegression()

linReg = LinearRegression()

rfc = RandomForestClassifier(n_estimators=100)

classModel = logReg

In [237]:
# Subset data: train with data that has labels that are not the biggest cluster
# Fit the data into model

fitdf = dropdf1.loc[dropdf1['label']!=noiseClass]

classifier = classModel.fit(fitdf.drop('label',axis=1),fitdf['label'])



In [239]:
# Find predictions for the noise items

noisePreds= classModel.predict(noises.drop(['index1','label','bestPred','bestAcc'],axis=1))

In [242]:
# The model cannot predict everything correctly, because not all data has a predetermined cluster
# This is indicated with data being predicted wrongly into a single cluster

wrongPred = mode(noisePreds)
wrongPred

43

In [243]:
# Find the good predictions, and store the target cluster and prediction accuracy

# Regression model has been trained with all other classes than the noise class.
# The scores variable is just a list of scores for each label that was given in the regression model fit function.
# So, all indexes in scores that are bigger than the noise class number are off by one.
# To fix this error, pred must be one smaller if it is bigger than the noise class when checking the best score.
 
minlim = 0.00001
maxlim = 0.06

goodPreds = []
for index, row in noises.iterrows():   
    t_x = noises.drop(['index1','label','bestPred','bestAcc'],axis=1).loc[[index]]
    pred = noisePreds[index]        
    scores = classifier.predict_proba(t_x)
    
    if pred >= noiseClass:
        bestScore = scores[0][pred-1]
    else:
        bestScore = scores[0][pred]
        
    if (bestScore >= minlim)& (bestScore<maxlim) &(pred!= wrongPred):
        goodPreds.append([index,pred,bestScore])
        print("Accuracy: " + str(bestScore))
        print(pred)
    

Accuracy: 0.011316124733541252
406
Accuracy: 0.001446817445093634
1451
Accuracy: 0.0014223935159845675
1404
Accuracy: 0.021259731386006592
263
Accuracy: 0.0014468174450936273
1601
Accuracy: 0.006930792142419627
456
Accuracy: 0.0014468174450936273
1559
Accuracy: 0.012807307807268105
154
Accuracy: 0.001217795330961697
244
Accuracy: 0.004467082112682401
1036
Accuracy: 0.0012989189155681446
524
Accuracy: 0.0014216657647251423
1333
Accuracy: 0.002813051884957945
617
Accuracy: 0.0012989216419338832
1059
Accuracy: 0.0014162713174661445
331
Accuracy: 0.0038664379543693022
1217
Accuracy: 0.0014162713174661445
331
Accuracy: 0.009110395367366667
402
Accuracy: 0.007252629242894306
663
Accuracy: 0.003160164568834006
421
Accuracy: 0.010083245855667894
245
Accuracy: 0.0012989216419338832
1059
Accuracy: 0.0013616149235995468
1129
Accuracy: 0.004173510169827388
148
Accuracy: 0.007027239017674773
67
Accuracy: 0.0013720598487197131
798
Accuracy: 0.0013722053399859307
1225
Accuracy: 0.0021813129079852164


In [244]:
# Print each "good" prediction, as well as all other items that were clustered in the same cluster in the hierarchical clustering
# And print the items in the predicted target cluster

for i,row in enumerate(goodPreds):
    print("----------------------------------")
    print("")
    print("Originals as noise, names: ")
    
    for j,row2 in enumerate(goodPreds):
        if (row2[1] == row[1]):
            print(str(sentences.at[noises.at[row2[0],'index1'],'title']))
        
    print("Would be reclassified into: " + str(row[1]))
    print("With accuracy: " + str(row[2]))
    print("Items in this cluster are: ")


    # test to see one cluster
    for index, sentence in sentences.iterrows():
        if(y_hc[index] == row[1]):                
            print(str(y_hc[index])+ ":" + str(sentence['bigrams']))

----------------------------------

Originals as noise, names: 
['eye', 'toy', 'chat']
['spytoy', 'eye', 'toy']
['eye', 'toy', 'play']
['play', 'sports']
['eye', 'toy', 'astro', 'zoo']
Would be reclassified into: 406
With accuracy: 0.011316124733541252
Items in this cluster are: 
406:['eye_toy', 'play_sports']
406:['eye_toy', 'play_sports']
406:['eye_toy', 'play_sports']
406:['eye_toy', 'play_sports']
406:['eye_toy', 'play_sports']
406:['eye_toy', 'play_sports']
----------------------------------

Originals as noise, names: 
['dancingstage', 'super', 'nova', '2']
Would be reclassified into: 1451
With accuracy: 0.001446817445093634
Items in this cluster are: 
1451:['dancingstage_super', 'nova']
----------------------------------

Originals as noise, names: 
['surfing', 'h30']
Would be reclassified into: 1404
With accuracy: 0.0014223935159845675
Items in this cluster are: 
1404:['sunny', 'garsia', 'surfing']
----------------------------------

Originals as noise, names: 
['lord', 'ring',

----------------------------------

Originals as noise, names: 
['speed', 'challenge']
Would be reclassified into: 1225
With accuracy: 0.0013722053399859307
Items in this cluster are: 
1225:['speed_challenge', 'jacques_villeneuve', 's', 'racing_vision']
----------------------------------

Originals as noise, names: 
['hardware', 'online', 'arena']
Would be reclassified into: 280
With accuracy: 0.0021813129079852164
Items in this cluster are: 
280:['hardware_online', 'arena', '42', '490', 'k', 'network', 'play', 'only']
----------------------------------

Originals as noise, names: 
['nhl', '2k8', 'playstation', '2']
Would be reclassified into: 1470
With accuracy: 0.0014427172373663519
Items in this cluster are: 
1470:['nhl_2k8', '482']
----------------------------------

Originals as noise, names: 
['madden', '2004']
['fifa', '2004']
Would be reclassified into: 245
With accuracy: 0.010073181833505874
Items in this cluster are: 
245:['fifa_football', '2004', '11333']
245:['fifa_football

975:['sonic_unleashed', '26', '202', 'tasohyppely']
----------------------------------

Originals as noise, names: 
['formula', 'challenge']
['lotus', 'challenge']
Would be reclassified into: 210
With accuracy: 0.013053095897353612
Items in this cluster are: 
210:['grand_prix', 'challenge']
210:['grand_prix', 'challenge']
210:['grand_prix', 'challenge']
210:['grand_prix', 'challenge']
210:['grand_prix', 'challenge']
210:['ps_2', 'grand_prix', 'challenge', 'b_pal']
----------------------------------

Originals as noise, names: 
['twenty', 'party', 'playstation', '2']
['play', 'station', '2', 'pelilevy', 'nro', '39']
Would be reclassified into: 154
With accuracy: 0.001043324306251911
Items in this cluster are: 
154:['twenty_2', 'party']
154:['twenty_2', 'party']
154:['twenty_2', 'party']
154:['twenty_2', 'party', '39']
154:['twenty_2', 'party']
154:['ps_2', 'twenty_2', 'party', 'pal']
----------------------------------

Originals as noise, names: 
['play', 'station', '2', 'pelilevy', 'nr

518:['age_empires', 'ii_age', 'kings']
518:['age_empires', 'ii_age', 'kings', '470']
----------------------------------

Originals as noise, names: 
['metropolismania', 'ps', 'ii']
['ps', '2', 'pes', 'management', 'pal']
['vegas', 'casino']
Would be reclassified into: 5
With accuracy: 0.0017310655818382887
Items in this cluster are: 
5:['soulcalibur', '2']
5:['pes', '2']
5:['warriors_orochi', '2']
5:['castle_shikigami', '2']
5:['time_crisis', '2']
5:['vegas_casino', '2']
5:['ice_age', '2']
5:['midnight_club', '2']
5:['metropolismania', '2']
5:['warriors_orochi', '2']
----------------------------------

Originals as noise, names: 
['eye', 'toy', 'chat']
['spytoy', 'eye', 'toy']
['eye', 'toy', 'play']
['play', 'sports']
['eye', 'toy', 'astro', 'zoo']
Would be reclassified into: 406
With accuracy: 0.01720023514008179
Items in this cluster are: 
406:['eye_toy', 'play_sports']
406:['eye_toy', 'play_sports']
406:['eye_toy', 'play_sports']
406:['eye_toy', 'play_sports']
406:['eye_toy', 'play_

95:['eyetoy_play']
95:['eyetoy_play']
95:['eyetoy_play', 'platinum', 'vain_levy']
95:['eyetoy_play']
95:['ps_2', 'eyetoy_play', 'pal']
95:['eyetoy_play']
95:['eyetoy_play']
----------------------------------

Originals as noise, names: 
['ratched', 'clank', '3']
Would be reclassified into: 1210
With accuracy: 0.0014195001961333368
Items in this cluster are: 
1210:['ratchet', 'and', 'clank_3']
----------------------------------

Originals as noise, names: 
['knockout', 'kings', '2002']
['cricket', '2002']
['madden', 'nfl', '2002']
Would be reclassified into: 495
With accuracy: 0.006012613660740898
Items in this cluster are: 
495:['is_football', '2002', 'platinum']
495:['is_football', '2002']
495:['is_football', '2002']
----------------------------------

Originals as noise, names: 
['oni']
['oni']
Would be reclassified into: 1042
With accuracy: 0.001298471946383192
Items in this cluster are: 
1042:['oni', 'dark', 'future', 'uncertain', 'past']
----------------------------------

Origina

24:['toy_story', '3']
----------------------------------

Originals as noise, names: 
['fighting', 'fury']
['fighting', 'angelsps2']
Would be reclassified into: 1551
With accuracy: 0.0014180187668310004
Items in this cluster are: 
1551:['pride', 'fighting', 'championships']
----------------------------------

Originals as noise, names: 
['ps', '2', 'gun', 'pal']
Would be reclassified into: 1581
With accuracy: 0.0014461330921436625
Items in this cluster are: 
1581:['gun_pal', 'sinetöity']
----------------------------------

Originals as noise, names: 
['unlimited', 'saga']
Would be reclassified into: 750
With accuracy: 0.001371186703069472
Items in this cluster are: 
750:['king_fighters', 'collection', 'orochi', 'saga']
----------------------------------

Originals as noise, names: 
['espn', 'nba', 'basketball']
Would be reclassified into: 586
With accuracy: 0.0067113926603021064
Items in this cluster are: 
586:['espn_nba', '2k5']
586:['espn_nba', '2k5', 'huippu']
586:['espn_nba', '2k5'

In [248]:
# Change the label numbers of properly predicted classes

for i,row in enumerate(goodPreds):
    #print(y_hc[noises.at[row[0],'index1']])
    y_hc[noises.at[row[0],'index1']]=row[1]
    #print(y_hc[noises.at[row[0],'index1']])
    #print("----------------------------")

62
406
----------------------------
62
1451
----------------------------
62
1404
----------------------------
62
263
----------------------------
62
1601
----------------------------
62
456
----------------------------
62
1559
----------------------------
62
154
----------------------------
62
244
----------------------------
62
1036
----------------------------
62
524
----------------------------
62
1333
----------------------------
62
617
----------------------------
62
1059
----------------------------
62
331
----------------------------
62
1217
----------------------------
62
331
----------------------------
62
402
----------------------------
62
663
----------------------------
62
421
----------------------------
62
245
----------------------------
62
1059
----------------------------
62
1129
----------------------------
62
148
----------------------------
62
67
----------------------------
62
798
----------------------------
62
1225
----------------------------
62
280
-----------

In [252]:
# Drop the rows from noises that have already been reclassified

for i,row in enumerate(goodPreds):
    noises=noises.drop(i)

In [254]:
# Relabel rest of noise items with their new label+amount of previous clusters

for index, row in noises.iterrows():
    y_hc[row['index1']]=y_hc2[index]+n_clusters
    

In [261]:
# See if the reclassification worked

fixedTest = 1559
for row in y_hc:
    if row == fixedTest:
        print(row)

1559
1559


In [262]:
# Insert the new labels into the dataframe

dropdf1['label']=y_hc
sentences.loc[y_hc==fixedTest]

Unnamed: 0,title,links,currentPrice,bigrams
343,"[zapper, playstation, 2]",{'self': 'https://api.huuto.net/1.1/items/5225...,20.0,"[zapper, playstation_2]"
2243,"[zapper, 415]",{'self': 'https://api.huuto.net/1.1/items/5254...,5.0,"[zapper, 415]"


In [264]:
# See how many clusters we have now

dropdf1['label'].nunique()

1816

In [265]:
# Make df where cluster data is stored
# Columns: Cluster_id, average, median, item links, cluster name, (cluster bigrams)
clusterdf = pd.DataFrame(columns={'cTitle','iTitleList','cNum','mean','median','links','itemCount','priceList'})

for index, sentence in sentences.iterrows():
    cTitle = []
    cTitleList = []
    cNum = index
    mean = None
    median = None
    links= []
    itemCount = 0
    iTitleList = []
    
    priceList = []
    
    for jndex, jentence in sentences.iterrows():
        if(y_hc[jndex]==index):             
            #print(str(y_hc[jndex])+ ":" + str(jentence))
            
            # Add all cluster item titles to iTitleList
            iTitleList.append(jentence['title'])
            
            # Add title to iTitleList
            links.append(jentence['links'].get('alternative'))
            
            #Add one to itemCount
            itemCount += 1
            
            # Add price to priceList
            priceList.append(jentence['currentPrice'])
            
            
    # Calculate title for cluster (cTitle)
    for row in iTitleList:
        for word in row:
            if word not in cTitle:
                cTitle.append(word)
    
    priceList = np.array(priceList)
    # Calculate mean
    mean = np.mean(priceList)
    
    # Calculate median
    median = np.median(priceList)
    
    clusterdf = clusterdf.append({'cTitle':cTitle,'iTitleList': iTitleList,'cNum':cNum,
                                  'mean':mean,'median':median,'links':links,
                                  'itemCount':itemCoungt,'priceList':priceList}, ignore_index=True)
    if index > n_clusters:
        break

In [269]:
check = 495
#clusterdf[:check]
print(clusterdf['cTitle'][check])
print(clusterdf['links'][check])
clusterdf.loc[check]

['is', 'football', '2002', 'platinum', 'knockout', 'kings']
['https://www.huuto.net/kohteet/this-is-football-2002-platinum-ps2--40/521058892', 'https://www.huuto.net/kohteet/knockout-kings-2002-ps2/524753663', 'https://www.huuto.net/kohteet/this-is-football-2002-ps2/526313211', 'https://www.huuto.net/kohteet/this-is-football-2002-ps2/526153006']


mean                                                     2.4375
iTitleList    [[is, football, 2002, platinum], [knockout, ki...
priceList                                 [2.95, 3.0, 2.0, 1.8]
cNum                                                        495
median                                                    2.475
links         [https://www.huuto.net/kohteet/this-is-footbal...
itemCount                                                     4
cTitle          [is, football, 2002, platinum, knockout, kings]
Name: 495, dtype: object

In [267]:
clusterdf = clusterdf[:n_clusters]

In [268]:
clusterdf.to_csv("huutoClusters21_10.csv",index=False)

In [None]:
# Make search logic