In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.mixture import GaussianMixture

from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize

#from LocalitySensitiveHashing import *

#from annoy import AnnoyIndex

In [None]:
# Load the ratings file and drop its first and last columns by position,
# keeping everything in between.
train_data = pd.read_csv("ramen-ratings.csv").iloc[:, 1:-1]

In [None]:
# Display the trimmed frame; pandas elides the middle rows in the rendered output.
train_data

Unnamed: 0,Brand,Variety,Style,Country,Stars
0,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75
1,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1
2,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25
3,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75
4,Ching's Secret,Singapore Curry,Pack,India,3.75
...,...,...,...,...,...
2575,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Bowl,Vietnam,3.5
2576,Wai Wai,Oriental Style Instant Noodles,Pack,Thailand,1
2577,Wai Wai,Tom Yum Shrimp,Pack,Thailand,2
2578,Wai Wai,Tom Yum Chili Flavor,Pack,Thailand,2


In [None]:
# Build the text that gets vectorised: variety + brand + style + country in one string.
#
# Every column is turned into a string (missing values become "") BEFORE the
# concatenation. Concatenating first lets a single missing Brand/Style/Country
# turn the whole row's result into a missing value, so the product's
# description would be lost instead of just lacking that one field.
#
# NOTE: this overwrites "Variety" in place, so re-running this cell appends the
# brand/style/country a second time; re-run from the loading cell instead.
text_parts = train_data[["Variety", "Brand", "Style", "Country"]].fillna("").astype(str)

train_data["Variety"] = (
    text_parts["Variety"] + " " + text_parts["Brand"] + " "
    + text_parts["Style"] + " " + text_parts["Country"]
)

In [None]:
#Processing the text
# Single Lancaster stemmer instance, reused by processing() for every token.
stemmer = LancasterStemmer()

def processing(text):
  """Normalise one description string before vectorisation.

  Strips @-mentions, http/https URLs and every character that is neither a
  word character nor whitespace, lower-cases what is left, and replaces each
  whitespace-separated token with its stem from the module-level `stemmer`.

  Args:
    text: the raw description; must be a str.

  Returns:
    The stemmed tokens joined by single spaces (an empty string when no
    tokens remain).
  """
  text = re.sub(r'\@\S+','',text)
  # `https?` so secure links are removed as well; with only `http://` an
  # https URL survived and was then mangled by the punctuation strip below.
  text = re.sub(r'https?\:\/\/\S+','',text)
  text = re.sub(r'[^\w\s]','',text)
  # split() with no argument splits on any run of whitespace and never yields
  # empty tokens; split(" ") produced '' tokens wherever removing punctuation
  # left two spaces in a row, and ignored tabs/newlines.
  tokens = text.lower().split()
  stemmed_tokens = [stemmer.stem(x) for x in tokens]
  return ' '.join(stemmed_tokens)

# Replace each combined description with its normalised, stemmed form,
# then show the resulting frame.
raw_descriptions = train_data["Variety"]
train_data["Variety"] = raw_descriptions.apply(processing)
train_data

Unnamed: 0,Brand,Variety,Style,Country,Stars
0,New Touch,ts resta tantanm new touch cup jap,Cup,Japan,3.75
1,Just Way,noodl spicy hot sesam spicy hot sesam guanmiao...,Pack,Taiwan,1
2,Nissin,cup noodl chick veget nissin cup us,Cup,USA,2.25
3,Wei Lih,gge ram snack tomato flav wei lih pack taiw,Pack,Taiwan,2.75
4,Ching's Secret,singap curry ching secret pack ind,Pack,India,3.75
...,...,...,...,...,...
2575,Vifon,hu tiu nam vang phnom penh styl as styl instan...,Bowl,Vietnam,3.5
2576,Wai Wai,ory styl inst noodl wai wai pack thailand,Pack,Thailand,1
2577,Wai Wai,tom yum shrimp wai wai pack thailand,Pack,Thailand,2
2578,Wai Wai,tom yum chil flav wai wai pack thailand,Pack,Thailand,2


In [None]:
# binary=True records whether a term occurs in a document rather than how often;
# min_df=5 ignores terms that appear in fewer than 5 documents.
vectorizer = CountVectorizer(binary=True,min_df=5)

#Fitting and processing countVectorizer
# fit_transform learns the vocabulary and builds the document-term matrix in a
# single call, instead of analysing the same column twice with fit() followed
# by transform(); the fitted vectorizer and the resulting matrix are the same.
train_vector = vectorizer.fit_transform(train_data["Variety"])

In [None]:
#K-means clustering
# Ten clusters with a fixed seed; fit() returns the estimator itself, so
# kmeans_v is the fitted model.
kmeans_model = KMeans(n_clusters=10, random_state=0)
kmeans_v = kmeans_model.fit(train_vector)

In [None]:
# Cluster index that the fitted K-means model assigned to each row.
kmeans_v.labels_

array([3, 1, 8, ..., 7, 7, 7], dtype=int32)

In [None]:
#Gaussian mixture clustering
# (The model fitted here is GaussianMixture, not MeanShift.)
# The sparse document-term matrix is converted to a dense array before fitting.
dense_counts = train_vector.toarray()
gaussian_m = GaussianMixture(n_components=10, random_state=0, init_params='random').fit(dense_counts)

In [None]:
# Component index predicted by the fitted mixture for every row, computed on
# the dense form of the document-term matrix.
dense_features = train_vector.toarray()
gaussian_v = gaussian_m.predict(dense_features)
gaussian_v

array([9, 7, 2, ..., 3, 3, 0])

In [None]:
# Re-read the CSV (train_data's "Variety" now holds the stemmed text), drop the
# first column and the last two by position, and attach both sets of cluster
# labels next to the original product fields.
x = pd.read_csv("ramen-ratings.csv").iloc[:, 1:-2]
x["kmeans"], x["gaussian"] = kmeans_v.labels_, gaussian_v
x

Unnamed: 0,Brand,Variety,Style,Country,kmeans,gaussian
0,New Touch,T's Restaurant Tantanmen,Cup,Japan,3,9
1,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1,7
2,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,8,2
3,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,1,8
4,Ching's Secret,Singapore Curry,Pack,India,7,4
...,...,...,...,...,...,...
2575,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Bowl,Vietnam,0,2
2576,Wai Wai,Oriental Style Instant Noodles,Pack,Thailand,2,8
2577,Wai Wai,Tom Yum Shrimp,Pack,Thailand,7,3
2578,Wai Wai,Tom Yum Chili Flavor,Pack,Thailand,7,3


In [None]:
from sklearn import metrics

# Rand index between the two partitions: the share of row pairs on which
# K-means and the Gaussian mixture agree (both put the pair in the same
# cluster, or both put it in different clusters).
clustering_agreement = metrics.rand_score(kmeans_v.labels_, gaussian_v)
clustering_agreement

0.8137536031933535