In [None]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [None]:
# Load the raw sample (semicolon-separated, first column is the index).
df = pd.read_csv('Final sample Gettech.csv',sep=';',index_col=0)

# Columns that together identify one item row in the aggregated frames.
group_keys = ['accommodation_id','basename','at','description','value_type_id']

# Collect the amenity ids / amenity contents of every group into lists.
df_test = df.groupby(group_keys)['amenities_id'].apply(list)
df_test = df_test.to_frame().reset_index()
df_test2 = df.groupby(group_keys)['amenities_cont'].apply(list)
df_test2 = df_test2.to_frame().reset_index()

# Join on the full set of group keys. Joining on accommodation_id alone
# multiplies rows whenever one accommodation appears with more than one
# (basename, at, description, value_type_id) combination.
final_df = pd.merge(df_test, df_test2, on=group_keys, how="left")
final_df.to_csv('processed_csv_file.csv', sep='\t', encoding='utf-8')
final_df.head()

In [None]:
# Pull the columns used by the later cells out of final_df as plain lists.
item_names, item_descriptions, item_at = (
    final_df[column].tolist() for column in ('basename', 'description', 'at')
)

In [None]:
# English stop-word list from NLTK. The corpus is a separate download, so
# fetch it on first use instead of failing with LookupError on a fresh
# environment.
# NOTE(review): no other cell visible here reads `stopwords`; the TF-IDF
# vectorizer is configured with stop_words='english' instead.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

In [None]:
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer; tokenize_and_stem() uses this module-level object.
stemmer = SnowballStemmer(language="english")

In [None]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stem of every word-like token.

    The text is split into sentences first and each sentence is then split
    into words, so punctuation ends up as its own token. Tokens without any
    ASCII letter (numbers, bare punctuation) are discarded, and the rest are
    stemmed with the module-level ``stemmer``.
    """
    has_letter = re.compile('[a-zA-Z]').search
    return [
        stemmer.stem(word)
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if has_letter(word)
    ]


def tokenize_only(text):
    """Tokenize ``text`` into lower-cased word-like tokens without stemming.

    The text is split into sentences first and each sentence is then split
    into words, so punctuation ends up as its own token. Every token is
    lower-cased, and tokens without any ASCII letter (numbers, bare
    punctuation) are discarded.
    """
    has_letter = re.compile('[a-zA-Z]').search
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [token for token in lowered if has_letter(token)]

In [None]:
# Flatten every description into two vocabularies: the stems, and the
# lower-cased tokens produced by tokenize_only().
totalvocab_stemmed = [
    stem
    for description in item_descriptions
    for stem in tokenize_and_stem(description)
]
totalvocab_tokenized = [
    token
    for description in item_descriptions
    for token in tokenize_only(description)
]

In [None]:
# Lookup table from stem (index) to the tokenized word ('words' column).
vocab_frame = pd.DataFrame(data={'words': totalvocab_tokenized}, index=totalvocab_stemmed)
row_count = len(vocab_frame.index)
print('there are ' + str(row_count) + ' items in vocab_frame')

In [None]:
# Preview the first 100 stem -> word rows.
vocab_frame.head(n=100)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(item_descriptions) #fit the vectorizer to item_descriptions

print(tfidf_matrix.shape)

In [None]:
# Feature names (stemmed n-grams), one per column of tfidf_matrix.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when it exists. Its array result is converted to a
# list so `terms` keeps the type the older API returned.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between documents: one minus the cosine
# similarity of their TF-IDF rows.
dist = np.subtract(1, cosine_similarity(tfidf_matrix))

In [None]:
# Print every feature term extracted by the TF-IDF vectorizer.
print(terms)

In [None]:
from sklearn.cluster import KMeans

num_clusters = 2

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [None]:
# sklearn.externals.joblib was removed in scikit-learn 0.23; use the
# standalone joblib package when available and fall back to the bundled copy
# on older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Reuse a previously saved model when one exists; otherwise persist the model
# fitted above so later runs can load it. This replaces the manual
# "uncomment to save" step and no longer crashes when the pickle is missing.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you trust.
# NOTE(review): a stale pickle fitted on different data yields labels that do
# not line up with the current items; delete the file to refit.
if os.path.exists('doc_cluster2.pkl'):
    km = joblib.load('doc_cluster2.pkl')
else:
    joblib.dump(km, 'doc_cluster2.pkl')

clusters = km.labels_.tolist()

In [None]:
# Per-item metadata keyed by column name. 'description' is stored in `items`
# but is not among the columns selected for `frame`.
items = dict(name=item_names, type=item_at, description=item_descriptions, cluster=clusters)

# One row per item, indexed by its cluster label.
frame = pd.DataFrame(data=items, columns=['name', 'type', 'cluster'], index=[clusters])

In [None]:
# Number of items assigned to each cluster.
frame.loc[:, 'cluster'].value_counts()

In [None]:
# Item names grouped by cluster label, for aggregation.
grouped = frame.groupby(frame['cluster'])['name']

# Show the first five names of every cluster.
grouped.head()

In [None]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d names:" % i, end='')
    for title in frame.ix[i]['name'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

In [None]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

# Reduce the distance matrix to two components, since the points are plotted
# in a two-dimensional plane.
# "precomputed" because we provide a distance matrix ourselves.
# `random_state` is fixed so the layout is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

# x / y coordinate of every document in the 2-D embedding.
xs, ys = pos[:, 0], pos[:, 1]
print()
print()