In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Word2Vec

# spacy for lemmatization
import spacy

import nltk
from nltk.corpus import stopwords

from sklearn import cluster

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# NLTK stop words, extended with a few extra filler terms.
# (`stopwords` is already imported in the imports cell.)
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Path to folder with necessary files
path = 'YOUR/PATH/'

print('loading text...')
# Load review data and keep only reviews that mention a notebook or computer.
reviews = pd.read_csv(path+"test_cleaned_reviews.csv")
# na=False: a missing review would otherwise put NaN in the boolean mask,
# which pandas refuses to index with.
reviews = reviews[reviews.review.str.contains('notebook|computer', regex=True, na=False)]

# One entry per review (kept as a list so review boundaries are not lost).
reviews_text = reviews.review.values.tolist()


print('tokenizing...')
# Split every review into sentences BEFORE stripping non-letters: the sentence
# tokenizer needs the punctuation to find boundaries. Stripping first (as was
# done previously on the str() of the whole list) leaves nothing to split on,
# so the entire corpus collapsed into a single "sentence".
all_sentences = []
for review in reviews_text:
    for raw_sentence in nltk.sent_tokenize(str(review)):
        cleaned = re.sub('[^a-zA-Z]', ' ', raw_sentence.lower())
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        if cleaned:  # drop sentences that contained no letters at all
            all_sentences.append(cleaned)

# Flattened, cleaned corpus text; not used below, kept in case a later cell
# still expects this name.
processed_reviews = ' '.join(all_sentences)

# One token list per sentence.
data_words = [nltk.word_tokenize(sent) for sent in all_sentences]


print('bigrams, stopwords and lemmatizations...')
# Build the bigram model and function

def make_bigrams(texts):
    """Run every tokenized document through the fitted phrase model.

    Args:
        texts: iterable of token lists.

    Returns:
        A list with one entry per document, each being the result of
        indexing the module-level ``bigram_mod`` with that document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs


# Define functions for stopwords and lemmatization
def remove_stopwords(texts):
    """Tokenize each document with ``simple_preprocess`` and drop stop words.

    Args:
        texts: iterable of documents; each one is converted with ``str()``
            before being passed to ``simple_preprocess``.

    Returns:
        A list of token lists, one per input document, without any token
        that appears in the module-level ``stop_words``.
    """
    # Build the lookup set once per call: membership tests against the
    # stop_words list are a linear scan for every single token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized sentences, keeping only selected parts of speech.

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            parsed by the module-level spaCy pipeline ``nlp``.
        allowed_postags: container of coarse POS tags (``token.pos_``) to
            keep. The default is a tuple rather than a list so the default
            object cannot be mutated between calls.

    Returns:
        A list with one entry per input sentence, each a list of the
        ``lemma_`` values of the tokens whose ``pos_`` is allowed.
    """
    joined_sentences = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the texts in batches instead of one call per sentence.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_sentences)
    ]


# Strip stop words from every tokenized sentence.
data_words_nostops = remove_stopwords(data_words)

# Fit the phrase (bigram) model on the stop-word-free sentences, then wrap it
# in a Phraser, which is what make_bigrams applies to each sentence.
bigram = gensim.models.Phrases(
    data_words_nostops,
    min_count=2,
    threshold=10,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

data_words_bigrams = make_bigrams(data_words_nostops)

# Load the small English spaCy pipeline without the parser and NER components.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Raise spaCy's maximum text length so very long inputs are accepted,
# then lemmatize keeping only nouns, adjectives, verbs and adverbs.
nlp.max_length = 106568194
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)


print('fitting word2vec...')
# Fit word2vec on the lemmatized sentences; words seen fewer than 2 times are dropped.
word2vec = Word2Vec(data_lemmatized, min_count=2)
# Matrix with one embedding row per vocabulary word. Read it from the
# KeyedVectors (`.wv`): indexing the model itself and `wv.vocab` are
# deprecated and were removed in Gensim 4.
X = word2vec.wv.vectors

print('fitting KMeans...')
# Fit KMeans on the word vectors. A fixed random_state makes the cluster
# assignment repeatable across runs of the notebook.
kmeans = cluster.KMeans(n_clusters=10, random_state=42)
kmeans.fit(X)

print('running kmeans model on aspect sample...')
# Assign a cluster to every aspect whose name is in the word2vec vocabulary.
aspects = pd.read_csv(path+"laptop_filtered_aspect_sample.csv")
aspects['cluster'] = np.nan

# Rows whose aspect name is not a string (e.g. missing) or is not in the
# vocabulary keep cluster = NaN. This replaces a bare `except:` that hid
# every error, and a `range(len(aspects)-1)` loop that skipped the last row.
in_vocab = aspects['aspect_name'].map(
    lambda name: isinstance(name, str) and name in word2vec.wv
).astype(bool)

if in_vocab.any():
    known_names = aspects.loc[in_vocab, 'aspect_name'].tolist()
    # Single batched prediction, written back with .loc instead of chained
    # indexing (so the global chained-assignment warning need not be disabled).
    aspects.loc[in_vocab, 'cluster'] = kmeans.predict(word2vec.wv[known_names])

print('saving dataframe with cluster data...')
aspects.to_csv(path+'clustered_aspects.csv', index=False)