In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import re

# Load the chat data from CSV; the tokenizer cell reads its `QandA` column.
df_t = pd.read_csv('QandA.csv')

In [2]:
# Split every chat into lowercase word tokens
from nltk.tokenize import RegexpTokenizer

# r'\w+' matches runs of word characters, so punctuation and whitespace are dropped
tokenizer = RegexpTokenizer(r'\w+')

txt = df_t['QandA'].apply(lambda chat: tokenizer.tokenize(chat.lower()))

  from collections import Sequence


In [3]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

porter_stemmer = PorterStemmer()


def stem_words(l):
    """Return a new list holding the Porter stem of each token in `l`."""
    stemmed = []
    for token in l:
        stemmed.append(porter_stemmer.stem(token))
    return stemmed


# One list of stems per tokenized chat
txt_list = [stem_words(tokens) for tokens in txt]

In [4]:
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatize_words(l):
    """Return a new list with every token of `l` run through the WordNet lemmatizer."""
    return list(map(wordnet_lemmatizer.lemmatize, l))


# Lemmatize each (already stemmed) token list
txt_list = [lemmatize_words(stems) for stems in txt_list]

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Corpus-specific additions, spelled in the stemmed form the tokens have at this
# point (e.g. 'welcom', 'experi', 'appreci').
# NOTE(review): the NLTK list itself is unstemmed, so stemmed variants such as
# 'onli' or 'becaus' are not filtered (they show up in the topic output) —
# consider stemming the stop list as well.
stop_words.extend(['hello', 'hi', 'welcom', 'headout', 'know', 'experi', 'refer', 'help', 'ani', 'chat', 'problem', 'may', 'reach', 'need', 'let', 'u', 'feel', 'free', 'contact', 'realli', 'appreci', 'could', 'rate', 'chat', 'thank', 'today', 'wa', 'nice', 'talk', 'great', 'day', 'goodby', 'would', 'like','plea', 'wait', 'minut', 'check', 'thi', 'anyth', 'el', 'step', 'away', 'assist', 'custom', 'bye', 'hey', 'ok','get', 'ye', 'safari', 'khalifa', 'burj', 'aquarium', 'roman', 'palatin', 'vatican', 'dubai'])

# Hash-based lookup: testing membership against the list costs a linear scan
# for every token of every chat.
stop_word_set = frozenset(stop_words)


def remove_stopwords(l):
    """Return the tokens of `l`, in order, that are not in the stop-word set."""
    return [word for word in l if word not in stop_word_set]


txt_list = [remove_stopwords(tokens) for tokens in txt_list]

In [48]:
# Re-join each token list into one space-separated string per chat
df_l = pd.Series([' '.join(tokens) for tokens in txt_list])

In [18]:
import numpy as np

In [20]:
# The token lists differ in length, so request an object array explicitly:
# NumPy >= 1.24 raises ValueError for ragged nested sequences without dtype=object.
np.array(txt_list, dtype=object).shape

(66180,)

In [22]:
# First 1000 entries of txt_list.
# NOTE(review): `tl` is not referenced by any other cell visible here — confirm it is still needed.
tl = txt_list[:1000]

In [None]:
import numpy as np
import scipy.sparse as ss
from corextopic import corextopic as ct

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records presence/absence of a term rather than its count
cv = CountVectorizer(
    binary=True,
    max_features=20000,
    stop_words='english',
)
c = cv.fit_transform(df_l)

In [52]:
# (rows, columns) of the matrix returned by cv.fit_transform(df_l)
c.shape

(66180, 10330)

In [53]:
# Column labels of `c`, in column order.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when the installed version provides it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# dtype=str keeps the element type the same whichever accessor was used
words = list(np.asarray(feature_names, dtype=str))

In [60]:
# Anchor word groups, passed as `anchors=` when fitting the anchored CorEx model.
# Entries are spelled in stemmed form (e.g. 'choos', 'togeth', 'issu').
# Inline labels are a reviewer's reading of each group, not names used by the code.
anchor_words = [['cashback', 'cash'],  # cashback
                ['refund', 'cancel'],  # refunds / cancellations
                ['child', 'adult','year','old','age','kid'],  # age categories
                ['seat', 'choos','select','section','offic','exact','togeth'],  # seat selection
                ['discount','coupon','code','use'],  # discount codes
                ['card','payment','work','complet','error','issu','differ','tri']  # payment problems
               ]

In [61]:
%%time
anchored_topic_model = ct.Corex(n_hidden=20, max_iter=200, seed=1)
anchored_topic_model.fit(c, words=words, anchors=anchor_words, anchor_strength=6)

CPU times: user 1min 40s, sys: 3.34 s, total: 1min 43s
Wall time: 52.3 s


In [63]:
# Print all topics from the anchored CorEx topic model, one line per topic
topics = anchored_topic_model.get_topics()
for n, topic in enumerate(topics):
    # Take the word (first field) of each entry by position instead of
    # tuple-unpacking zip(*topic): that unpack raises ValueError when a topic has
    # no entries or when entries carry more than two fields
    # (NOTE(review): newer corextopic releases appear to return 3-field entries — confirm).
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: cashback,cash,wallet,facebook,log,creat,account,user,virtual,futur
1: cancel,refund,polici,0100,347,897,strict,reschedul,touch,amend
2: adult,child,year,old,age,kid,abov,yr,18,price
3: seat,select,choos,offic,section,exact,togeth,box,best,intellig
4: use,discount,code,coupon,promo,browser,app,wow,alreadi,love
5: tri,card,differ,work,issu,payment,complet,error,credit,chrome
6: ticket,buy,purchas,book,onlin,want,onli,valid,befor,look
7: park,bird,king,dolphin,lion,garden,jurong,bay,seal,ice
8: http,www,com,tour,chapel,sistin,unit,arab,world,museum
9: tower,eiffel,cruis,pari,sein,summit,floor,franc,bateaux,river
10: hotel,pick,drop,desert,dinner,transfer,pickup,citi,dune,airport
11: pm,time,slot,30,date,open,septemb,00,hour,2018
12: group,servic,peopl,provid,altern,small,ask,resid,question,flag
13: support,write,request,apolog,delay,inventori,connect,respons,fulfil,manag
14: sorri,avail,unfortun,abl,websit,vendor,inconveni,sold,possibl,sinc
15: email,receiv,reserv,id,sent,make,send,con

In [54]:
%%time
topic_model = ct.Corex(n_hidden=20, words=words, max_iter=200, verbose=False, seed=1)
topic_model.fit(c, words=words)

CPU times: user 1min 44s, sys: 3.84 s, total: 1min 48s
Wall time: 55 s


In [55]:
# Print all topics from the CorEx topic model, one line per topic
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Take the word (first field) of each entry by position instead of
    # tuple-unpacking zip(*topic): that unpack raises ValueError when a topic has
    # no entries or when entries carry more than two fields
    # (NOTE(review): newer corextopic releases appear to return 3-field entries — confirm).
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: la,por,en,una,que,entrada,para,lo,si,se
1: http,www,skip,tour,com,line,guid,tower,eiffel,colosseum
2: card,use,credit,code,tri,facebook,coupon,payment,log,discount
3: cruis,arab,emir,unit,dinner,desert,dhow,river,sein,dune
4: pm,time,30,slot,00,hour,open,10,septemb,2018
5: empir,build,speak,english,state,piazza,spanish,translat,ancient,languag
6: seat,section,exact,best,togeth,choos,intellig,orchestra,map,avail
7: chapel,sistin,world,museum,pas,univers,studio,singapor,ferrari,sentosa
8: email,receiv,sent,send,id,confirm,reserv,number,familia,mail
9: hotel,pick,drop,point,transfer,pickup,citi,meet,locat,bu
10: child,year,adult,old,age,kid,abov,yr,18,daughter
11: box,offic,request,paid,chrome,write,googl,support,error,complet
12: cancel,inconveni,refund,polici,sorri,delay,apolog,strict,ha,48
13: book,make,websit,onli,purchas,abl,onlin,becaus,say,befor
14: trip,round,depart,boat,board,assur,question,wheelchair,inquiri,vegetarian
15: 19,max,22nd,rang,195,play,shot,fir,reduc,46
16: june,

In [15]:
# Labels for the toy example: one word per column, one document name per row
words = ['dog', 'cat', 'fish', 'apple', 'orange']
docs = ['fruit doc', 'animal doc', 'mixed doc']

# Toy matrix (rows are docs, columns are words), stored directly as a sparse CSR matrix
X = ss.csr_matrix(
    np.array(
        [[0, 0, 0, 1, 1],
         [1, 1, 1, 0, 0],
         [1, 1, 1, 1, 1]],
        dtype=int,
    )
)

# Fit a CorEx topic model with two latent (hidden) topics
topic_model = ct.Corex(n_hidden=2)
topic_model.fit(X, words=words, docs=docs)

<corextopic.corextopic.Corex at 0x7fb21be96128>

In [16]:
# Print each topic's words, numbering topics from 1
topics = topic_model.get_topics()
for topic_n, topic in enumerate(topics):
    # Collect the words under a loop-local name: unpacking into `words` here
    # would overwrite the vocabulary list that the fit cells pass as `words=`.
    topic_words = [entry[0] for entry in topic]
    topic_str = str(topic_n+1)+': '+','.join(topic_words)
    print(topic_str)

1: dog,cat,fish
2: apple,orange


In [11]:
# Print the document labels returned for each topic, numbering topics from 1
top_docs = topic_model.get_top_docs()
for topic_n, topic_docs in enumerate(top_docs):
    # Collect the labels under a loop-local name: unpacking into `docs` here
    # would overwrite the document-label list passed to fit(..., docs=docs).
    doc_labels = [entry[0] for entry in topic_docs]
    topic_str = str(topic_n+1)+': '+','.join(doc_labels)
    print(topic_str)

1: animal doc,mixed doc,fruit doc
2: animal doc,mixed doc,fruit doc
