In [1]:
#imports

import pandas as pd
import gensim
import pyLDAvis.gensim
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint

pd.set_option('display.max_colwidth', -1)

## Classifying Tweets to Categories
<a id="intertopic_map"></a>

## LDA - Intertopic Distance Map

In [2]:
airlines = pd.read_csv('./data/with_sentiment.csv')

In [3]:
# Create Dictionary
id2word = corpora.Dictionary(airlines.lemmatized.str.split())

# Create Corpus
texts = airlines.lemmatized.str.split()

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [4]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=42,
                                           update_every=1,
                                           chunksize=500,
                                           passes=90,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the keywords in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.011*"\'one\'," + 0.010*"\'ticket\'," + 0.009*"\'new\'," + '
  '0.009*"\'seat\'," + 0.008*"\'cancelled\'," + 0.006*"\'first\'," + '
  '0.006*"\'american\'," + 0.005*"\'travel\'," + 0.005*"\'problem\'," + '
  '0.005*"\'delta\',"'),
 (1,
  '0.014*"\'plane\'," + 0.012*"\'time\'," + 0.011*"\'hour\'," + '
  '0.010*"\'bag\'," + 0.008*"\'delayed\'," + 0.007*"\'max\'," + '
  '0.007*"\'delay\'," + 0.007*"\'gate\'," + 0.006*"\'boeing\'," + '
  '0.006*"\'day\',"'),
 (2,
  '0.011*"\'service\'," + 0.011*"\'customer\'," + 0.010*"\'u\'," + '
  '0.007*"\'get\'," + 0.006*"\'would\'," + 0.006*"\'like\'," + '
  '0.005*"\'make\'," + 0.005*"\'home\'," + 0.005*"\'know\'," + '
  '0.005*"\'help\',"')]


In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

## Doc2Vec

In [None]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(airlines['text'])]

#  building a model
model = Doc2Vec(documents, vector_size=300, window=5, min_count=1, dm =1)
#Infer vector for a new document:
vector = model.infer_vector(["system", "response"])
#vector

In [None]:
similar_doc = model.docvecs.most_similar(3,topn = 50)
similar_doc.sort(key= lambda x: x[1])
similar_doc

In [None]:
airlines.iloc[[i[0] for i in similar_doc],:]