In [1]:
#imports

import pandas as pd
import gensim
import pyLDAvis.gensim
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint

pd.set_option('display.max_colwidth', -1)

## Classifying Tweets to Categories
<a id="intertopic_map"></a>

## LDA - Intertopic Distance Map

In [2]:
airlines = pd.read_csv('./data/with_sentiment.csv')

In [3]:
# Create Dictionary
id2word = corpora.Dictionary(airlines.lemmatized.str.split())

# Create Corpus
texts = airlines.lemmatized.str.split()

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [4]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=42,
                                           update_every=1,
                                           chunksize=500,
                                           passes=90,
                                           alpha='auto',
                                           per_word_topics=True)

In [5]:
# Print the keywords in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.012*"\'new\'," + 0.011*"\'seat\'," + 0.008*"\'first\'," + '
  '0.007*"\'ticket\'," + 0.007*"\'travel\'," + 0.005*"\'class\'," + '
  '0.005*"\'trip\'," + 0.005*"\'chicago\'," + 0.005*"\'free\'," + '
  '0.004*"\'via\']"'),
 (1,
  '0.011*"\'max\'," + 0.009*"\'american\'," + 0.009*"\'boeing\'," + '
  '0.007*"\'flight\'," + 0.006*"\'ever\'," + 0.006*"\'worst\'," + '
  '0.005*"\'delta\'," + 0.005*"\'far\'," + 0.005*"\'san\'," + '
  '0.005*"\'cancellation\',"'),
 (2,
  '0.009*"\'plane\'," + 0.008*"\'service\'," + 0.008*"\'get\'," + '
  '0.008*"\'customer\'," + 0.007*"\'time\'," + 0.007*"\'hour\'," + '
  '0.007*"\'u\'," + 0.007*"\'bag\'," + 0.006*"\'one\'," + 0.005*"\'day\',"')]


In [6]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Doc2Vec

In [7]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(airlines['text'])]

#  building a model
model = Doc2Vec(documents, vector_size=300, window=5, min_count=1, dm =1)
#Infer vector for a new document:
vector = model.infer_vector(["system", "response"])
#vector

In [8]:
similar_doc = model.docvecs.most_similar(3,topn = 50)
similar_doc.sort(key= lambda x: x[1])
similar_doc

[(5704, 0.8619507551193237),
 (1603, 0.8625760078430176),
 (5347, 0.8648976683616638),
 (3007, 0.8652684092521667),
 (913, 0.865969181060791),
 (1885, 0.8667536973953247),
 (1429, 0.8668698072433472),
 (3896, 0.8671457767486572),
 (2494, 0.867577314376831),
 (5398, 0.8688647747039795),
 (3182, 0.8700319528579712),
 (655, 0.8701715469360352),
 (4844, 0.870286762714386),
 (1975, 0.8714767694473267),
 (921, 0.8716994524002075),
 (3727, 0.872498631477356),
 (304, 0.8727860450744629),
 (490, 0.8728874325752258),
 (2002, 0.8739687204360962),
 (183, 0.8748645782470703),
 (5078, 0.8754234313964844),
 (2033, 0.8755291700363159),
 (2457, 0.8757647275924683),
 (5629, 0.8760643005371094),
 (2112, 0.8770704865455627),
 (1500, 0.8775063753128052),
 (629, 0.8776907324790955),
 (5309, 0.8779200315475464),
 (2999, 0.8786770105361938),
 (1240, 0.87958824634552),
 (5928, 0.8821991682052612),
 (422, 0.882508397102356),
 (5137, 0.882559061050415),
 (4823, 0.8825976848602295),
 (5627, 0.8835605978965759),
 

In [9]:
airlines.iloc[[i[0] for i in similar_doc],:]

Unnamed: 0.1,Unnamed: 0,text,tweet_coord,tweet_created,tweet_location,negative,positive,neutral,compound,lemmatized,clean,blob_polarity,blob_subjectivity
5704,5704,United Airlines put an underage passenger on a plane to the wrong country prompting an allnight ordeal between a,,Mon Jul 01 21:32:31 +0000 2019,,0.148,0.134,0.718,-0.0772,"['put', 'underage', 'passenger', 'plane', 'wrong', 'country', 'prompting', 'allnight', 'ordeal']",put underage passenger plane wrong country prompting allnight ordeal,-0.5,0.9
1603,1603,it would be great to get a callback from the agent that disconnected me trying to put me on hold after waiting to connectfor 60min,,2015-02-21 21:08:56 -0800,"Chicago, IL",0.0,0.151,0.849,0.6249,"['would', 'great', 'get', 'callback', 'agent', 'disconnected', 'trying', 'put', 'hold', 'waiting', 'connectfor', 'min']",would great get callback agent disconnected trying put hold waiting connectfor min,0.8,0.75
5347,5347,United airlines looking for a database engineer and I dont know nobody I can recommend FML,,Thu Jul 11 22:13:22 +0000 2019,Ikeja,0.0,0.325,0.675,0.6486,"['looking', 'database', 'engineer', 'dont', 'know', 'nobody', 'recommend', 'fml']",looking database engineer dont know nobody recommend fml,0.0,0.0
3007,3007,yes when I got to the gate I specifically asked if there where any other seats Very discouraging to walk past the crew 37d,,2015-02-18 22:27:14 -0800,,0.123,0.104,0.772,-0.1263,"['yes', 'got', 'gate', 'specifically', 'asked', 'seat', 'discouraging', 'walk', 'past', 'crew']",yes got gate specifically asked seat discouraging walk past crew,-0.25,0.25
913,913,is it even legal for you guys to advertise flights that you cant honor,,2015-02-22 21:03:42 -0800,H-town,0.163,0.093,0.744,-0.2796,"['even', 'legal', 'guy', 'advertise', 'flight', 'cant', 'honor']",even legal guy advertise flight cant honor,0.2,0.2
1885,1885,troubling news time to raise the bar on legroom,,2015-02-21 12:14:34 -0800,"ÜT: 39.768182,-86.167261",0.304,0.0,0.696,-0.5423,"['troubling', 'news', 'time', 'raise', 'bar', 'legroom']",troubling news time raise bar legroom,0.0,0.0
1429,1429,flight delayed by 24 hours lost my wallet have no money to eat or sleep yet your representatives will not helpunitedsucks,,2015-02-22 07:03:06 -0800,,0.262,0.0,0.738,-0.6597,"['delayed', 'hour', 'lost', 'wallet', 'money', 'eat', 'sleep', 'yet', 'representative', 'helpunitedsucks']",delayed hour lost wallet money eat sleep yet representative helpunitedsucks,0.0,0.0
3896,3896,When Spirit Airlines stock is 16 higher than your own you should really reconsider your continued p,,Wed Jun 19 23:40:26 +0000 2019,,0.0,0.102,0.898,0.1779,"['spirit', 'stock', 'higher', 'really', 'reconsider', 'continued', 'p']",spirit stock higher really reconsider continued p,0.225,0.35
2494,2494,no because you will charge me or delay me further United 1612 still waiting,,2015-02-20 07:42:57 -0800,"Orlando, FL",0.246,0.153,0.601,-0.1779,"['charge', 'delay', 'still', 'waiting']",charge delay still waiting,0.0,0.0
5398,5398,Southwest and United Airlines extend 737 Max cancellations after new flaw found via,,Sun Jul 14 21:03:19 +0000 2019,,0.0,0.29,0.71,0.5423,"['southwest', 'extend', 'max', 'cancellation', 'new', 'flaw', 'found', 'via']",southwest extend max cancellation new flaw found via,0.136364,0.454545
