In [1]:
#imports

import pandas as pd
import gensim
import pyLDAvis.gensim
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint

pd.set_option('display.max_colwidth', -1)

## Classifying Tweets to Categories
<a id="intertopic_map"></a>

## LDA - Intertopic Distance Map

In [2]:
airlines = pd.read_csv('./data/with_sentiment.csv')

In [3]:
# Create Dictionary
id2word = corpora.Dictionary(airlines.lemmatized.str.split())

# Create Corpus
texts = airlines.lemmatized.str.split()

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [4]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=42,
                                           update_every=1,
                                           chunksize=500,
                                           passes=90,
                                           alpha='auto',
                                           per_word_topics=True)

In [5]:
# Print the keywords in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.011*"\'one\'," + 0.010*"\'ticket\'," + 0.009*"\'new\'," + '
  '0.009*"\'seat\'," + 0.008*"\'cancelled\'," + 0.006*"\'first\'," + '
  '0.006*"\'american\'," + 0.005*"\'travel\'," + 0.005*"\'problem\'," + '
  '0.005*"\'delta\',"'),
 (1,
  '0.014*"\'plane\'," + 0.012*"\'time\'," + 0.011*"\'hour\'," + '
  '0.010*"\'bag\'," + 0.008*"\'delayed\'," + 0.007*"\'max\'," + '
  '0.007*"\'delay\'," + 0.007*"\'gate\'," + 0.006*"\'boeing\'," + '
  '0.006*"\'day\',"'),
 (2,
  '0.011*"\'service\'," + 0.011*"\'customer\'," + 0.010*"\'u\'," + '
  '0.007*"\'get\'," + 0.006*"\'would\'," + 0.006*"\'like\'," + '
  '0.005*"\'make\'," + 0.005*"\'home\'," + 0.005*"\'know\'," + '
  '0.005*"\'help\',"')]


In [6]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Doc2Vec

In [7]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(airlines['text'])]

#  building a model
model = Doc2Vec(documents, vector_size=300, window=5, min_count=1, dm =1)
#Infer vector for a new document:
vector = model.infer_vector(["system", "response"])
#vector

In [8]:
similar_doc = model.docvecs.most_similar(3,topn = 50)
similar_doc.sort(key= lambda x: x[1])
similar_doc

[(2662, 0.9055027365684509),
 (3837, 0.9059790372848511),
 (5786, 0.9079231023788452),
 (1792, 0.9087985754013062),
 (4054, 0.9094513058662415),
 (3901, 0.9096845388412476),
 (3359, 0.9097814559936523),
 (3840, 0.9100387096405029),
 (4426, 0.91017746925354),
 (1006, 0.9102189540863037),
 (473, 0.9105767011642456),
 (6003, 0.9110958576202393),
 (4628, 0.9113451838493347),
 (5432, 0.9116016626358032),
 (4074, 0.913498044013977),
 (5487, 0.9155327677726746),
 (2548, 0.9158735275268555),
 (1386, 0.9163708090782166),
 (1524, 0.9175707697868347),
 (4244, 0.9176082015037537),
 (4964, 0.9180948734283447),
 (2364, 0.9181377291679382),
 (3929, 0.9189269542694092),
 (4982, 0.9191336035728455),
 (5533, 0.9191353917121887),
 (6099, 0.9195352792739868),
 (3408, 0.9209022521972656),
 (4386, 0.9210591316223145),
 (948, 0.921255886554718),
 (888, 0.9219498038291931),
 (1920, 0.9221858382225037),
 (5968, 0.9225319027900696),
 (5218, 0.9234181642532349),
 (5345, 0.9237065315246582),
 (465, 0.923873543739

In [9]:
airlines.iloc[[i[0] for i in similar_doc],:]

Unnamed: 0.1,Unnamed: 0,text,tweet_coord,tweet_created,tweet_location,negative,positive,neutral,compound,lemmatized,clean,blob_polarity,blob_subjectivity
2662,2662,board customers onto a plane with no pilot The only thing that expedited was my time in line to book a new flight Come on now,,2015-02-19 19:21:13 -0800,"Arlington, VA",0.087,0.0,0.913,-0.296,"['board', 'customer', 'onto', 'plane', 'pilot', 'thing', 'expedited', 'time', 'line', 'book', 'new', 'come']",board customer onto plane pilot thing expedited time line book new come,0.136364,0.454545
3837,3837,UnitedAirlines one of my family members lost his passport in a United Airlines flight and is trying to get it back,,Mon Jul 22 23:46:51 +0000 2019,,0.1,0.121,0.779,0.128,"['unitedairlines', 'one', 'family', 'member', 'lost', 'passport', 'trying', 'get', 'back']",unitedairlines one family member lost passport trying get back,0.0,0.0
5786,5786,United Airlines admits mistake over boarding dispute with Calgary mom and sick child,,Sat Jun 22 21:36:15 +0000 2019,"Daniel's Hr, NL",0.393,0.234,0.374,-0.5267,"['admits', 'mistake', 'boarding', 'dispute', 'calgary', 'mom', 'sick', 'child']",admits mistake boarding dispute calgary mom sick child,-0.714286,0.857143
1792,1792,im glad u can solve the prob But my experience remains will not fly again on United,,2015-02-21 14:38:26 -0800,,0.0,0.353,0.647,0.7269,"['glad', 'solve', 'prob', 'experience', 'remains', 'fly']",glad solve prob experience remains fly,0.65,0.95
4054,4054,ieri United Airlines is still in business and they had a passenger removed fr,,Tue Jul 16 23:54:08 +0000 2019,,0.0,0.189,0.811,0.4215,"['ieri', 'still', 'business', 'passenger', 'removed', 'fr']",ieri still business passenger removed fr,0.0,0.0
3901,3901,Basically zero customer service while stranded in Manila United Airlines had to help us Finally,,Tue Jun 25 21:53:24 +0000 2019,,0.0,0.297,0.703,0.6705,"['basically', 'zero', 'customer', 'service', 'stranded', 'manila', 'help', 'u', 'finally']",basically zero customer service stranded manila help u finally,0.0,1.0
3359,3359,just give up Your service is terrible If I wasnt forced to use United on certain routes I never would,,2015-02-18 06:09:24 -0800,"columbus, oh",0.127,0.301,0.572,0.5073,"['give', 'service', 'terrible', 'wasnt', 'forced', 'use', 'certain', 'route', 'never', 'would']",give service terrible wasnt forced use certain route never would,-0.361905,0.590476
3840,3840,frequent flyer for all airlines mileage system in airlines united frequent flyer miles program register frequen,,Wed Jul 17 23:34:12 +0000 2019,,0.0,0.157,0.843,0.4215,"['frequent', 'flyer', 'mileage', 'system', 'frequent', 'flyer', 'mile', 'program', 'register', 'frequen']",frequent flyer mileage system frequent flyer mile program register frequen,0.1,0.3
4426,4426,American Airlines or United Airlines Alaska Air,,Fri Jun 28 21:58:46 +0000 2019,"Seattle, WA",0.0,0.318,0.682,0.4215,"['american', 'alaska', 'air']",american alaska air,0.0,0.0
1006,1006,sitting on UAL 683 a comedy of errors UAL is incompetent,,2015-02-22 18:41:51 -0800,,0.367,0.167,0.467,-0.4588,"['sitting', 'ual', 'comedy', 'error', 'ual', 'incompetent']",sitting ual comedy error ual incompetent,-0.35,0.366667
