In [1]:
#imports

import pandas as pd
import gensim
import pyLDAvis.gensim
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint

pd.set_option('display.max_colwidth', -1)

## Classifying Tweets to Categories
<a id="intertopic_map"></a>

## LDA - Intertopic Distance Map

In [2]:
airlines = pd.read_csv('./data/with_sentiment.csv')

In [3]:
# Create Dictionary
id2word = corpora.Dictionary(airlines.lemmatized.str.split())

# Create Corpus
texts = airlines.lemmatized.str.split()

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [4]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=6, 
                                           random_state=42,
                                           update_every=1,
                                           chunksize=500,
                                           passes=90,
                                           alpha='auto',
                                           per_word_topics=True)

In [5]:
# Print the keywords in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.019*"\'max\'," + 0.011*"\'travel\'," + 0.011*"\'boeing\'," + '
  '0.010*"\'take\'," + 0.009*"\'class\'," + 0.008*"\'cancellation\'," + '
  '0.007*"\'via\']" + 0.007*"\'miss\'," + 0.007*"\'minute\'," + '
  '0.006*"\'first\',"'),
 (1,
  '0.016*"\'american\'," + 0.010*"\'air\'," + 0.009*"\'delta\'," + '
  '0.008*"\'landing\'," + 0.007*"\'fly\'," + 0.007*"\'made\'," + '
  '0.007*"\'seaworld\'," + 0.007*"\'hold\'," + 0.007*"\'bad\'," + '
  '0.006*"\'weather\',"'),
 (2,
  '0.029*"\'service\'," + 0.027*"\'customer\'," + 0.009*"\'passenger\'," + '
  '0.009*"\'experience\'," + 0.009*"\'ever\'," + 0.008*"\'international\'," + '
  '0.008*"\'worst\'," + 0.008*"\'check\'," + 0.008*"\'ive\'," + '
  '0.007*"\'fly\',"'),
 (3,
  '0.015*"\'flying\'," + 0.007*"\'boarding\'," + 0.006*"\'news\'," + '
  '0.006*"\'big\'," + 0.006*"\'wrong\'," + 0.005*"\'terminal\'," + '
  '0.004*"\'country\'," + 0.004*"\'aviation\'," + 0.004*"\'making\'," + '
  '0.004*"\'emirate\',"'),
 (4,
  '0.014*"\'plane\'," + 

In [6]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Doc2Vec

In [7]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(airlines['text'])]

#  building a model
model = Doc2Vec(documents, vector_size=300, window=5, min_count=1, dm =1)
#Infer vector for a new document:
vector = model.infer_vector(["system", "response"])
#vector

In [8]:
similar_doc = model.docvecs.most_similar(3,topn = 50)
similar_doc.sort(key= lambda x: x[1])
similar_doc

[(2129, 0.9532811641693115),
 (3857, 0.9535890817642212),
 (3889, 0.9537265300750732),
 (4020, 0.9539952278137207),
 (358, 0.954095184803009),
 (2057, 0.954257607460022),
 (4121, 0.954363226890564),
 (5224, 0.9546170830726624),
 (4753, 0.9548377990722656),
 (4346, 0.9550571441650391),
 (5401, 0.9552822113037109),
 (112, 0.9553805589675903),
 (5835, 0.9558187127113342),
 (5837, 0.9558968544006348),
 (1404, 0.9560844302177429),
 (5218, 0.9562580585479736),
 (1873, 0.9562865495681763),
 (1959, 0.9565156698226929),
 (2834, 0.9566709995269775),
 (4430, 0.9569083452224731),
 (2868, 0.9569994211196899),
 (44, 0.9571046829223633),
 (498, 0.9572144150733948),
 (2300, 0.9572303295135498),
 (4283, 0.9575250148773193),
 (1479, 0.9576835632324219),
 (4056, 0.9578889608383179),
 (5273, 0.9580610990524292),
 (2446, 0.9580742120742798),
 (4280, 0.9586573243141174),
 (1037, 0.9593287706375122),
 (957, 0.9596248269081116),
 (1884, 0.9608516693115234),
 (4486, 0.96208655834198),
 (888, 0.9621230363845825

In [9]:
airlines.iloc[[i[0] for i in similar_doc],:]

Unnamed: 0.1,Unnamed: 0,text,tweet_coord,tweet_created,tweet_location,negative,positive,neutral,compound,lemmatized,clean,blob_polarity,blob_subjectivity
2129,2129,Sure Follow for a sec and I will,,2015-02-20 19:18:26 -0800,"Omaha, Nebraska",0.0,0.315,0.685,0.3182,"['sure', 'follow', 'sec']",sure follow sec,0.2,0.744444
3857,3857,Airlines makes connecting the world easier than ever with ConnectionSaver,,Sun Jul 07 20:54:34 +0000 2019,Quad Cities Metropolitan Area,0.0,0.237,0.763,0.4215,"['make', 'connecting', 'world', 'easier', 'ever', 'connectionsaver']",make connecting world easier ever connectionsaver,0.0,0.0
3889,3889,Basically zero customer service while stranded in Manila United Airlines had to help us Finally,,Tue Jun 25 21:53:24 +0000 2019,,0.0,0.297,0.703,0.6705,"['basically', 'zero', 'customer', 'service', 'stranded', 'manila', 'help', 'u', 'finally']",basically zero customer service stranded manila help u finally,0.0,1.0
4020,4020,Another entitled politician just like Rep Sheila Jackson Lee who took,,Mon Jul 22 20:23:48 +0000 2019,,0.0,0.338,0.662,0.5574,"['another', 'entitled', 'politician', 'like', 'rep', 'sheila', 'jackson', 'lee', 'took']",another entitled politician like rep sheila jackson lee took,0.0,0.0
358,358,I send an email about my bad experience and you send back a generic response Yet another reason why Ill never fly with you again,,2015-02-23 18:33:02 -0800,"Raleigh, NC",0.231,0.0,0.769,-0.743,"['send', 'email', 'bad', 'experience', 'send', 'back', 'generic', 'response', 'yet', 'another', 'reason', 'ill', 'never', 'fly']",send email bad experience send back generic response yet another reason ill never fly,-0.32,0.513333
2057,2057,Uniteds CEO has decided to outsource and or push out more and more of their skilled and loyal employeeslack of staff,,2015-02-20 23:25:32 -0800,Chicago Illinois Crime.Inc,0.0,0.134,0.866,0.4767,"['uniteds', 'ceo', 'decided', 'outsource', 'push', 'skilled', 'loyal', 'employeeslack', 'staff']",uniteds ceo decided outsource push skilled loyal employeeslack staff,0.416667,0.666667
4121,4121,Hi the inflight features are amazing United is currently not on the list of airlines Ke,,Fri Jun 28 22:25:31 +0000 2019,"Bellevue, WA",0.0,0.32,0.68,0.765,"['hi', 'inflight', 'feature', 'amazing', 'currently', 'list', 'ke']",hi inflight feature amazing currently list ke,0.3,0.65
5224,5224,United Airlines joins the list of corporations and groups withdrawing support for SeaWorld MondayMotivation,,Sun Jul 28 22:11:36 +0000 2019,,0.0,0.314,0.686,0.6705,"['join', 'list', 'corporation', 'group', 'withdrawing', 'support', 'seaworld', 'mondaymotivation']",join list corporation group withdrawing support seaworld mondaymotivation,0.0,0.0
4753,4753,Never fly United Airlines Seriously the worst travel experience Ive ever had,,Tue Jun 25 23:28:33 +0000 2019,,0.475,0.0,0.525,-0.7982,"['never', 'fly', 'seriously', 'worst', 'travel', 'experience', 'ive', 'ever']",never fly seriously worst travel experience ive ever,-0.7,0.95
4346,4346,About had it with United and American Airlines and their inconsideration of Medically fragile passengers who NEED t,,Sun Jul 28 23:44:30 +0000 2019,"Illinois, USA",0.0,0.149,0.851,0.4215,"['american', 'inconsideration', 'medically', 'fragile', 'passenger', 'need']",american inconsideration medically fragile passenger need,0.0,0.25
