Import the data

In [1]:
import os
os.system('pip install nltk')
os.system('pip install stanza')
os.system('pip install emoji')
os.system('pip install -U sentence-transformers')

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('words')
words = set(nltk.corpus.words.words())
import stanza
stanza.download("en")

class Tweet():
    def __init__(self, text, text_clean, token, author_id):
        self.token = token
        self.text = text
        self.text_clean = text_clean
        self.author_id = author_id
        
        self.similiar_tweets = []
        self.similiar_authors = []

        self.sentiments = {}
        self.associations = []

class User():
    def __init__(self, author_id):
        self.author_id = author_id
        self.tweet_tokens = []
        self.similiar_authors = []
        self.author_edges = {}

#Removing Emojis
def remove_emojis(data):
    emoj = re.compile("["
                      u"\U0001F600-\U0001F64F"  # emoticons
                      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                      u"\U0001F680-\U0001F6FF"  # transport & map symbols
                      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                      u"\U00002500-\U00002BEF"  # chinese char
                      u"\U00002702-\U000027B0"
                      u"\U00002702-\U000027B0"
                      u"\U000024C2-\U0001F251"
                      u"\U0001f926-\U0001f937"
                      u"\U00010000-\U0010ffff"
                      u"\u2640-\u2642"
                      u"\u2600-\u2B55"
                      u"\u200d"
                      u"\u23cf"
                      u"\u23e9"
                      u"\u231a"
                      u"\ufe0f"  # dingbats
                      u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', str(data))

def cleaner(text):
    tweet = re.sub("@[A-Za-z0-9]+","",str(text)) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", str(text)) #Remove http links
    tweet = re.sub('[()!?]', ' ', str(text)) #removing punctuation
    tweet = re.sub('\[.*?\]',' ', str(text))
    tweet = " ".join(tweet.split())
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(str(text))
                     if w.lower() in words or not w.isalpha())
    return text

def calculate_sentiments(text, stop_words, nlp):
    txt = text
    sentList = nltk.sent_tokenize(txt) # Splitting the text into sentences
    fcluster = []
    totalfeatureList = []
    finalcluster = []
    featureList = []
    categories = []
    dic = {}

    for line in sentList:
        # Remove links from line
        line = re.sub(r'http\S+|#', '', line)

        # Swap '-', ';', '*' with commas
        line = re.sub(':', '.', line)
        line = re.sub('\n|@', '', line)

        # Remove consecutive punctuation recursively
        r = re.compile(r'([.,/#!$%^&*;:{}=_`~()-])[.,/#!$%^&*;:{}=_`~()-]+')
        line = r.sub(r'\1', line)

        # Replace hashtags with association term
        line = re.sub('#', 'hashtag is ', line)

        try:
            newtaggedList = []
            txt_list = nltk.word_tokenize(line) # Splitting up into words
            taggedList = nltk.pos_tag(txt_list) # Doing Part-of-Speech Tagging to each word

            newwordList = []
            flag = 0
            for i in range(0,len(taggedList)-1):
                if(taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"): # If two consecutive words are Nouns then they are joined together
                    newwordList.append(taggedList[i][0]+taggedList[i+1][0])
                    flag=1
                else:
                    if(flag==1):
                        flag=0
                        continue
                    newwordList.append(taggedList[i][0])
                    if(i==len(taggedList)-2):
                        newwordList.append(taggedList[i+1][0])

            finaltxt = ' '.join(word for word in newwordList)
            new_txt_list = nltk.word_tokenize(finaltxt)
            wordsList = [w for w in new_txt_list if not w in stop_words]
            taggedList = nltk.pos_tag(wordsList)

            doc = nlp(finaltxt) # Object of Stanford NLP Pipeleine

            dep_node = []

            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])

            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]

            # featureList = []
            # categories = []
            for i in taggedList:
                if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                    featureList.append(list(i))
                    totalfeatureList.append(list(i)) # This list will store all the features for every sentence
                    categories.append(i[0])

            for i in featureList:
                filist = []
                for j in dep_node:
                    if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                        if(j[0]==i[0]):
                            filist.append(j[1])
                        else:
                            filist.append(j[0])
                fcluster.append([i[0], filist])

        except IndexError:
            print('IndexError:', line)
            return []

        except AttributeError:
            print('AttributeError')
            return []

    for i in featureList:
        dic[i[0]] = i[1]

    for i in fcluster:
        if(dic[i[0]]=="NN"):
            finalcluster.append(i)

    return finalcluster

def upload_to_output(path, bucket_name, folder_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(folder_name + '/' + path.split('/')[-1])
    blob.upload_from_filename(path)

from google.cloud import storage   
bucket_name = 'sw-airlines-data-hub'

Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.10.31
Collecting stanza
  Using cached stanza-1.4.2-py3-none-any.whl (691 kB)
Collecting emoji
  Using cached emoji-2.2.0-py3-none-any.whl
Installing collected packages: emoji, stanza
Successfully installed emoji-2.2.0 stanza-1.4.2
Collecting sentence-transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting sentencepiece
  Using cached sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting transformers<5.0.0,>=4.6.0
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
Collecting huggingface-hub>=0.4.0
  Using cached huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
Collecting filelock
  Using cached filelock-3.8.0-py3-none-any.whl (10 k

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package words to /home/jupyter/nltk_data...
[nltk_data]   Package words is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 45.0MB/s]                    
2022-11-01 23:20:24 INFO: Downloading default packages for language: en (Eng

In [2]:
pip install --upgrade joblib==1.1.0

Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install top2vec[sentence_encoders]
!pip install top2vec[sentence_transformers]
!pip install --upgrade joblib==1.1.0
!pip install top2vec
!pip install -U sentence-transformers
!pip install -U top2vec

Collecting top2vec[sentence_encoders]
  Using cached top2vec-1.0.27-py3-none-any.whl (25 kB)
Collecting gensim>=4.0.0
  Using cached gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
Collecting wordcloud
  Using cached wordcloud-1.8.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (435 kB)
Collecting hdbscan>=0.8.27
  Using cached hdbscan-0.8.29-cp37-cp37m-linux_x86_64.whl
Collecting umap-learn>=0.5.1
  Using cached umap_learn-0.5.3-py3-none-any.whl
Collecting tensorflow-text
  Using cached tensorflow_text-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
Collecting smart-open>=1.8.1
  Using cached smart_open-6.2.0-py3-none-any.whl (58 kB)
Collecting cython>=0.27
  Using cached Cython-0.29.32-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
Collecting pynndescent>=0.5
  Using cached pynndescent-0.5.8-py3-none-any.whl
Collecting flatbuffers>=2.0
  Using cached flatbuffers-22.10.26

In [4]:
import pandas as pd
import re
import nltk as nltk
from nltk.corpus import stopwords
import multiprocessing
import string
import numpy as np
import matplotlib.pyplot as plt
from top2vec import Top2Vec
import umap
import seaborn as sns
import hdbscan

2022-11-01 23:21:37.525795: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-01 23:21:37.817797: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-01 23:21:39.442165: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-11-01 23:21:39.442355: W tensorflow/strea

In [5]:
df = pd.read_csv('gs://sw-airlines-data-hub/data/processed/sw-airlines-tweets-w-users.csv')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,conversation_id,author_id,created_at_x,in_reply_to_user_id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,...,verified,description,created_at_y,profile_image_url,username,user.followers_count,user.following_count,user.tweet_count,user.listed_count,withheld.country_codes_y
0,0,@SouthwestAir Addressed ?? All I received was ...,1564990393199910913,1556045423298547712,2022-08-31T15:05:56.000Z,7212562.0,0,0,0,0,...,False,,2022-08-06T22:32:19.000Z,https://pbs.twimg.com/profile_images/155604553...,mjmurtha515,0.0,21.0,5.0,0.0,
1,1,@SouthwestAir now I can't get through to PHL b...,1564827636379127809,890234680703946752,2022-08-31T15:03:43.000Z,8.902347e+17,0,0,0,0,...,False,,2017-07-26T15:37:42.000Z,https://pbs.twimg.com/profile_images/143026606...,dbmuniz01,92.0,1064.0,2127.0,0.0,
2,2,@SouthwestAir Case number 48779036,1564990393199910913,1556045423298547712,2022-08-31T15:01:54.000Z,7212562.0,0,1,0,0,...,False,,2022-08-06T22:32:19.000Z,https://pbs.twimg.com/profile_images/155604553...,mjmurtha515,0.0,21.0,5.0,0.0,
3,3,Wanna attend the @NSAIOfficial #songwriterawar...,1564991701684555778,41620367,2022-08-31T15:01:12.000Z,,0,1,0,0,...,False,Experience all that Music City has to offer at...,2009-05-21T16:32:27.000Z,https://pbs.twimg.com/profile_images/603274534...,RenNashville,1839.0,571.0,4435.0,74.0,
4,4,The flight crew is way to excited and happy on...,1564991556473556994,3091670150,2022-08-31T15:00:37.000Z,,0,0,0,0,...,False,YHTBH podcast out everywhere every Monday. Hou...,2015-03-17T00:54:48.000Z,https://pbs.twimg.com/profile_images/157486347...,comedianarthur,1246.0,785.0,7638.0,5.0,


Clean the Tweets

In [7]:
def cleaner(text):
    tweet = re.sub("@[A-Za-z0-9]+","",str(text)) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", str(text)) #Remove http links
    tweet = re.sub('[()!?]', ' ', str(text)) #removing punctuation
    tweet = re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(text)) #Remove special characters
    tweet = re.sub('\[.*?\]',' ', str(text))
    tweet = re.sub(r'\n', '.  ', str(text)) # Remove /n characters to avoid problems with analysis
    tweet = " ".join(tweet.split())
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
#    tweet = " ".join(w for w in nltk.wordpunct_tokenize(str(text))
#         if w.lower() in words or not w.isalpha())
    return text

In [8]:
df['tweet_clean'] = df['text'].str.lower().str.replace(r'[^0-9a-zA-Z\s]+', '', regex=True).apply(cleaner)

In [9]:
df.tweet_clean.head()

0    southwestair addressed  all i received was an ...
1    southwestair now i cant get through to phl bag...
2                    southwestair case number 48779036
3    wanna attend the nsaiofficial songwriterawards...
4    the flight crew is way to excited and happy on...
Name: tweet_clean, dtype: object

In [10]:
#Removing Emojis
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', str(data))

In [11]:
df['tweet_clean'] = df['tweet_clean'].apply(remove_emojis)

In [12]:
#Taking out duplicate tweets with author id and text 
df=df.drop_duplicates(subset= ['author_id', 'text'], keep= 'first')
#Taking out articles with the same title

In [13]:
df=df.drop_duplicates(subset= ['text'], keep= 'first')

In [14]:
df.shape

(61918, 26)

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
import gensim, logging, warnings
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['southwest', 'southwestair','airline', 'airlines', 'flights','travel','flight',
                   'come',  'from', 'use', 'not', 'would', 'say', 'could', 'be', 'know', 'good',
                   'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 
                   'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 
                   'even', 'right', 'line', 'even', 'also', 'may', 'take'])

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [17]:
#Remove Stop Words from topics, just for a try
df['tweet_without_stopwords'] = df['tweet_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

Begin the Top2Vec

In [18]:
model = Top2Vec(df['tweet_clean'].values, embedding_model='universal-sentence-encoder')

2022-11-01 23:21:54,331 - top2vec - INFO - Pre-processing documents for training
2022-11-01 23:21:54,331 : INFO : Pre-processing documents for training
2022-11-01 23:22:04,441 - top2vec - INFO - Downloading universal-sentence-encoder model
2022-11-01 23:22:04,441 : INFO : Downloading universal-sentence-encoder model
2022-11-01 23:22:15.732043: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-11-01 23:22:15.732103: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-01 23:22:15.732145: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (06a7ce61

In [19]:
model2 = Top2Vec(df['tweet_without_stopwords'].values, embedding_model='universal-sentence-encoder')

2022-11-01 23:24:08,920 - top2vec - INFO - Pre-processing documents for training
2022-11-01 23:24:08,920 : INFO : Pre-processing documents for training
2022-11-01 23:24:15,304 - top2vec - INFO - Downloading universal-sentence-encoder model
2022-11-01 23:24:15,304 : INFO : Downloading universal-sentence-encoder model
2022-11-01 23:24:21,954 - top2vec - INFO - Creating joint document/word embedding
2022-11-01 23:24:21,954 : INFO : Creating joint document/word embedding
2022-11-01 23:24:35,566 - top2vec - INFO - Creating lower dimension embedding of documents
2022-11-01 23:24:35,566 : INFO : Creating lower dimension embedding of documents
2022-11-01 23:25:32,650 - top2vec - INFO - Finding dense areas of documents
2022-11-01 23:25:32,650 : INFO : Finding dense areas of documents
2022-11-01 23:25:37,330 - top2vec - INFO - Finding topics
2022-11-01 23:25:37,330 : INFO : Finding topics


In [20]:
model.get_num_topics()

334

In [21]:
#Per the nature of Top2Vec, we actually have a WORSE fit (i.e. more topics) when we remove stop words! 

#This is because stop words appear in almost all tweets, therefore they will be equidistant from all topics and are not the nearst word to any topic.
model2.get_num_topics()

405

In [27]:
topic_mapping = model.hierarchical_topic_reduction(num_topics=16)

In [30]:
model.topic_words_reduced

16

In [36]:
model.search_topics(keywords=["cancel"], num_topics=3)

([array(['cancelled', 'cancellations', 'canceled', 'cancelling',
         'canceling', 'cancellation', 'cancel', 'inflight', 'flown',
         'airline', 'flight', 'flew', 'airasia', 'qantas', 'allegiant',
         'westjet', 'airfare', 'delaying', 'faa', 'jetblue', 'flying',
         'aviation', 'takeoff', 'airplanes', 'ryanair', 'cathay',
         'airfares', 'rebook', 'layover', 'easyjet', 'turbulence',
         'lufthansa', 'refunded', 'airplane', 'airlines', 'icao', 'delayed',
         'aircraft', 'booked', 'airways', 'reimbursement', 'pilot',
         'airbus', 'refund', 'swa', 'jet', 'sfo', 'pilots', 'planes',
         'flights'], dtype='<U15'),
  array(['airline', 'airasia', 'inflight', 'qantas', 'westjet', 'airlines',
         'jetblue', 'lufthansa', 'cathay', 'airways', 'refunded', 'easyjet',
         'cancellations', 'airfares', 'allegiant', 'flight', 'ryanair',
         'airfare', 'reimbursement', 'cancellation', 'klm', 'refund',
         'airbus', 'passengers', 'rebook', '