In [1]:
import os
os.system('pip install nltk')
os.system('pip install stanza')
os.system('pip install emoji')
os.system('pip install -U sentence-transformers')

import pandas as pd
import numpy as np
# from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('words')
words = set(nltk.corpus.words.words())
import stanza
stanza.download("en")

class Tweet():
    def __init__(self, text, text_clean, token, author):
        self.token = token
        self.text = text
        self.text_clean = text_clean
        self.author = author

        self.sentiments = {}
        self.associations = []

class User():
    def __init__(self, author_id, handle, tweets):
        self.author_id = author_id
        self.handle = handle
        self.tweets = {}
        for x in tweets: self.tweets[x.token] = x

#Removing Emojis
def remove_emojis(data):
    emoj = re.compile("["
                      u"\U0001F600-\U0001F64F"  # emoticons
                      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                      u"\U0001F680-\U0001F6FF"  # transport & map symbols
                      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                      u"\U00002500-\U00002BEF"  # chinese char
                      u"\U00002702-\U000027B0"
                      u"\U00002702-\U000027B0"
                      u"\U000024C2-\U0001F251"
                      u"\U0001f926-\U0001f937"
                      u"\U00010000-\U0010ffff"
                      u"\u2640-\u2642"
                      u"\u2600-\u2B55"
                      u"\u200d"
                      u"\u23cf"
                      u"\u23e9"
                      u"\u231a"
                      u"\ufe0f"  # dingbats
                      u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', str(data))

def cleaner(text):
    tweet = re.sub("@[A-Za-z0-9]+","",str(text)) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", str(text)) #Remove http links
    tweet = re.sub('[()!?]', ' ', str(text)) #removing punctuation
    tweet = re.sub('\[.*?\]',' ', str(text))
    tweet = " ".join(tweet.split())
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(str(text))
                     if w.lower() in words or not w.isalpha())
    return text

def calculate_sentiments(text, stop_words, nlp):
    txt = text
    sentList = nltk.sent_tokenize(txt) # Splitting the text into sentences
    fcluster = []
    totalfeatureList = []
    finalcluster = []
    featureList = []
    categories = []
    dic = {}

    for line in sentList:
        # Remove links from line
        line = re.sub(r'http\S+|#', '', line)

        # Swap '-', ';', '*' with commas
        line = re.sub(':', '.', line)
        line = re.sub('\n|@', '', line)

        # Remove consecutive punctuation recursively
        r = re.compile(r'([.,/#!$%^&*;:{}=_`~()-])[.,/#!$%^&*;:{}=_`~()-]+')
        line = r.sub(r'\1', line)

        # Replace hashtags with association term
        line = re.sub('#', 'hashtag is ', line)

        try:
            newtaggedList = []
            txt_list = nltk.word_tokenize(line) # Splitting up into words
            taggedList = nltk.pos_tag(txt_list) # Doing Part-of-Speech Tagging to each word

            newwordList = []
            flag = 0
            for i in range(0,len(taggedList)-1):
                if(taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"): # If two consecutive words are Nouns then they are joined together
                    newwordList.append(taggedList[i][0]+taggedList[i+1][0])
                    flag=1
                else:
                    if(flag==1):
                        flag=0
                        continue
                    newwordList.append(taggedList[i][0])
                    if(i==len(taggedList)-2):
                        newwordList.append(taggedList[i+1][0])

            finaltxt = ' '.join(word for word in newwordList)
            new_txt_list = nltk.word_tokenize(finaltxt)
            wordsList = [w for w in new_txt_list if not w in stop_words]
            taggedList = nltk.pos_tag(wordsList)

            doc = nlp(finaltxt) # Object of Stanford NLP Pipeleine

            dep_node = []

            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])

            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]

            # featureList = []
            # categories = []
            for i in taggedList:
                if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                    featureList.append(list(i))
                    totalfeatureList.append(list(i)) # This list will store all the features for every sentence
                    categories.append(i[0])

            for i in featureList:
                filist = []
                for j in dep_node:
                    if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                        if(j[0]==i[0]):
                            filist.append(j[1])
                        else:
                            filist.append(j[0])
                fcluster.append([i[0], filist])

        except IndexError:
            print('IndexError:', line)
            return []

        except AttributeError:
            print('AttributeError')
            return []

    for i in featureList:
        dic[i[0]] = i[1]

    for i in fcluster:
        if(dic[i[0]]=="NN"):
            finalcluster.append(i)

    return finalcluster



Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.10.31
Collecting stanza
  Using cached stanza-1.4.2-py3-none-any.whl (691 kB)
Collecting emoji
  Using cached emoji-2.2.0-py3-none-any.whl
Installing collected packages: emoji, stanza
Successfully installed emoji-2.2.0 stanza-1.4.2
Collecting sentence-transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting huggingface-hub>=0.4.0
  Using cached huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
Collecting transformers<5.0.0,>=4.6.0
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
Collecting sentencepiece
  Using cached sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting filelock
  Using cached filelock-3.8.0-py3-none-any.whl (10 k

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package words to /home/jupyter/nltk_data...
[nltk_data]   Package words is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 40.9MB/s]                    
2022-11-25 18:22:39 INFO: Downloading default packages for language: en (Eng

In [2]:
!gsutil cp gs://sw-airlines-data-hub/data/processed/twt2twt_w_score_w_sentiments.pkl ./

Copying gs://sw-airlines-data-hub/data/processed/twt2twt_w_score_w_sentiments.pkl...
- [1 files][ 64.6 MiB/ 64.6 MiB]                                                
Operation completed over 1 objects/64.6 MiB.                                     


In [3]:
import pickle
with open('twt2twt_w_score_w_sentiments.pkl', 'rb') as f:
    twts = pickle.load(f)

In [4]:
twts[0].associations[0][1:]

[['thanks']]

In [5]:
sentiments_dict = {}

for t in twts:
    for a in t.associations:
        subject = a[0]
        try: [sentiments_dict[subject].append(x) for x in a[1]]
        except KeyError: 
            sentiments_dict[subject] = []
            [sentiments_dict[subject].append(x) for x in a[1]]

In [6]:
!pip install --upgrade vaderSentiment

Collecting vaderSentiment
  Using cached vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [7]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [8]:
analyzer = SentimentIntensityAnalyzer()

sentences = [x.text_clean for x in twts]
sentiments = {}

In [9]:
aspect_df = pd.DataFrame(index=sentiments_dict.keys(), columns=['pos','neg','neu','compound', 'count'])

for x in sentiments_dict.keys():
    vs = analyzer.polarity_scores(' '.join(sentiments_dict[x]))
    n = len(sentiments_dict[x]) 
    sentiments[x] = aspect_df.loc[x] = ([vs[x] for x in vs.keys()] + [n])

In [10]:
aspect_df

Unnamed: 0,pos,neg,neu,compound,count
southwestair,0.145,0.603,0.252,1.0,18111
flight,0.181,0.707,0.111,-0.9999,10834
airline,0.175,0.601,0.224,0.9998,4144
decide,0.187,0.702,0.111,-0.0772,15
egainmasouthwestair,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...
ampsustainability,0.0,1.0,0.0,0.0,2
sustainabilityexpert,0.0,1.0,0.0,0.0,1
experteduardomariz,0.0,1.0,0.0,0.0,5
woce,0.0,1.0,0.0,0.0,2


In [11]:
aspect_df[aspect_df.compound > 0.50].sort_values(by='count' , ascending=False).head(30)

Unnamed: 0,pos,neg,neu,compound,count
southwestair,0.145,0.603,0.252,1.0,18111
travel,0.046,0.89,0.064,0.9967,4500
airline,0.175,0.601,0.224,0.9998,4144
travelaviation,0.006,0.956,0.039,0.9985,3230
aviation,0.006,0.964,0.03,0.9963,2934
time,0.076,0.82,0.103,0.9983,2833
cruise,0.015,0.917,0.068,0.9991,2487
air,0.034,0.909,0.057,0.9914,2017
fly,0.05,0.838,0.112,0.9987,1848
help,0.061,0.801,0.138,0.9992,1744


In [12]:
from collections import Counter
cnt = Counter()

top10_sentiments_per_aspect = {}

for x in sentiments_dict.keys(): top10_sentiments_per_aspect[x] = Counter(sentiments_dict[x]).most_common(10)

In [13]:
top10_sentiments_per_aspect['customerservice']

[('southwestair', 39),
 ('airlines', 38),
 ('great', 34),
 ('worst', 32),
 ('terrible', 24),
 ('southwestaircustomer', 24),
 ('horrible', 24),
 ('called', 23),
 ('poor', 23),
 ('best', 21)]

In [39]:
for x in twts:
    for a in x.associations:
        if 'customerservice' in a and 'called' in a[1]:
            print(x.text_clean)
            print('----------')

southwestair i have called customer service 3 times about an issue with a voucher and i keep getting the same response and no resolution who can i contact to get my voucher
----------
southwestair thought for sure this was a mistake called your customer service  to get it straightened out and quentin proceeds to talk to me like im an idiot i ask to speak to his manager he says it will be at least 90 min so i will give up
----------
southwestair i purchased my flight about 5 months ago i added early bird i did my part my flight leaves tomorrow and i got b53 position seems like early bird is a scam i called customer service and was told that swa is sorry for the inconvenience
----------
southwestair earlybird ck in on app didnt work for both of my flights  called customer service both times and was put on hold 40 minutes them disconnected where is the service you so proudly advertise all the time
----------
southwestair my bag is in san jose i cant get there and cant pay for the shipping

In [14]:
%%time
tweet_sentiments = {}
for x in twts:
    vs = analyzer.polarity_scores(x.text_clean)
    tweet_sentiments[x.token] = ([vs[y] for y in vs.keys()] + [x.text_clean])

CPU times: user 17.2 s, sys: 30.3 ms, total: 17.2 s
Wall time: 17.2 s


In [19]:
tweet_sentiments[twts[0].token]

[0.0,
 1.0,
 0.0,
 0.0,
 'southwestair addressed  all i received was an automated email saying customer relations would contact me over a month ago']

In [15]:
tweet_aspect_df = pd.DataFrame(index=tweet_sentiments.keys(), columns=['pos','neg','neu','compound', 'text_clean'])

for token in tweet_sentiments.keys(): tweet_aspect_df.loc[token] = tweet_sentiments[token]
tweet_aspect_df['is_neg'] = tweet_aspect_df.compound.apply(lambda x: True if x < -0.50 else False)
tweet_aspect_df['is_pos'] = tweet_aspect_df.compound.apply(lambda x: True if x > 0.50 else False)
tweet_aspect_df['tweet_token'] = tweet_aspect_df.index
tweet_aspect_df.head()

Unnamed: 0,pos,neg,neu,compound,text_clean,is_neg,is_pos,tweet_token
1589998047110782976,0.289,0.412,0.299,0.0258,southwestair thanks ill do that now,False,False,1589998047110782976
1589997663336148993,0.0,1.0,0.0,0.0,southwestair we have a flight from norfolk to ...,False,False,1589997663336148993
1589996983066189824,0.0,1.0,0.0,0.0,aviatorjlat egainma southwestair usdot nonuttr...,False,False,1589996983066189824
1589996585316147200,0.079,0.842,0.079,0.0,kaoconnor southwestair denairport midwayairpor...,False,False,1589996585316147200
1589996464733769728,0.071,0.612,0.316,0.7003,good morning southwestair do we get free alcoh...,False,True,1589996464733769728


In [16]:
!gsutil cp gs://sw-airlines-data-hub/data/processed/tweet2tweet_df3.pickle ./

Copying gs://sw-airlines-data-hub/data/processed/tweet2tweet_df3.pickle...
- [1 files][ 68.5 MiB/ 68.5 MiB]                                                
Operation completed over 1 objects/68.5 MiB.                                     


In [17]:
t2t = pd.read_pickle('tweet2tweet_df3.pickle')
t2t.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199727 entries, 6 to 1111606
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   TweetTokenA      199727 non-null  object 
 1   TweetTokenB      199727 non-null  object 
 2   SimilarityScore  199727 non-null  float64
 3   AspectsA         199727 non-null  object 
 4   AspectsB         199727 non-null  object 
 5   AuthorTokenA     199727 non-null  int64  
 6   UserA            198814 non-null  object 
 7   text_x           199727 non-null  object 
 8   a_created_at     199727 non-null  object 
 9   a_rt_cnt         199727 non-null  int64  
 10  a_reply_cnt      199727 non-null  int64  
 11  a_like_count     199727 non-null  int64  
 12  a_qt_count       199727 non-null  int64  
 13  AuthorTokenB     199727 non-null  int64  
 14  UserB            199313 non-null  object 
 15  text_y           199727 non-null  object 
 16  b_created_at     199727 non-null  obj

In [18]:
# Match A Sentiments
t2t = t2t.merge(tweet_aspect_df, left_on='TweetTokenA', right_on='tweet_token')
t2t = t2t.rename(columns={'text_clean': 'text_clean_A',
                          'pos': 'A_pos_score', 
                          'neg' : 'A_neg_score', 
                          'neu':'A_neutral_score', 
                          'compound':'A_compound_score', 
                          'is_neg':'A_is_neg',
                          'is_pos':'A_is_pos'})
t2t = t2t.drop(columns=['tweet_token'])

# Match B Sentiments
t2t = t2t.merge(tweet_aspect_df, left_on='TweetTokenB', right_on='tweet_token')
t2t = t2t.rename(columns={'text_clean': 'text_clean_B',
                          'pos': 'B_pos_score', 
                          'neg' : 'B_neg_score', 
                          'neu':'B_neutral_score', 
                          'compound':'B_compound_score', 
                          'is_neg':'B_is_neg',
                          'is_pos':'B_is_pos'})

t2t = t2t.drop(columns=['tweet_token'])
len(t2t)

199727

In [19]:
t2t.head()

Unnamed: 0,TweetTokenA,TweetTokenB,SimilarityScore,AspectsA,AspectsB,AuthorTokenA,UserA,text_x,a_created_at,a_rt_cnt,...,text_clean_A,a_is_neg,a_is_pos,pos_score_b,neg_score_b,neutral_score_b,compound_score_b,text_clean_B,b_is_neg,b_is_pos
0,1583825720316305408,1589953501517737985,0.890007,"[['golf', ['clubs']], ['golftrip', ['triptoday...",[],215716356,The_Real_FKD,So I fly with my golf clubs pretty consistentl...,2022-10-22T14:20:52.000Z,0,...,so i fly with my golf clubs pretty consistentl...,False,False,0.0,1.0,0.0,0.0,ryanfoxy24 aircanada southwestair im going to ...,False,False
1,1589953491040366592,1592671061837221889,0.892296,"[['southwestairshame', ['control']], ['control...","[['seat', ['own', 'choosing']], ['idea', ['gre...",1416721475424464898,JimMcneese,@SouthwestAir Shame on you once again.. can’t ...,2022-11-08T12:10:26.000Z,0,...,southwestair shame on you once again cant cont...,False,True,0.087,0.787,0.126,0.0387,neveragain i used to love traveling on southwe...,False,False
2,1576613811477393409,1592671061837221889,0.894714,"[['seatingsystem', ['dislike']], ['seat', ['ch...","[['seat', ['own', 'choosing']], ['idea', ['gre...",1126752147620544512,JoshuaBigler,I love @SouthwestAir’s customer service and la...,2022-10-02T16:43:19.000Z,0,...,i love southwestairs customer service and lack...,False,False,0.087,0.787,0.126,0.0387,neveragain i used to love traveling on southwe...,False,False
3,1569153634897924096,1592671061837221889,0.882039,"[['southwestair', ['wanted']], ['let', ['wante...","[['seat', ['own', 'choosing']], ['idea', ['gre...",493689130,WaifusaurusRex,@SouthwestAir just wanted to let someone in yo...,2022-09-12T02:39:14.000Z,0,...,southwestair just wanted to let someone in you...,True,False,0.087,0.787,0.126,0.0387,neveragain i used to love traveling on southwe...,False,False
4,1591637809877901313,1592671061837221889,0.894431,"[['feel', ['used', 'valued', 'somehow']], ['pa...","[['seat', ['own', 'choosing']], ['idea', ['gre...",1337825236973056001,TravelsWithMic2,I really miss the days when #airlines cared ab...,2022-11-13T03:43:19.000Z,1,...,i really miss the days when airlines cared abo...,False,False,0.087,0.787,0.126,0.0387,neveragain i used to love traveling on southwe...,False,False


In [20]:
t2t.to_pickle('tweet2tweet_df4.pickle')

def upload_to_output(path, bucket_name, folder_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(folder_name + '/' + path.split('/')[-1])
    blob.upload_from_filename(path)

from google.cloud import storage   
bucket_name = 'sw-airlines-data-hub'
upload_to_output('tweet2tweet_df4.pickle', bucket_name, 'data/processed')