In [None]:
import os
import subprocess
import sys

# Install the notebook's dependencies through the interpreter that is running
# this kernel (`sys.executable -m pip`). A bare `pip` launched via os.system is
# resolved from PATH and may install into a different Python environment than
# the one the imports below are loaded from.
for _pip_args in (['nltk'], ['stanza'], ['emoji'], ['-U', 'sentence-transformers']):
    _pip_status = subprocess.call([sys.executable, '-m', 'pip', 'install'] + _pip_args)
    if _pip_status != 0:
        # Keep going (best effort, as before) but make the failure visible.
        print('pip install failed for', _pip_args[-1], '- exit code', _pip_status)

import pandas as pd
import numpy as np
# from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import ssl

# NOTE(review): this swaps the default HTTPS context factory for the
# *unverified* one, i.e. certificate checks are disabled for the whole process.
# It is only applied when the running Python exposes the private helper.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch every NLTK resource this notebook relies on.
for _nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger',
                       'vader_lexicon', 'words'):
    nltk.download(_nltk_resource)

# Vocabulary set looked up by cleaner() when filtering tokens.
words = set(nltk.corpus.words.words())

# English Stanza models, needed before stanza.Pipeline('en') can be built.
import stanza
stanza.download("en")

class Tweet():
    """Container for one tweet and the annotations attached to it.

    Attributes:
        token: identifier of the tweet (used as a dictionary key elsewhere).
        text: tweet text as passed in.
        text_clean: cleaned version of the text as passed in.
        author: author reference as passed in.
        sentiments: dict, created empty.
        associations: list, created empty.
    """

    def __init__(self, text, text_clean, token, author):
        # Values supplied by the caller are stored untouched.
        self.token, self.text, self.text_clean, self.author = (
            token, text, text_clean, author)

        # Each instance gets its own fresh annotation containers.
        self.sentiments, self.associations = {}, []

class User():
    """An author together with the tweets attributed to them.

    Attributes:
        author_id: identifier of the author, stored as given.
        handle: handle of the author, stored as given.
        tweets: dict mapping each tweet's ``token`` to the tweet object; when
            two tweets share a token the later one wins.
    """

    def __init__(self, author_id, handle, tweets):
        self.author_id = author_id
        self.handle = handle
        # Index the supplied tweets by their token.
        self.tweets = {tweet.token: tweet for tweet in tweets}

#Removing Emojis
def remove_emojis(data):
    """Return ``str(data)`` with emoji and many other symbol characters removed.

    Every run of characters that falls in one of the ranges below is deleted.
    Note that the ranges are far broader than emoji alone: U+24C2-U+1F251 and
    U+10000-U+10FFFF also cover CJK text and all supplementary-plane characters.
    """
    char_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        u"\U00002500-\U00002BEF",  # box drawing, shapes, misc symbols, arrows
        u"\U00002702-\U000027B0",  # dingbats
        u"\U00002702-\U000027B0",  # (same range listed twice in the pattern)
        u"\U000024C2-\U0001F251",  # very wide span, includes CJK
        u"\U0001f926-\U0001f937",
        u"\U00010000-\U0010ffff",  # all supplementary planes
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u200d",                 # zero-width joiner
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",                 # variation selector-16
        u"\u3030",
    )
    symbol_run = re.compile("[" + "".join(char_ranges) + "]+", re.UNICODE)
    return symbol_run.sub('', str(data))

def cleaner(text):
    """Normalise a raw tweet and return the cleaned string.

    Steps, each applied to the result of the previous one:
      1. drop @mentions,
      2. drop http(s):// and www. links,
      3. turn ``( ) ! ?`` and ``[...]`` groups into spaces,
      4. collapse whitespace,
      5. drop ``#`` (keeping the hashtag text) and turn ``_`` into a space,
      6. keep only tokens that are in the module-level ``words`` vocabulary
         (case-insensitive) or are not purely alphabetic (numbers, punctuation).

    Previously every step restarted from ``str(text)`` and the function returned
    its argument unchanged, so none of the cleaning took effect.

    NOTE(review): step 6 removes alphabetic tokens missing from the vocabulary
    (brand names, slang, hashtags such as "avgeek"); confirm that is wanted
    before relying on keyword searches over the cleaned text.

    Args:
        text: the tweet; converted with ``str()`` first.

    Returns:
        The cleaned tweet as a ``str``.
    """
    tweet = str(text)
    tweet = re.sub(r"@\w+", "", tweet)                      # remove @mentions
    tweet = re.sub(r"(?:https?://|www\.)\S+", "", tweet)    # remove links
    tweet = re.sub(r"[()!?]", " ", tweet)                   # remove punctuation
    tweet = re.sub(r"\[.*?\]", " ", tweet)                  # remove [bracketed] text
    tweet = " ".join(tweet.split())
    tweet = tweet.replace("#", "").replace("_", " ")  # drop '#' but keep the tag text
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet)
                     if w.lower() in words or not w.isalpha())
    return tweet

def _join_consecutive_nouns(tagged):
    """Return the words of ``tagged`` with adjacent "NN" tokens concatenated.

    ``tagged`` is a list of ``(word, tag)`` pairs. A pair of neighbouring "NN"
    tokens becomes one joined word; the second noun of a merged pair is not
    emitted again on its own. The sentence's final token is always kept unless
    it was itself merged into the preceding noun.
    """
    if len(tagged) == 1:
        # A lone token has no neighbour to merge with; keep it as-is.
        return [tagged[0][0]]

    merged = []
    already_merged = False          # True when tagged[i] was consumed by a merge
    last_pair = len(tagged) - 2
    for i in range(len(tagged) - 1):
        if tagged[i][1] == "NN" and tagged[i + 1][1] == "NN":
            merged.append(tagged[i][0] + tagged[i + 1][0])
            already_merged = True
            continue
        if already_merged:
            already_merged = False
        else:
            merged.append(tagged[i][0])
        if i == last_pair:
            # The trailing token must be emitted even right after a merge;
            # the earlier implementation skipped it in that case.
            merged.append(tagged[i + 1][0])
    return merged


def calculate_sentiments(text, stop_words, nlp):
    """Extract noun features and the words related to them by dependency edges.

    The text is split into sentences; each sentence is lightly normalised,
    POS-tagged with NLTK, adjacent nouns are joined, and the result is parsed
    with ``nlp``. Words tagged JJ, NN, JJR, NNS or RB become features, and for
    every feature the words linked to it through a fixed set of dependency
    relations are collected.

    Args:
        text: the text to analyse.
        stop_words: collection of words excluded before feature tagging.
        nlp: callable returning a parsed document exposing
            ``sentences[0].dependencies`` as ``(head, relation, word)`` triples
            (a Stanza pipeline in this notebook).

    Returns:
        A list of ``[feature, related_words]`` pairs for features whose
        (last seen) tag is "NN". An empty list is returned as soon as any
        sentence raises ``IndexError`` or ``AttributeError`` during processing.

    NOTE(review): features accumulate across sentences, so features from
    earlier sentences are matched again against each later sentence's
    dependencies and appear multiple times in the result. That behaviour is
    preserved here; confirm whether it is intended.
    """
    # Collapses a run of punctuation characters down to its first character.
    repeated_punct = re.compile(r'([.,/#!$%^&*;:{}=_`~()-])[.,/#!$%^&*;:{}=_`~()-]+')
    feature_tags = ('JJ', 'NN', 'JJR', 'NNS', 'RB')
    relations = ("nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod",
                 "neg", "prep_of", "acomp", "xcomp", "compound")

    fcluster = []
    featureList = []

    for line in nltk.sent_tokenize(text):  # one iteration per sentence
        # Remove links and '#' characters (hashtag text itself is kept).
        line = re.sub(r'http\S+|#', '', line)

        # Turn ':' into '.', and drop newlines and '@'.
        line = re.sub(':', '.', line)
        line = re.sub('\n|@', '', line)

        # Collapse consecutive punctuation.
        line = repeated_punct.sub(r'\1', line)

        try:
            taggedList = nltk.pos_tag(nltk.word_tokenize(line))
            newwordList = _join_consecutive_nouns(taggedList)

            finaltxt = ' '.join(newwordList)
            wordsList = [w for w in nltk.word_tokenize(finaltxt) if w not in stop_words]
            taggedList = nltk.pos_tag(wordsList)

            doc = nlp(finaltxt)

            # [dependent word, head id, relation] for the first parsed sentence.
            dep_node = []
            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])

            # Replace each non-root head id (1-based) with the word at that
            # position; raises IndexError if the parser's tokenisation is
            # longer than newwordList.
            for node in dep_node:
                if int(node[1]) != 0:
                    node[1] = newwordList[int(node[1]) - 1]

            for word, tag in taggedList:
                if tag in feature_tags:
                    featureList.append([word, tag])

            for feature in featureList:
                filist = []
                for dependent, head, relation in dep_node:
                    if relation not in relations:
                        continue
                    if dependent == feature[0]:
                        filist.append(head)
                    elif head == feature[0]:
                        filist.append(dependent)
                fcluster.append([feature[0], filist])

        except IndexError:
            # Tokenisation mismatch or nothing to parse: give up on this text.
            return []

        except AttributeError:
            print('AttributeError')
            return []

    # Last tag seen for each feature word.
    dic = {word: tag for word, tag in featureList}

    return [cluster for cluster in fcluster if dic[cluster[0]] == "NN"]



Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.10.31
Collecting stanza
  Using cached stanza-1.4.2-py3-none-any.whl (691 kB)
Collecting emoji
  Using cached emoji-2.2.0-py3-none-any.whl
Collecting torch>=1.3.0
  Using cached torch-1.13.0-cp37-cp37m-manylinux1_x86_64.whl (890.2 MB)
Collecting nvidia-cublas-cu11==11.10.3.66
  Using cached nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
Collecting nvidia-cudnn-cu11==8.5.0.96
  Using cached nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
Collecting nvidia-cuda-runtime-cu11==11.7.99
  Using cached nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99
  Using cached nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package words to /home/jupyter/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
# Source CSV of tweets joined with user data, read straight from the bucket.
TWEETS_CSV_URI = 'gs://sw-airlines-data-hub/data/processed/sw-airlines-tweets-w-users.csv'

df = pd.read_csv(TWEETS_CSV_URI)
df.head()

In [None]:
# Build the cleaned-text column.
# cleaner() and remove_emojis() must see the raw text: their rules look for
# '@', '://', '#', '[...]' and emoji, all of which the alphanumeric filter
# below deletes. Running that filter first (as before) left them nothing to
# match, so the text-level cleaning now runs first and the lower-casing /
# character filter runs last.
df['tweet_clean'] = (
    df['text']
    .apply(cleaner)
    .apply(remove_emojis)
    .str.lower()
    .str.replace(r'[^0-9a-zA-Z\s]+', '', regex=True)
)
X = df[['tweet_token', 'author_id', 'tweet_clean', 'text']]
X.head()

In [None]:
# English stop words as a set, passed to calculate_sentiments() for filtering.
stop_words = set(stopwords.words('english'))

# English Stanza pipeline, passed to calculate_sentiments() as `nlp`.
nlp = stanza.Pipeline('en')

In [None]:
# Keywords used to select the tweets of interest.
topics = ['book', 'cancel', 'call', 'support', 'delay', 'change',
          'never', 'fear', 'pandemic', 'group', 'pilot', 'mask', 'avgeek', 'technology', 'pay']

# For each keyword, keep the rows whose cleaned text contains it and report
# how many matched. The filter is evaluated once per keyword (it used to be
# computed twice: once to store, once to count).
dfs = []
for t in topics:
    topic_rows = X[X.tweet_clean.str.contains(t, regex=False)]
    dfs.append(topic_rows)
    print(t, len(topic_rows))

# Stack the per-topic frames, index by tweet token while keeping the token as a
# column, and drop rows that matched more than one keyword.
x0 = pd.concat(dfs).set_index('tweet_token', drop=False)
x0 = x0[['tweet_token', 'author_id', 'text', 'tweet_clean']]
x0 = x0.drop_duplicates()
len(x0)

In [None]:
# Preview the first rows of the topic-filtered, de-duplicated frame.
x0.head()

In [7]:
!gsutil cp gs://sw-airlines-data-hub/data/processed/twt2twt_w_score.pkl ./
!gsutil cp gs://sw-airlines-data-hub/data/processed/auth2auth_w_score.pkl ./

import pickle
with open('twt2twt_w_score.pkl', 'rb') as f:
    tweet_objs = pickle.load(f)
    
with open('auth2auth_w_score.pkl', 'rb') as f:
    auth_objs = pickle.load(f)

tweets = {} 
for x in tweet_objs: tweets[x.token] = x
tweet_tokens = [x.token for x in tweet_objs]
    
authors = {} 
for x in auth_objs: authors[x.author_id] = x
unique_authors = [x.author_id for x in auth_objs]

Copying gs://sw-airlines-data-hub/data/processed/twt2twt_w_score.pkl...
/ [1 files][ 34.0 MiB/ 34.0 MiB]                                                
Operation completed over 1 objects/34.0 MiB.                                     
Copying gs://sw-airlines-data-hub/data/processed/auth2auth_w_score.pkl...
/ [1 files][  6.5 MiB/  6.5 MiB]                                                
Operation completed over 1 objects/6.5 MiB.                                      


In [None]:
%%time
sentiments = {}
length = len(tweet_tokens)
import datetime
import math
completion_times = []
i=0

for tkn in tweet_tokens:
    t1 = datetime.datetime.now()
    tweet = tweets[tkn].text_clean
    ttoken = tkn
    associations = calculate_sentiments(tweet, stop_words, nlp).copy()
    [tweets[tkn].associations.append(a) for a in associations.copy() if a[1]]
    i+=1

    # Calculate performance
    t2 = datetime.datetime.now()
    delta = t2 - t1
    seconds = delta.total_seconds()
    completion_times.append(seconds)
    avg_s = round(sum(completion_times)/len(completion_times),2)
    estimated_hours_left = (avg_s)*(length-i)/3600
    hours_left = math.floor(estimated_hours_left)
    minutes_left = math.floor((estimated_hours_left - hours_left)*60)
    
    if i % 1000 == 0: 
        print('Itr', i, 'Duration:', seconds, "  ", 'Estimated Remaining Time: ',
              hours_left,'hours :',minutes_left,'minutes.')
        # break

total_hours_needed = avg_s*length / 3600

Itr 1000 Duration: 0.176777    Estimated Remaining Time:  4 hours : 4 minutes.
Itr 2000 Duration: 0.279321    Estimated Remaining Time:  4 hours : 11 minutes.
Itr 3000 Duration: 0.078934    Estimated Remaining Time:  3 hours : 57 minutes.
Itr 4000 Duration: 0.155101    Estimated Remaining Time:  3 hours : 53 minutes.
Itr 5000 Duration: 0.195343    Estimated Remaining Time:  3 hours : 49 minutes.
Itr 6000 Duration: 0.104038    Estimated Remaining Time:  3 hours : 46 minutes.
Itr 7000 Duration: 0.201705    Estimated Remaining Time:  3 hours : 42 minutes.
Itr 8000 Duration: 0.160117    Estimated Remaining Time:  3 hours : 38 minutes.
Itr 9000 Duration: 0.227891    Estimated Remaining Time:  3 hours : 35 minutes.
Itr 10000 Duration: 0.126333    Estimated Remaining Time:  3 hours : 31 minutes.
Itr 11000 Duration: 0.105798    Estimated Remaining Time:  3 hours : 27 minutes.
Itr 12000 Duration: 0.170477    Estimated Remaining Time:  3 hours : 24 minutes.
Itr 13000 Duration: 0.297329    Estima

In [None]:
import pickle

# Snapshot the (now annotated) tweet objects and write them to a local pickle.
tweet_objs = list(tweets.values())

with open('twt2twt_w_score_w_sentiments.pkl', 'wb') as out_file:
    pickle.dump(tweet_objs, out_file)

In [None]:
def upload_to_output(path, bucket_name, folder_name):
    """Upload a local file to a Cloud Storage bucket under ``folder_name``.

    The object is named ``<folder_name>/<file name of path>``. A trailing slash
    on ``folder_name`` no longer produces a double slash, and an empty
    ``folder_name`` places the object at the bucket root instead of creating a
    name with a leading slash.

    Args:
        path: local path of the file to upload.
        bucket_name: name of the destination bucket.
        folder_name: object-name prefix ("folder") inside the bucket.
    """
    import os
    import posixpath

    # Object names always use '/', whatever the local path separator is.
    blob_name = posixpath.join(folder_name, os.path.basename(path))

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    bucket.blob(blob_name).upload_from_filename(path)

from google.cloud import storage

# Push the annotated tweet pickle to the processed-data folder of the bucket.
bucket_name = 'sw-airlines-data-hub'
upload_to_output(
    path='twt2twt_w_score_w_sentiments.pkl',
    bucket_name=bucket_name,
    folder_name='data/processed',
)