In [None]:
import os
import subprocess
import sys

# Install the notebook's third-party dependencies into the environment of the
# interpreter that is running this kernel.  A bare `pip` on PATH may belong to
# a different Python installation, so pip is invoked as `<this python> -m pip`.
# Installation stays best-effort: a failure is reported but does not stop the
# notebook, because the packages may already be present.
for _pip_args in (
    ['nltk'],
    ['stanza'],
    ['emoji'],
    ['-U', 'sentence-transformers'],
):
    _pip_status = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', *_pip_args]
    ).returncode
    if _pip_status != 0:
        print('pip install failed for', ' '.join(_pip_args),
              '- exit code', _pip_status)

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import ssl

# Replace ssl's default HTTPS context factory with the unverified one when the
# running Python exposes it.  Code that builds its context through
# ssl._create_default_https_context will then skip certificate verification --
# presumably a workaround so the downloads below succeed on machines with
# certificate problems (TODO confirm).
# NOTE(review): the override is process-wide and is never undone.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python has no ssl._create_unverified_context; leave ssl untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# NLTK resources fetched on every run of this cell.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('words')
# Vocabulary of the NLTK 'words' corpus as a set; cleaner() tests membership in it.
words = set(nltk.corpus.words.words())
import stanza
# Download the stanza "en" resources.
stanza.download("en")

class Tweet:
    """One tweet: its raw text, cleaned text, identifying token and author.

    ``sentiments`` (dict) and ``associations`` (list) start out empty and are
    created fresh for every instance.
    """

    def __init__(self, text, text_clean, token, author):
        # Identifying token first, then the raw and cleaned text and the author.
        self.token, self.text = token, text
        self.text_clean, self.author = text_clean, author

        # Per-instance containers; nothing is stored in them here.
        self.sentiments = dict()
        self.associations = list()

class User:
    """An account (id and handle) together with its tweets, indexed by token."""

    def __init__(self, author_id, handle, tweets):
        self.author_id = author_id
        self.handle = handle
        # Map each tweet's ``token`` to the tweet object; when several tweets
        # share a token, the last one in ``tweets`` is the one kept.
        self.tweets = {tweet.token: tweet for tweet in tweets}

# Removing emojis
def remove_emojis(data):
    """Return ``str(data)`` with emoji and other pictographic symbols removed.

    Args:
        data: Any value.  It is converted with ``str()`` before matching, so
            non-string input comes back as its string form.

    Returns:
        The text with every run of matched characters deleted.  Nothing is
        inserted in their place, so surrounding whitespace is left untouched.

    Note:
        The character class deliberately lists U+24C2 as a single code point.
        Writing it as the range U+24C2..U+1F251 would also swallow CJK
        ideographs, kana, Hangul and many other scripts that are not emoji.
    """
    emoj = re.compile("["
                      "\U0001F600-\U0001F64F"  # emoticons
                      "\U0001F300-\U0001F5FF"  # symbols & pictographs
                      "\U0001F680-\U0001F6FF"  # transport & map symbols
                      "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
                      "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
                      "\U00002702-\U000027B0"  # dingbats
                      "\U000024C2"             # circled M (single code point, not a range)
                      "\U0001f926-\U0001f937"
                      "\U00010000-\U0010ffff"  # every supplementary-plane character
                      "\u2640-\u2642"
                      "\u2600-\u2B55"
                      "\u200d"                 # zero-width joiner
                      "\u23cf"
                      "\u23e9"
                      "\u231a"
                      "\ufe0f"                 # variation selector-16
                      "\u3030"                 # wavy dash
                      "\u303d"                 # part alternation mark
                      "\u3297"                 # circled ideograph congratulation
                      "\u3299"                 # circled ideograph secret
                      "]+", re.UNICODE)
    return emoj.sub('', str(data))

def cleaner(text):
    """Strip mentions, links, punctuation and out-of-vocabulary words from a tweet.

    The steps are applied in sequence, each one working on the output of the
    previous one:

    1. remove ``@mentions``;
    2. remove links (``http://``, ``https://``, ``www...``) and leftover ``@``;
    3. replace ``( ) ! ?`` and any ``[bracketed]`` span with a space;
    4. collapse runs of whitespace;
    5. drop ``#`` (keeping the hashtag text) and turn ``_`` into a space;
    6. keep only tokens that are in the module-level ``words`` vocabulary
       (compared in lower case) or that are not purely alphabetic.

    Args:
        text: The tweet text.  Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned tweet as a string (``""`` for missing input).
    """
    # Missing values (None / float NaN coming from pandas) have nothing to clean.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    tweet = str(text)
    tweet = re.sub(r"@[A-Za-z0-9]+", "", tweet)  # Remove @mentions
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)  # Remove http links
    tweet = re.sub(r"[()!?]", " ", tweet)  # Remove punctuation
    tweet = re.sub(r"\[.*?\]", " ", tweet)  # Remove [bracketed] spans
    tweet = " ".join(tweet.split())
    tweet = tweet.replace("#", "").replace("_", " ")  # Remove hashtag sign but keep the text
    # Keep dictionary words plus any token that is not purely alphabetic
    # (numbers, punctuation, mixed tokens).
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet)
                     if w.lower() in words or not w.isalpha())
    return tweet

def calculate_sentiments(text, stop_words, nlp):
    """Extract noun features from ``text`` together with the words linked to them.

    The text is split into sentences.  Each sentence is normalised, POS-tagged
    with NLTK and dependency-parsed with ``nlp``.  Words tagged JJ, NN, JJR,
    NNS or RB are collected as features, and for every feature the words
    connected to it through one of a fixed list of dependency relations are
    gathered.

    Args:
        text: Text to analyse; passed to ``nltk.sent_tokenize``.
        stop_words: Container of words to drop before the second POS-tagging
            pass (only membership tests are performed on it).
        nlp: Callable applied to the re-joined sentence.  Its result must
            expose ``sentences[0].dependencies``; the inline comment below
            calls it a Stanford NLP pipeline -- presumably a
            ``stanza.Pipeline``, TODO confirm against the caller.

    Returns:
        A list of ``[word, related_words]`` pairs, restricted to features whose
        recorded POS tag is ``"NN"``.  An empty list is returned as soon as any
        sentence raises ``IndexError`` or ``AttributeError``, which discards the
        results gathered from earlier sentences.
    """
    txt = text
    sentList = nltk.sent_tokenize(txt) # Splitting the text into sentences
    fcluster = []          # [feature word, related words] pairs, all sentences
    totalfeatureList = []  # every feature seen; filled but not read in this function
    finalcluster = []      # the subset of fcluster that is returned
    featureList = []       # [word, tag] features; NOT reset per sentence (see below)
    categories = []        # feature words; filled but not read in this function
    dic = {}               # feature word -> POS tag (last occurrence wins)

    for line in sentList:
        # Remove links and '#' characters from the line
        line = re.sub(r'http\S+|#', '', line)

        # Turn ':' into '.', then drop newlines and '@'
        line = re.sub(':', '.', line)
        line = re.sub('\n|@', '', line)

        # Collapse a run of consecutive punctuation characters to its first character
        r = re.compile(r'([.,/#!$%^&*;:{}=_`~()-])[.,/#!$%^&*;:{}=_`~()-]+')
        line = r.sub(r'\1', line)

        # Replace hashtags with association term
        # NOTE(review): every '#' was already removed by the first substitution
        # above, so this replacement cannot match anything.
        line = re.sub('#', 'hashtag is ', line)

        try:
            newtaggedList = []  # unused
            txt_list = nltk.word_tokenize(line) # Splitting up into words
            taggedList = nltk.pos_tag(txt_list) # Doing Part-of-Speech Tagging to each word

            # Rebuild the word list, gluing adjacent NN+NN pairs into one token.
            # `flag` records that the previous pair was merged, so the current
            # word (already used as the second half) is skipped once.
            # NOTE(review): a sentence with a single token leaves newwordList
            # empty, and the `continue` below also skips the tail append, so
            # the last word can be lost when the pair just before it was merged.
            newwordList = []
            flag = 0
            for i in range(0,len(taggedList)-1):
                if(taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"): # If two consecutive words are Nouns then they are joined together
                    newwordList.append(taggedList[i][0]+taggedList[i+1][0])
                    flag=1
                else:
                    if(flag==1):
                        flag=0
                        continue
                    newwordList.append(taggedList[i][0])
                    if(i==len(taggedList)-2):
                        # Last pair: also keep the final word of the sentence.
                        newwordList.append(taggedList[i+1][0])

            # Re-tokenise the rebuilt sentence, drop stop words and tag again.
            finaltxt = ' '.join(word for word in newwordList)
            new_txt_list = nltk.word_tokenize(finaltxt)
            wordsList = [w for w in new_txt_list if not w in stop_words]
            taggedList = nltk.pos_tag(wordsList)

            doc = nlp(finaltxt) # Object of Stanford NLP Pipeline

            # One [word text, id, relation] entry per dependency edge of the
            # FIRST parsed sentence only.  Presumably each edge is stanza's
            # (head, relation, dependent) triple -- TODO confirm.
            dep_node = []

            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])

            # Replace every non-zero id with the word at that 1-based position
            # in newwordList; id 0 is left as is.
            # NOTE(review): assumes the parser's token positions line up with
            # newwordList -- verify.
            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]

            # The per-sentence reset below is commented out, so featureList and
            # categories keep growing across sentences.
            # featureList = []
            # categories = []
            for i in taggedList:
                if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                    featureList.append(list(i))
                    totalfeatureList.append(list(i)) # This list will store all the features for every sentence
                    categories.append(i[0])

            # For every feature collected so far (including those of earlier
            # sentences), gather the word at the other end of each dependency
            # edge that touches it through one of the listed relations.
            for i in featureList:
                filist = []
                for j in dep_node:
                    if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                        if(j[0]==i[0]):
                            filist.append(j[1])
                        else:
                            filist.append(j[0])
                fcluster.append([i[0], filist])

        except IndexError:
            # Abort the whole text, not just this sentence.
            print('IndexError:', line)
            return []

        except AttributeError:
            # Abort the whole text, not just this sentence.
            print('AttributeError')
            return []

    # Remember the POS tag of every feature word (later occurrences overwrite earlier ones).
    for i in featureList:
        dic[i[0]] = i[1]

    # Keep only the clusters whose feature word is recorded as a singular noun.
    for i in fcluster:
        if(dic[i[0]]=="NN"):
            finalcluster.append(i)

    return finalcluster



In [None]:
# Load the processed tweets-with-users CSV.
df = pd.read_csv('gs://sw-airlines-data-hub/data/processed/sw-airlines-tweets-w-users.csv')

# Lower-case the text, strip every character that is not an ASCII letter,
# digit or whitespace, then run the two cleaning helpers one after the other.
normalised_text = df['text'].str.lower().str.replace(r'[^0-9a-zA-Z\s]+', '', regex=True)
df['tweet_clean'] = normalised_text.apply(cleaner).apply(remove_emojis)

# Columns used by the rest of the notebook.
X = df[['text', 'tweet_clean', 'tweet_token', 'author_id']]
X.head()

In [None]:
# Wrap every row of X in a Tweet object, keyed by the row's tweet token.
tweets = dict()
for position in range(len(X)):
    row = X.iloc[position]
    tweets[row.tweet_token] = Tweet(row.text, row.tweet_clean, row.tweet_token, row.author_id)

In [None]:
# Keywords used to select tweets; a tweet is kept when its cleaned text
# contains at least one of them.
topics = ['book', 'cancel', 'call', 'support', 'delay', 'change', 'never',
          'fear', 'pandemic', 'group', 'pilot', 'mask', 'avgeek', 'technology', 'pay']
dfs = []

for t in topics:
    topic_rows = X[X.tweet_clean.str.contains(t)]
    dfs.append(topic_rows)
    print(t, len(topic_rows))

# Stack the per-topic matches, keep the three columns needed downstream and
# drop repeated texts (first occurrence wins), renumbering the rows from 0.
x0 = (
    pd.concat(dfs)[['tweet_token', 'author_id', 'tweet_clean']]
    .drop_duplicates(subset=['tweet_clean'], keep='first')
    .reset_index(drop=True)
)
lx0 = len(x0)
lx0

In [None]:
# Sentence encoder used to embed the cleaned tweets.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# x0 was de-duplicated on tweet_clean above, so the number of unique texts
# is expected to equal the number of rows; print both for a visual check.
x0_unique = x0.tweet_clean.unique().tolist()
lx0_unique = len(x0_unique)
print(lx0_unique, "==", lx0, " : ", lx0_unique==lx0)

In [None]:
%%time
# Embed every unique cleaned tweet with the sentence encoder and show the
# shape of the result.
sentence_embeddings = model.encode(x0_unique)
sentence_embeddings.shape

In [None]:
# Display the embeddings returned by model.encode.
sentence_embeddings

In [None]:
# Cosine similarity between the first embedding and each of the remaining ones.
cosine_similarity(
    [sentence_embeddings[0]],
    sentence_embeddings[1:]
)

In [None]:
# Create the 'Embedding' column filled with empty strings.
# NOTE(review): the next cell overwrites this column completely.
x0['Embedding'] = x0.apply(lambda _: '', axis=1)

In [None]:
# Store one embedding (as a Python list) per row of x0.  This pairs row i of
# x0 with embedding i, so it relies on x0_unique having the same length and
# order as x0 -- the printed uniqueness check above covers the length.
x0['Embedding'] = sentence_embeddings.tolist()

In [None]:
# Preview x0 with the new column.
x0.head()

In [None]:
def similarity(row):
    """Debugging aid: print the index of the Series passed in by ``DataFrame.apply``."""
    print(row.index)

# Work on a copy so x0 itself is left untouched.
X1 = x0.copy()
X1[['tweet_clean','Embedding']].apply(similarity)

# Drop incomplete rows first, so the similarity matrix is built exactly once
# and its rows line up with the rows of X1.
X1.dropna(inplace=True)

# Put all sentence embeddings in a matrix (one row per tweet).
e_col = 'Embedding'
embed_mat = np.array(list(X1[e_col]))

# Cosine similarity between every pair of embeddings.
sim_mat = cosine_similarity(embed_mat, embed_mat)

# sim_score = similarity of every tweet to the tweet at position sentence_id.
sentence_id = 0
X1['sim_score'] = sim_mat[sentence_id]

def get_sim_df_total(predictions, e_col):
    """Add the full pairwise cosine-similarity matrix to ``predictions``.

    For the row at position ``i`` a new column is created whose name is that
    row's ``tweet_clean`` value and whose values are the similarities of every
    row to row ``i``.  The frame is modified in place.

    Args:
        predictions: DataFrame with a ``tweet_clean`` column and an embedding
            column.
        e_col: Name of the column holding one embedding vector per row.

    Returns:
        The same ``predictions`` object, with the similarity columns added.
    """
    vectors = np.array(list(predictions[e_col]))
    pairwise = cosine_similarity(vectors, vectors)

    for position in range(len(pairwise)):
        column_name = predictions.iloc[position].tweet_clean
        predictions[column_name] = pairwise[position]
    return predictions

In [None]:
%%time
# Adds one similarity column per tweet to X1.  get_sim_df_total modifies the
# frame it is given and returns it, so sim_matrix_df and X1 are the same object.
sim_matrix_df = get_sim_df_total(X1, 'Embedding')
# sim_matrix_df.to_feather('sim_matrix_df.feather')

In [None]:
# NOTE(review): the pickle module is not referenced in this cell;
# DataFrame.to_pickle below is a pandas method.
import pickle
# Write the similarity frame to a local pickle file (uploaded further down).
sim_matrix_df.to_pickle('sim_matrix_df.pickle')

In [None]:
# sim_matrix_df.to_csv('sim_matrix_df.csv')

In [None]:
def upload_to_output(path, bucket_name, folder_name):
    """Upload a local file into a folder of a Cloud Storage bucket.

    The object is named ``folder_name + '/' + <last '/'-separated part of path>``.

    Args:
        path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        folder_name: Prefix ("folder") inside the bucket.

    Returns:
        None.
    """
    bucket = storage.Client().bucket(bucket_name)
    file_name = path.rsplit('/', 1)[-1]
    destination = bucket.blob(folder_name + '/' + file_name)
    destination.upload_from_filename(path)

# upload_to_output looks up `storage` when it is called, so importing it after
# the function definition still works.
from google.cloud import storage
# Bucket that receives the processed artefacts.
bucket_name = 'sw-airlines-data-hub'
# upload_to_output('sim_matrix_df.feather', bucket_name, 'data/processed')
# Upload the pickled similarity frame to data/processed/ in the bucket.
upload_to_output('sim_matrix_df.pickle', bucket_name, 'data/processed')