# Similarity Network Notebook

In [1]:
import os
os.system('pip install nltk')
os.system('pip install stanza')
os.system('pip install emoji')
os.system('pip install -U sentence-transformers')
os.system('pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html')


import pandas as pd
import numpy as np
# from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('words')
words = set(nltk.corpus.words.words())
import stanza
stanza.download("en")

class Tweet():
    def __init__(self, text, text_clean, token, author_id):
        self.token = token
        self.text = text
        self.text_clean = text_clean
        self.author_id = author_id
        
        self.similiar_tweets = []
        self.similiar_authors = []

        self.sentiments = {}
        self.associations = []

class User():
    def __init__(self, author_id):
        self.author_id = author_id
        self.tweet_tokens = []
        self.similiar_authors = []
        self.author_edges = {}

#Removing Emojis
def remove_emojis(data):
    emoj = re.compile("["
                      u"\U0001F600-\U0001F64F"  # emoticons
                      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                      u"\U0001F680-\U0001F6FF"  # transport & map symbols
                      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                      u"\U00002500-\U00002BEF"  # chinese char
                      u"\U00002702-\U000027B0"
                      u"\U00002702-\U000027B0"
                      u"\U000024C2-\U0001F251"
                      u"\U0001f926-\U0001f937"
                      u"\U00010000-\U0010ffff"
                      u"\u2640-\u2642"
                      u"\u2600-\u2B55"
                      u"\u200d"
                      u"\u23cf"
                      u"\u23e9"
                      u"\u231a"
                      u"\ufe0f"  # dingbats
                      u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', str(data))

def cleaner(text):
    tweet = re.sub("@[A-Za-z0-9]+","",str(text)) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", str(text)) #Remove http links
    tweet = re.sub('[()!?]', ' ', str(text)) #removing punctuation
    tweet = re.sub('\[.*?\]',' ', str(text))
    tweet = " ".join(tweet.split())
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(str(text))
                     if w.lower() in words or not w.isalpha())
    return text

def calculate_sentiments(text, stop_words, nlp):
    txt = text
    sentList = nltk.sent_tokenize(txt) # Splitting the text into sentences
    fcluster = []
    totalfeatureList = []
    finalcluster = []
    featureList = []
    categories = []
    dic = {}

    for line in sentList:
        # Remove links from line
        line = re.sub(r'http\S+|#', '', line)

        # Swap '-', ';', '*' with commas
        line = re.sub(':', '.', line)
        line = re.sub('\n|@', '', line)

        # Remove consecutive punctuation recursively
        r = re.compile(r'([.,/#!$%^&*;:{}=_`~()-])[.,/#!$%^&*;:{}=_`~()-]+')
        line = r.sub(r'\1', line)

        # Replace hashtags with association term
        line = re.sub('#', 'hashtag is ', line)

        try:
            newtaggedList = []
            txt_list = nltk.word_tokenize(line) # Splitting up into words
            taggedList = nltk.pos_tag(txt_list) # Doing Part-of-Speech Tagging to each word

            newwordList = []
            flag = 0
            for i in range(0,len(taggedList)-1):
                if(taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"): # If two consecutive words are Nouns then they are joined together
                    newwordList.append(taggedList[i][0]+taggedList[i+1][0])
                    flag=1
                else:
                    if(flag==1):
                        flag=0
                        continue
                    newwordList.append(taggedList[i][0])
                    if(i==len(taggedList)-2):
                        newwordList.append(taggedList[i+1][0])

            finaltxt = ' '.join(word for word in newwordList)
            new_txt_list = nltk.word_tokenize(finaltxt)
            wordsList = [w for w in new_txt_list if not w in stop_words]
            taggedList = nltk.pos_tag(wordsList)

            doc = nlp(finaltxt) # Object of Stanford NLP Pipeleine

            dep_node = []

            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])

            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]

            # featureList = []
            # categories = []
            for i in taggedList:
                if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                    featureList.append(list(i))
                    totalfeatureList.append(list(i)) # This list will store all the features for every sentence
                    categories.append(i[0])

            for i in featureList:
                filist = []
                for j in dep_node:
                    if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                        if(j[0]==i[0]):
                            filist.append(j[1])
                        else:
                            filist.append(j[0])
                fcluster.append([i[0], filist])

        except IndexError:
            print('IndexError:', line)
            return []

        except AttributeError:
            print('AttributeError')
            return []

    for i in featureList:
        dic[i[0]] = i[1]

    for i in fcluster:
        if(dic[i[0]]=="NN"):
            finalcluster.append(i)

    return finalcluster

def upload_to_output(path, bucket_name, folder_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(folder_name + '/' + path.split('/')[-1])
    blob.upload_from_filename(path)

from google.cloud import storage   
bucket_name = 'sw-airlines-data-hub'

Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.10.31
Collecting stanza
  Using cached stanza-1.4.2-py3-none-any.whl (691 kB)
Collecting emoji
  Using cached emoji-2.2.0-py3-none-any.whl
Installing collected packages: emoji, stanza
Successfully installed emoji-2.2.0 stanza-1.4.2
Collecting sentence-transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting huggingface-hub>=0.4.0
  Using cached huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
Collecting sentencepiece
  Using cached sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting transformers<5.0.0,>=4.6.0
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
Collecting filelock
  Using cached filelock-3.8.0-py3-none-any.whl (10 k

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package words to /home/jupyter/nltk_data...
[nltk_data]   Package words is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 56.6MB/s]                    
2022-11-23 13:24:01 INFO: Downloading default packages for language: en (Eng

In [None]:
df = pd.read_csv('gs://sw-airlines-data-hub/data/processed/sw-airlines-tweets-w-users.csv')
df['tweet_clean'] = df['text'].str.lower().str.replace(r'[^0-9a-zA-Z\s]+', '', regex=True).apply(cleaner)
df['tweet_clean'] = df['tweet_clean'].apply(remove_emojis)
X = df[['text', 'tweet_clean', 'tweet_token', 'author_id']]
X.head()

tweets = {}
authors = {}

for i in range(0, len(X)):
    t = X.iloc[i]
    
    # Tweet Object Dictionary
    text, text_clean, tweet_token, author_id = t.text, t.tweet_clean, t.tweet_token, t.author_id
    tweets[tweet_token] = Tweet(text, text_clean, tweet_token, author_id)
    
    # Author Object Dictionary
    try:
        authors[author_id].tweet_tokens.append(tweet_token)
    except KeyError:
        authors[author_id] = User(author_id)
        authors[author_id].tweet_tokens.append(tweet_token)
    

topics = ['book', 'cancel', 'call', 'support', 'delay', 'change', 'never', 'fear',
          'pandemic', 'group', 'pilot', 'mask', 'avgeek', 'technology', 'pay']
dfs = []

for t in topics:
    dfs.append(X[X.tweet_clean.str.contains(t)])
    print(t, len(X[X.tweet_clean.str.contains(t)]))

x0 = pd.concat(dfs)
x0 = x0.set_index('tweet_token')
x0['tweet_token'] = x0.index
x0 = x0[['tweet_token', 'author_id', 'tweet_clean']]
x0 = x0.drop_duplicates(
  subset = ['tweet_clean'],
  keep = 'first').reset_index(drop = True)
lx0 = len(x0)
lx0

In [None]:
sim_matrix_df = pd.read_pickle('gs://sw-airlines-data-hub/data/processed/sim_matrix_df.pickle')

In [None]:
# sim_matrix_df

## Create Tweet to Tweet Similarity

In [None]:
tweet_tokens = list(x0.tweet_token.unique())

texts = {}
token_search = {}

for x in tweet_tokens:
    texts[x] = tweets[x].text_clean
    try:
        token_search[tweets[x].text_clean].append(x)
    except KeyError:
        token_search[tweets[x].text_clean] = [x]
len(tweet_tokens)

In [6]:
%%time
import datetime
z = sim_matrix_df.set_index('tweet_token').drop(columns = ['author_id', 'tweet_clean', 'Embedding', 'sim_score'])
simmed_tweets = list(z.index)
texts = z.columns
counter = 0
t1 = datetime.datetime.now()

# for each tweet_id, get similar texts
for x in tweet_tokens:
    
    try: a = z.loc[x]
    except KeyError: continue
    
    # for each similar text, get all associated tweet_ids
    try:
        similar_texts = [[t, z.loc[x][t]] for t in texts if z.loc[x][t] > 0.88]
    except AttributeError:
        similar_texts = []
        print(x, t, counter)
    
    # get associated tokens for each tweet based on a token dictionary
    similar_tweets = []
    for t in similar_texts:
        similar_tweets += [[token,t[1]] for token in token_search[t[0]]]
    
    # add tweet ids to similar tweet list
    for sx in similar_tweets: tweets[x].similiar_tweets.append(sx)
        
    # increment similar author counter
    for sx in tweets[x].similiar_tweets:
        authors[tweets[x].author_id].similiar_authors.append(tweets[sx[0]].author_id)
        authors[tweets[sx[0]].author_id].similiar_authors.append(tweets[x].author_id)
    
    counter+=1
    if counter % 100 == 0: 
        print("Start:", t1, " ", "End:", datetime.datetime.now())
        t1 = datetime.datetime.now()

Start: 2022-11-05 18:44:03.896272   End: 2022-11-05 18:46:39.943283
CPU times: user 2min 37s, sys: 3.05 s, total: 2min 40s
Wall time: 2min 40s


In [None]:
tweets[tweet_tokens[0]].similiar_tweets

In [None]:
print(tweets[tweet_tokens[0]].author_id)
authors[tweets[tweet_tokens[0]].author_id].similiar_authors

## Upload/Download Integrity Test to GCP

In [None]:
# Pickle Local Test
tweet_objs = list(tweets.values())
auth_objs = list(authors.values())

import pickle
with open('twt2twt_w_score.pkl', 'wb') as f:
    pickle.dump(tweet_objs, f)
    
import pickle
with open('auth2auth_w_score.pkl', 'wb') as f:
    pickle.dump(auth_objs, f)


In [None]:
#GCP Test
def upload_to_output(path, bucket_name, folder_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(folder_name + '/' + path.split('/')[-1])
    blob.upload_from_filename(path)

from google.cloud import storage   
bucket_name = 'sw-airlines-data-hub'

upload_to_output('twt2twt_w_score.pkl', bucket_name, 'data/processed')
upload_to_output('auth2auth_w_score.pkl', bucket_name, 'data/processed')