# Similarity Network Notebook

In [1]:
import os
os.system('pip install nltk')
os.system('pip install stanza')
os.system('pip install emoji')

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('words')
words = set(nltk.corpus.words.words())
import stanza
stanza.download("en")

class Tweet():
    def __init__(self, text, text_clean, token, author_id):
        self.token = token
        self.text = text
        self.text_clean = text_clean
        self.author_id = author_id
        
        self.similiar_tweets = []
        self.similiar_authors = []

        self.sentiments = {}
        self.associations = []

class User():
    def __init__(self, author_id):
        self.author_id = author_id
        self.tweet_tokens = []
        self.similiar_authors = []
        self.author_edges = {}

#Removing Emojis
def remove_emojis(data):
    emoj = re.compile("["
                      u"\U0001F600-\U0001F64F"  # emoticons
                      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                      u"\U0001F680-\U0001F6FF"  # transport & map symbols
                      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                      u"\U00002500-\U00002BEF"  # chinese char
                      u"\U00002702-\U000027B0"
                      u"\U00002702-\U000027B0"
                      u"\U000024C2-\U0001F251"
                      u"\U0001f926-\U0001f937"
                      u"\U00010000-\U0010ffff"
                      u"\u2640-\u2642"
                      u"\u2600-\u2B55"
                      u"\u200d"
                      u"\u23cf"
                      u"\u23e9"
                      u"\u231a"
                      u"\ufe0f"  # dingbats
                      u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', str(data))

def cleaner(text):
    tweet = re.sub("@[A-Za-z0-9]+","",str(text)) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", str(text)) #Remove http links
    tweet = re.sub('[()!?]', ' ', str(text)) #removing punctuation
    tweet = re.sub('\[.*?\]',' ', str(text))
    tweet = " ".join(tweet.split())
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(str(text))
                     if w.lower() in words or not w.isalpha())
    return text

def calculate_sentiments(text, stop_words, nlp):
    txt = text
    sentList = nltk.sent_tokenize(txt) # Splitting the text into sentences
    fcluster = []
    totalfeatureList = []
    finalcluster = []
    featureList = []
    categories = []
    dic = {}

    for line in sentList:
        # Remove links from line
        line = re.sub(r'http\S+|#', '', line)

        # Swap '-', ';', '*' with commas
        line = re.sub(':', '.', line)
        line = re.sub('\n|@', '', line)

        # Remove consecutive punctuation recursively
        r = re.compile(r'([.,/#!$%^&*;:{}=_`~()-])[.,/#!$%^&*;:{}=_`~()-]+')
        line = r.sub(r'\1', line)

        # Replace hashtags with association term
        line = re.sub('#', 'hashtag is ', line)

        try:
            newtaggedList = []
            txt_list = nltk.word_tokenize(line) # Splitting up into words
            taggedList = nltk.pos_tag(txt_list) # Doing Part-of-Speech Tagging to each word

            newwordList = []
            flag = 0
            for i in range(0,len(taggedList)-1):
                if(taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"): # If two consecutive words are Nouns then they are joined together
                    newwordList.append(taggedList[i][0]+taggedList[i+1][0])
                    flag=1
                else:
                    if(flag==1):
                        flag=0
                        continue
                    newwordList.append(taggedList[i][0])
                    if(i==len(taggedList)-2):
                        newwordList.append(taggedList[i+1][0])

            finaltxt = ' '.join(word for word in newwordList)
            new_txt_list = nltk.word_tokenize(finaltxt)
            wordsList = [w for w in new_txt_list if not w in stop_words]
            taggedList = nltk.pos_tag(wordsList)

            doc = nlp(finaltxt) # Object of Stanford NLP Pipeleine

            dep_node = []

            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])

            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]

            # featureList = []
            # categories = []
            for i in taggedList:
                if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                    featureList.append(list(i))
                    totalfeatureList.append(list(i)) # This list will store all the features for every sentence
                    categories.append(i[0])

            for i in featureList:
                filist = []
                for j in dep_node:
                    if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                        if(j[0]==i[0]):
                            filist.append(j[1])
                        else:
                            filist.append(j[0])
                fcluster.append([i[0], filist])

        except IndexError:
            print('IndexError:', line)
            return []

        except AttributeError:
            print('AttributeError')
            return []

    for i in featureList:
        dic[i[0]] = i[1]

    for i in fcluster:
        if(dic[i[0]]=="NN"):
            finalcluster.append(i)

    return finalcluster

def upload_to_output(path, bucket_name, folder_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(folder_name + '/' + path.split('/')[-1])
    blob.upload_from_filename(path)

from google.cloud import storage   
bucket_name = 'sw-airlines-data-hub'

Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.10.31
Collecting stanza
  Using cached stanza-1.4.2-py3-none-any.whl (691 kB)
Collecting emoji
  Using cached emoji-2.2.0-py3-none-any.whl
Installing collected packages: emoji, stanza
Successfully installed emoji-2.2.0 stanza-1.4.2


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package words to /home/jupyter/nltk_data...
[nltk_data]   Package words is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 49.0MB/s]                    
2022-11-25 16:46:22 INFO: Downloading default packages for language: en (Eng

In [2]:
df = pd.read_csv('gs://sw-airlines-data-hub/data/processed/sw-airlines-tweets-w-users.csv')
df['tweet_clean'] = df['text'].str.lower().str.replace(r'[^0-9a-zA-Z\s]+', '', regex=True).apply(cleaner)
df['tweet_clean'] = df['tweet_clean'].apply(remove_emojis)
X = df[['text', 'tweet_clean', 'tweet_token', 'author_id']]
X.head()

topics = ['book', 'cancel', 'call', 'support', 'delay', 'change', 'never', 'fear',
          'pandemic', 'group', 'pilot', 'mask', 'avgeek', 'technology', 'pay']
dfs = []

for t in topics:
    dfs.append(X[X.tweet_clean.str.contains(t)])
    print(t, len(X[X.tweet_clean.str.contains(t)]))

x0 = pd.concat(dfs)
x0 = x0.set_index('tweet_token')
x0['tweet_token'] = x0.index
x0 = x0[['tweet_token', 'author_id', 'tweet_clean']]
x0 = x0.drop_duplicates(
  subset = ['tweet_clean'],
  keep = 'first').reset_index(drop = True)
lx0 = len(x0)
df.head()

book 4665
cancel 3675
call 2793
support 969
delay 4676
change 1984
never 2772
fear 2590
pandemic 534
group 1657
pilot 4768
mask 737
avgeek 5994
technology 337
pay 2530


Unnamed: 0.1,Unnamed: 0,created_at_x,in_reply_to_user_id,edit_history_tweet_ids,text,author_id,conversation_id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,...,description,created_at_y,profile_image_url,username,user.followers_count,user.following_count,user.tweet_count,user.listed_count,withheld.country_codes_y,tweet_clean
0,0,2022-11-08T15:07:29.000Z,7212562.0,['1589998047110782976'],"@SouthwestAir Thanks, I'll do that now :)",790555820,1589997663336148993,0,0,0,...,,2012-08-30T01:35:58.000Z,https://pbs.twimg.com/profile_images/155849327...,NickChaps96,325.0,1594.0,7026.0,6.0,,southwestair thanks ill do that now
1,1,2022-11-08T15:05:58.000Z,7212562.0,['1589997663336148993'],@SouthwestAir we have a flight from Norfolk to...,790555820,1589997663336148993,0,1,0,...,,2012-08-30T01:35:58.000Z,https://pbs.twimg.com/profile_images/155849327...,NickChaps96,325.0,1594.0,7026.0,6.0,,southwestair we have a flight from norfolk to ...
2,2,2022-11-08T15:03:16.000Z,14179945.0,['1589996983066189824'],@AviatorJLat @EGAinMA @SouthwestAir @USDOT @No...,28853549,1589787523353833474,0,0,0,...,Nerd. Fangirl. Campaigning to #SaveDaredevil s...,2009-04-04T19:27:59.000Z,https://pbs.twimg.com/profile_images/126807649...,catzmiaou,556.0,492.0,7200.0,6.0,,aviatorjlat egainma southwestair usdot nonuttr...
3,3,2022-11-08T15:03:16.000Z,14179945.0,['1589996983066189824'],@AviatorJLat @EGAinMA @SouthwestAir @USDOT @No...,28853549,1589787523353833474,0,0,0,...,Nerd. Fangirl. Campaigning to #SaveDaredevil s...,2009-04-04T19:27:59.000Z,https://pbs.twimg.com/profile_images/126807649...,catzmiaou,556.0,492.0,7200.0,6.0,,aviatorjlat egainma southwestair usdot nonuttr...
4,4,2022-11-08T15:01:41.000Z,16271858.0,['1589996585316147200'],@kaoconnor @SouthwestAir @DENAirport @MidwayAi...,1198826459147554816,1589305162308329472,0,0,0,...,Advocating for basic decency and social graces...,2019-11-25T04:51:25.000Z,https://pbs.twimg.com/profile_images/119882696...,twit_traveling,0.0,0.0,76.0,0.0,,kaoconnor southwestair denairport midwayairpor...


In [3]:
# Import tweet 2 tweet, author to author pickles
!gsutil cp gs://sw-airlines-data-hub/data/processed/twt2twt_w_score_w_sentiments.pkl ./
!gsutil cp gs://sw-airlines-data-hub/data/processed/auth2auth.pkl ./

import pickle
with open('twt2twt_w_score_w_sentiments.pkl', 'rb') as f:
    tweet_objs = pickle.load(f)
    
with open('auth2auth_w_score.pkl', 'rb') as f:
    auth_objs = pickle.load(f)

tweets = {} 
for x in tweet_objs: tweets[x.token] = x
tweet_tokens = [x.token for x in tweet_objs]

authors = {} 
for x in auth_objs: authors[x.author_id] = x
unique_authors = [x.author_id for x in auth_objs]

Copying gs://sw-airlines-data-hub/data/processed/twt2twt_w_score_w_sentiments.pkl...
| [1 files][ 64.6 MiB/ 64.6 MiB]                                                
Operation completed over 1 objects/64.6 MiB.                                     
Copying gs://sw-airlines-data-hub/data/processed/auth2auth.pkl...
/ [1 files][  8.2 MiB/  8.2 MiB]                                                
Operation completed over 1 objects/8.2 MiB.                                      


In [None]:
tweets[tweet_tokens[3]].associations

In [None]:
%%time
t2t = pd.DataFrame(columns = ['TweetTokenA', 'TweetTokenB', 'SimilarityScore', 'AspectsA', 'AspectsB'])

for x in tweet_tokens:
    for j in tweets[x].similiar_tweets:
        t2t.loc[len(t2t)] = [x, j[0], j[1], str(tweets[x].associations), str(tweets[j[0]].associations)]

t2t = t2t.drop_duplicates()        

a_comp = df[['tweet_token', 'author_id', 'username', 'text', 'created_at_x', 'public_metrics.retweet_count',
 'public_metrics.reply_count',
 'public_metrics.like_count',
 'public_metrics.quote_count']]

t2t = t2t.merge(a_comp, left_on='TweetTokenA', right_on='tweet_token')
t2t = t2t.rename(columns={'author_id': 'AuthorTokenA', 
                          'username' : 'UserA', 
                          'text_clean' : 'A_text_clean', 
                          'created_at_x' : 'A_created_at', 
                          'public_metrics.retweet_count' : 'A_rt_cnt',
                          'public_metrics.reply_count' : 'A_reply_cnt',
                          'public_metrics.like_count' : 'A_like_count',
                          'public_metrics.quote_count' : 'A_qt_count'})
t2t = t2t.drop(columns=['tweet_token'])

t2t = t2t.merge(a_comp, left_on='TweetTokenB', right_on='tweet_token')
t2t = t2t.rename(columns={'author_id': 'AuthorTokenB', 
                          'username' : 'UserB', 
                          'text_clean':'B_text_clean', 
                          'created_at_x':'b_created_at', 
                          'public_metrics.retweet_count':'B_rt_cnt',
                          'public_metrics.reply_count':'B_reply_cnt',
                          'public_metrics.like_count':'B_like_count',
                          'public_metrics.quote_count':'B_qt_count'})
t2t = t2t.drop(columns=['tweet_token'])
t2t = t2t.drop_duplicates(subset = ['TweetTokenA', 'TweetTokenB'],  keep = 'first')
t2t = t2t[t2t.TweetTokenA!=t2t.TweetTokenB]

In [None]:
t2t

In [None]:
t2t.to_pickle('tweet2tweet_df3_pre.pickle')
def upload_to_output(path, bucket_name, folder_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(folder_name + '/' + path.split('/')[-1])
    blob.upload_from_filename(path)

from google.cloud import storage   
bucket_name = 'sw-airlines-data-hub'
upload_to_output('tweet2tweet_df3_pre.pickle', bucket_name, 'data/processed')

In [None]:
!gsutil cp gs://sw-airlines-data-hub/data/processed/tweet2tweet_df3_pre.pickle ./

In [None]:
t2t = pd.read_pickle('tweet2tweet_df3_pre.pickle')

In [None]:
search_list = list(t2t.index)
kill_list = []

for x in search_list:
    a_token = t2t.loc[x].TweetTokenA 
    b_token = t2t.loc[x].TweetTokenB
    a = t2t[t2t.TweetTokenB == a_token]
    b = a[a.TweetTokenA==b_token]
    i = b.index.values[0]
    kill_list.append(i)
    search_list.remove(i)

t2t = t2t.drop(index = kill_list)

In [None]:
t2t = t2t.fillna('N/A')

In [None]:
# dedupe measure
t2t.to_pickle('tweet2tweet_df3.pickle')
upload_to_output('tweet2tweet_df3.pickle', bucket_name, 'data/processed')

In [4]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

for q in unique_authors: authors[q].similiar_authors = [*set(authors[q].similiar_authors)]
test_run = intersection(unique_authors, [tweets[x].author_id for x in tweet_tokens])

In [5]:
%%time
a2a = pd.DataFrame(columns = ['AuthorTokenA', 'AuthorTokenB', 'MeanSimilarity_Score', 'NumEdges', 'NumMutualSimiliarTweets', 
                              'NumTweetsA', 'NumTweetsB', 'NumTweetsCombined']) 
                   
for x in unique_authors:
    author_a = authors[x]   
    
    for j in author_a.similiar_authors:           
        num_edges = 0      
        author_b = authors[j]
                   
        a_tweets = author_a.tweet_tokens
        tweets_similiar_to_a = []
        for at in a_tweets: tweets_similiar_to_a += tweets[at].similiar_tweets
        
        b_tweets = author_b.tweet_tokens
        tweets_similiar_to_b = []
        for bt in b_tweets: tweets_similiar_to_b += tweets[bt].similiar_tweets
        
        a_similiar_tokens = [x[0] for x in tweets_similiar_to_a]
        b_similiar_tokens = [x[0] for x in tweets_similiar_to_b]
        
        # only look at tweets in between two authors
        intertweets = intersection(a_similiar_tokens + b_similiar_tokens, a_tweets + b_tweets)
        a_intertweets = intersection(a_tweets, intertweets)
        b_intertweets = intersection(b_tweets, intertweets)
        
        edge_weights = []
        
        for i in a_intertweets:
            edge_weights += [x[1] for x in tweets[i].similiar_tweets if x[0] in b_intertweets]
            
        if len(edge_weights) > 0: average_edge_weight = sum(edge_weights) / len(edge_weights)
        else: average_edge_weight = 0
            
        num_edges = len(intersection(a_similiar_tokens, b_tweets))
        numMutualSimiliarTweets = len(intersection(a_similiar_tokens, b_similiar_tokens))
                        
        a2a.loc[len(a2a)] = [author_a.author_id, author_b.author_id, average_edge_weight, num_edges, numMutualSimiliarTweets, 
                             len(a_tweets), len(b_tweets), len(a_tweets)+len(b_tweets)]   

a2a = a2a.drop_duplicates(subset = ['AuthorTokenA', 'AuthorTokenB'],  keep = 'first')

CPU times: user 52min 36s, sys: 720 ms, total: 52min 36s
Wall time: 52min 38s


In [None]:
a2a = a2a.fillna('N/A')

In [6]:
a_comp = df[['author_id', 'username', 'verified', 'profile_image_url', 'description']]
a2a = a2a.merge(a_comp, left_on='AuthorTokenA', right_on='author_id')
a2a = a2a.rename(columns={'username' : 'UserA', 'verified':'A_is_verified', 'profile_image_url': 'A_prof_img_url', 'description' : 'A_description'})
a2a = a2a.drop(columns=['author_id'])

a2a = a2a.merge(a_comp, left_on='AuthorTokenB', right_on='author_id')
a2a = a2a.rename(columns={'username' : 'UserB', 'verified':'B_is_verified', 'profile_image_url': 'B_prof_img_url', 'description' : 'B_description'})
a2a = a2a.drop(columns=['author_id'])
a2a = a2a.drop_duplicates(subset = ['AuthorTokenA', 'AuthorTokenB'],  keep = 'first')
a2a = a2a[a2a.AuthorTokenA!=a2a.AuthorTokenB]

In [7]:
search_list = list(a2a.index)
kill_list = []

for x in search_list:
    a_token = a2a.loc[x].AuthorTokenA 
    b_token = a2a.loc[x].AuthorTokenB
    a = a2a[a2a.AuthorTokenB == a_token]
    b = a[a.AuthorTokenA==b_token]
    i = b.index.values[0]
    kill_list.append(i)
    search_list.remove(i)

a2a = a2a.drop(index = kill_list)

In [8]:
%%time
a2a.to_pickle('author2author_df3.pickle') 

CPU times: user 26.9 ms, sys: 11 µs, total: 26.9 ms
Wall time: 57.2 ms


In [9]:
upload_to_output('author2author_df3.pickle', bucket_name, 'data/processed')
a2a

Unnamed: 0,AuthorTokenA,AuthorTokenB,MeanSimilarity_Score,NumEdges,NumMutualSimiliarTweets,NumTweetsA,NumTweetsB,NumTweetsCombined,UserA,a_is_verified,a_prof_img_url,a_description,UserB,b_is_verified,b_prof_img_url,b_description
0,3.091670e+09,4.299009e+09,0.881317,2.0,4.0,3.0,1.0,4.0,comedianarthur,False,https://pbs.twimg.com/profile_images/157486347...,YHTBH podcast out everywhere every Monday. Hou...,hippie_79,False,https://pbs.twimg.com/profile_images/156065396...,
3,3.941071e+07,4.299009e+09,0.890870,1.0,3.0,3.0,1.0,4.0,Cthacker1987,False,https://pbs.twimg.com/profile_images/121387058...,Just me!,hippie_79,False,https://pbs.twimg.com/profile_images/156065396...,
6,1.137970e+09,4.299009e+09,0.887860,1.0,4.0,4.0,1.0,5.0,darkALLYE,False,https://pbs.twimg.com/profile_images/151716589...,Marketer. Baker. Photographer. Liberty and Fre...,hippie_79,False,https://pbs.twimg.com/profile_images/156065396...,
10,3.912859e+07,4.299009e+09,0.884027,1.0,4.0,1.0,1.0,2.0,itsbrookielou,False,https://pbs.twimg.com/profile_images/147883697...,Vegas Born. 💙 Ella. 🐶,hippie_79,False,https://pbs.twimg.com/profile_images/156065396...,
13,1.559754e+18,4.299009e+09,0.881764,1.0,2.0,1.0,1.0,2.0,OrtizOrosa3445,False,https://pbs.twimg.com/profile_images/155975400...,God does.,hippie_79,False,https://pbs.twimg.com/profile_images/156065396...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56157476,1.781722e+07,7.682630e+17,0.893968,2.0,3.0,1.0,2.0,3.0,mamatoria,False,https://pbs.twimg.com/profile_images/464763219...,"educator, proud Pirate mom, unbelievably grate...",VintageloverH,False,https://pbs.twimg.com/profile_images/140673640...,Recovering lawyer. I teach Constitutional Law ...
56157639,2.249620e+08,1.874799e+09,0.985652,1.0,2.0,1.0,1.0,2.0,yespunjab,False,https://pbs.twimg.com/profile_images/127970296...,https://t.co/Hfj7lUJJPV #News #Entertainment #...,TheHansIndiaWeb,False,https://pbs.twimg.com/profile_images/119325921...,The Hans India is a leading English news paper...
56158411,2.709679e+09,4.000142e+08,0.995655,6.0,12.0,6.0,2.0,8.0,Fexcogroup,False,https://pbs.twimg.com/profile_images/154396956...,"A World Leader In Innovative #FinTech, #Paymen...",FexcoDCC,False,https://pbs.twimg.com/profile_images/985612987...,Fexco is the originator of Dynamic Currency Co...
56158741,3.783896e+07,1.170733e+18,0.889373,1.0,2.0,1.0,3.0,4.0,jfrankk13,False,https://pbs.twimg.com/profile_images/152954816...,She/her. I take photos sometimes - https://t.c...,desecratedhost,False,https://pbs.twimg.com/profile_images/147273498...,saw girlboyfriend
