# Embolization Twitter Analysis

In [1]:
import numpy as np # Imports
import pandas as pd
from scipy import stats
from statistics import mean, stdev
import requests
import re
import ast


In [2]:
data = pd.read_csv('embolization.csv') #Reading in the CSV

In [3]:
data.columns #The columns in the CSV

Index(['tweet_id', 'username', 'name', 'profile_picture', 'replies',
       'retweets', 'likes', 'is_retweet', 'posted_time', 'content', 'hashtags',
       'mentions', 'images', 'videos', 'tweet_url', 'link'],
      dtype='object')

In [4]:
data.head() #A sample of the top of the CSV

Unnamed: 0,tweet_id,username,name,profile_picture,replies,retweets,likes,is_retweet,posted_time,content,hashtags,mentions,images,videos,tweet_url,link
0,1579179890334736385,MountSinaiNeuro,Mount Sinai Neurosurgery,https://pbs.twimg.com/profile_images/933014890...,0,2,19,False,2022-10-09T18:40:00+00:00,@Neurosurgery\n Clinical Trial Results at #202...,"['2022CNS', 'aneurysm', 'embolization', 'Neuro...","['Neurosurgery', 'JMoccoMD', 'MountSinaiNeuro'...",['https://pbs.twimg.com/media/FepZ8yQVIAAZ0Ox?...,[],https://twitter.com/MountSinaiNeuro/status/157...,
1,1579178650947424256,0kanGurkan,Okan Gürkan,https://pbs.twimg.com/profile_images/157903658...,0,1,2,False,2022-10-09T18:35:04+00:00,Trancathater #genicular #artery #embolization ...,"['genicular', 'artery', 'embolization']",[],['https://pbs.twimg.com/media/FepeblHWQAAWenT?...,[],https://twitter.com/0kanGurkan/status/15791786...,
2,1579152927612366848,adelmaged9,adel maged,https://pbs.twimg.com/profile_images/157160468...,0,0,0,False,2022-10-09T16:52:51+00:00,LV thrombus\nAcute limb ischemia is suspected ...,[],[],[],[],https://twitter.com/adelmaged9/status/15791529...,
3,1578988028345085952,erdman_janiya,Janiya_Erdman,https://pbs.twimg.com/profile_images/155467774...,0,0,0,False,2022-10-09T05:57:36+00:00,Embolization Therapy: Principles and Clinical ...,[],[],[],[],https://twitter.com/erdman_janiya/status/15789...,https://t.co/1B6ZieL3N7
4,1578852335404879872,thegestgroup,GEST -Global Embolization Symposium & Technolo...,https://pbs.twimg.com/profile_images/118210273...,1,1,10,True,2022-10-08T20:58:25+00:00,@SidpadiaIR\n is one of the Course Directors f...,[],['SidpadiaIR'],['https://pbs.twimg.com/media/FekBkkTXwAE80r7?...,[],https://twitter.com/thegestgroup/status/157885...,


## 2 Sided T-tests 

Comparing the amount of likes with an image attached or no image, and there is a significant change

In [5]:
stats.ttest_ind(data['likes'].values, data[data['images'] != "[]"]["likes"].values, trim=.2) #Evaluating if there is a significant change in the amount of likes if you include an image, and there the ttest is statistically significant

Ttest_indResult(statistic=-8.646123593413298, pvalue=1.457381123262676e-17)

In [6]:
print(mean(data['likes'].values))
print(mean(data[data['images'] != "[]"]["likes"].values)) #Shows that the mean of the amount of likes is much higher with images

9
15


Comparing the amount of likes with hashtags, and there is a significant change

In [7]:
data.hashtags = data.hashtags.apply(lambda s: list(ast.literal_eval(s))) #Changing the format of the strings

In [8]:
stats.ttest_ind(data['likes'].values, data[data['hashtags'].map(len) > 0]["likes"].values, trim=.2) #Did the same as above, but with the hastags instead

Ttest_indResult(statistic=-3.1515501973456175, pvalue=0.001660415436384642)

In [9]:
print(mean(data['likes'].values))
print(mean(data[data['hashtags'].map(len) > 0]["likes"].values)) #Shows that the mean is the same, strange result

9
9


In [10]:
print(stdev(data['likes'].values))
print(stdev(data[data['hashtags'].map(len) > 0]["likes"].values)) #Shows that the stdev of the hastags is smaller, representing a tighter distribution, and maybe the likes are skewed to a higher number by outliers

26.248809496813376
22.80350850198276


### Hashtags

In [11]:
hashtagDict = {} # Getting the hashtags and the amount of likes they have, eliminating hashtags with just one tweet

for index, row in data.iterrows():
    for hashtag in row['hashtags']:
        if hashtag in hashtagDict:
            hashtagDict[hashtag].append(row['likes'])
        else:
            hashtagDict[hashtag] = [row['likes']]

hashtagDictFil = {}
for item in hashtagDict:
    if len(hashtagDict[item]) > 1:
        hashtagDictFil[item] = sum(hashtagDict[item])

In [12]:
{k: v for k, v in sorted(hashtagDictFil.items(), key=lambda item: item[1], reverse=True)} #sorting by highest value

{'embolization': 908,
 'irad': 747,
 'IRad': 672,
 'Embolization': 669,
 'SNIS2022': 568,
 'CSDH': 240,
 'Neurointervention': 222,
 'interventionalradiology': 207,
 'PAE': 197,
 'Fibroids': 173,
 'IR': 159,
 'Neurosurgery': 142,
 'UFE': 139,
 'radiology': 134,
 'CIRSE2022': 125,
 'MedTwitter': 124,
 'IRAD': 123,
 'cSDH': 120,
 'MMA': 118,
 'MedEd': 113,
 'Chestrad': 105,
 'surgery': 104,
 'BPH': 99,
 'jvir': 93,
 'SIH': 91,
 'Vascular': 90,
 'prostate': 89,
 'RelentlessPursuit': 82,
 'MensHealth': 82,
 'radres': 81,
 'TAEGR': 81,
 'Hemorrhoid': 78,
 'TwittIR': 76,
 'endovascular': 75,
 'ThyroidRFA': 75,
 'Endotwitter': 75,
 'fibroid': 72,
 'neurosurgery': 67,
 'CSFVF': 67,
 'trauma': 66,
 'iRad': 65,
 'fibroidawareness': 64,
 'Artery': 63,
 'Commentary': 61,
 'IRadRes': 58,
 'INR': 58,
 'SDH': 58,
 'cancer': 54,
 'LiverCancerAwarenessMonth': 52,
 'oncology': 51,
 'fibroids': 51,
 'Irad': 50,
 'kidneycancer': 50,
 'twittIR': 50,
 'MSK': 48,
 'embolotherapy': 48,
 'aneurysm': 47,
 'STREA

In [13]:
hashtagFullDict = {} #Getting the number of likes per tweet with the hashtag

for index, row in data.iterrows():
    for hashtag in row['hashtags']:
        if hashtag in hashtagFullDict:
            hashtagFullDict[hashtag].append(row['likes'])
        else:
            hashtagFullDict[hashtag] = [row['likes']]

General hashtags (IR or MedTwitter) won't gain as much traction as just putting the topic as the hashtag. While a general audience will see it with the hashtag, it won't get more engagement

In [14]:
stats.ttest_ind(hashtagFullDict["embolization"], hashtagFullDict["IR"], trim=.2) #Embolization vs IR

Ttest_indResult(statistic=-1.4752296624357657, pvalue=0.14417612214930423)

In [15]:
print(mean(hashtagFullDict["embolization"]))
print(mean(hashtagFullDict["IR"])) #Embolization vs IR

8.035398230088495
9.352941176470589


In [16]:
stats.ttest_ind(hashtagFullDict["embolization"], hashtagFullDict["MedTwitter"], trim=.2)

Ttest_indResult(statistic=-1.3192317226301213, pvalue=0.19105251088151712)

In [17]:
print(mean(hashtagFullDict["embolization"]))
print(mean(hashtagFullDict["MedTwitter"])) #Embolization vs IR

8.035398230088495
8.266666666666667


In [18]:
print(stdev(hashtagFullDict["embolization"]))
print(stdev(hashtagFullDict["MedTwitter"])) #Embolization vs IR

16.124476294673784
8.811248222476499


More specific hashtags, related to conferences or papers, will get more engagement

In [19]:
stats.ttest_ind(hashtagFullDict["embolization"], hashtagFullDict["SNIS2022"], trim=.2)

Ttest_indResult(statistic=-4.53846255758433, pvalue=2.0557960394235194e-05)

In [20]:
print(mean(hashtagFullDict["embolization"]))
print(mean(hashtagFullDict["SNIS2022"])) #Embolization vs IR

8.035398230088495
40.57142857142857


In [21]:
print(stdev(hashtagFullDict["embolization"]))
print(stdev(hashtagFullDict["SNIS2022"])) #Embolization vs IR

16.124476294673784
90.79115026825559


In [22]:
stats.ttest_ind(hashtagFullDict["embolization"], hashtagFullDict["CIRSE2022"], trim=.2)

Ttest_indResult(statistic=-2.888233807723638, pvalue=0.0051134101983464275)

In [23]:
print(mean(hashtagFullDict["embolization"]))
print(mean(hashtagFullDict["CIRSE2022"])) #Embolization vs IR

8.035398230088495
17.857142857142858


In [24]:
print(stdev(hashtagFullDict["embolization"]))
print(stdev(hashtagFullDict["CIRSE2022"])) #Embolization vs IR

16.124476294673784
22.534629435815532


### Articles or Images

In [25]:
def get_original_twitter_url(twitter_url):
    # without masking it as a browser request, it wont work properly
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    r = requests.get(url = twitter_url,headers=headers)
    data = r.text
    try:
        url = re.search("(?P<url>https?://[^\s]+)\"", data).group("url")
        return url
    except:
        return "nothing"
print(get_original_twitter_url('https://t.co/ESwzZO0ncA'))

https://www.frontiersin.org/articles/10.3389/fneur.2022.1009914?utm_source=S-TWT&amp;utm_medium=SNET&amp;utm_campaign=ECO_FNEUR_XXXXXXXX_auto-dlvrit


In [26]:
numOfLikes = []
for index, row in data[data['link'].notnull()].iterrows():
    value = ('article' or 'report') in get_original_twitter_url(row['link'])
    value1 = ('article' or 'report') in row.content
    if value or value1:
        numOfLikes.append(row['likes'])
print(numOfLikes)


[1, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 18, 5, 0, 26, 12, 14, 13, 0, 0, 2, 1, 0, 0, 5, 38, 2, 19, 1, 2, 0, 4, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 1, 0, 0, 0, 0, 6, 2, 0, 0, 5, 2, 0, 0, 3, 28, 15, 1, 1, 2, 0, 4]


In [27]:
stats.ttest_ind(numOfLikes, data[data['images'] != "[]"]["likes"].values, trim=.2)

Ttest_indResult(statistic=-3.1648894081766095, pvalue=0.001652982461614633)

In [28]:
print(mean(data[data['images'] != "[]"]["likes"].values)) #Images get more likes than articles...
print(mean(numOfLikes))

15
3.787878787878788
