In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

# Load the tweet dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("newdata_v1.csv")

In [None]:
# main hashtags to be used
main_hashtags = [
    'climatechange',
    'climatecrisis',
    'climateaction',
    'saveearth',
    'globalwarming',
    'savetheplanet',
    'climateemergency',
]

# --- Parse the raw 'Hashtags' column into cleaned tags -----------------------
hashtags = []             # every cleaned hashtag occurrence, in encounter order
groups_of_hashtags = []   # cleaned hashtags per tweet; needed for #5 to create edges
individual_hashtags = []  # unique cleaned hashtags, in first-seen order
seen_hashtags = set()     # constant-time membership companion of individual_hashtags

for msg in data['Hashtags']:
    hashtags_in_msg = []
    for hashtag in msg.split():
        # Drop list punctuation/quotes around the token, lowercase it and
        # remove stray backslashes, slashes and colons.
        tag = hashtag.strip('\'.,[]').lower().replace('\\', '').replace('/', '').replace(':', '')
        # Skip tokens containing an ellipsis character or a URL fragment.
        if "…" not in tag and "http" not in tag:
            hashtags.append(tag)
            hashtags_in_msg.append(tag)
            if tag not in seen_hashtags:
                seen_hashtags.add(tag)
                individual_hashtags.append(tag)
    groups_of_hashtags.append(hashtags_in_msg)

# --- Per-main-hashtag statistics ---------------------------------------------
# all individual usernames per main hashtag
# (one independent list per key: dict.fromkeys(keys, []) would make every key
# share the same list object, merging the users of all hashtags together)
individual_users = {hashtag: [] for hashtag in main_hashtags}
seen_users = {hashtag: set() for hashtag in main_hashtags}

# number of tweets per main hashtag
num_of_tweets = dict.fromkeys(main_hashtags, 0)

# count of individual users per main hashtag
individual_users_count = dict.fromkeys(main_hashtags, 0)

users = data['Username']

for index, tweet_hashtags in enumerate(data['Hashtags']):
    user = users[index]
    for hashtag in main_hashtags:
        # NOTE(review): this is a case-sensitive substring test against the raw
        # column text, so e.g. 'climateaction' also matches 'climateactionnow'.
        # Confirm whether exact matching on groups_of_hashtags[index] is wanted.
        if hashtag in tweet_hashtags:
            num_of_tweets[hashtag] += 1
            if user not in seen_users[hashtag]:
                seen_users[hashtag].add(user)
                individual_users[hashtag].append(user)
                individual_users_count[hashtag] += 1

print(len(individual_hashtags))
print(num_of_tweets)
print(individual_users_count)



In [None]:
# show number of tweets per hashtag
# The counts are already aggregated in num_of_tweets, so draw them directly as
# bars (one bar per hashtag, centred on its tick) instead of expanding them
# into one list entry per tweet and histogramming the labels.
plt.figure(figsize=(12, 8))
plt.bar(list(num_of_tweets.keys()), list(num_of_tweets.values()),
        color='green', edgecolor='black')
plt.ylabel('Number of tweets')
plt.xlabel('Hashtag used')
plt.show()

In [None]:
# show number of users per hashtag
# The counts are already aggregated in individual_users_count, so draw them
# directly as bars (one bar per hashtag, centred on its tick) instead of
# expanding them into one list entry per user and histogramming the labels.
plt.figure(figsize=(12, 8))
plt.bar(list(individual_users_count.keys()), list(individual_users_count.values()),
        color='green', edgecolor='black')
plt.ylabel('Number of individual users')
plt.xlabel('Hashtag used')
plt.show()

In [None]:


# Count tweets per language in a single pass. Languages are compared for exact
# equality (a substring test would also credit any language whose code happens
# to be contained in a longer code), and missing values are skipped.
lang_count = {}
for tweet_lang in data['Language'].dropna():
    lang_count[tweet_lang] = lang_count.get(tweet_lang, 0) + 1

# list of languages in tweets, in first-seen order
languages = list(lang_count)

# sort dictionary by value (most frequent language first)
lang_count = dict(sorted(lang_count.items(), key=lambda item: item[1], reverse=True))

# labels dict for the pie chart: the 4 most popular languages get their own
# slice, everything else is summed into 'other' (five slices in total)
# init 'other' up front so it can be incremented without a KeyError
labels = {'other': 0}

for rank, (lang, count) in enumerate(lang_count.items()):
    if rank < 4:
        labels[lang] = count
    else:
        labels['other'] += count

fig1, ax1 = plt.subplots()
ax1.pie(labels.values(), labels=labels.keys(), autopct='%1.1f%%', startangle=90)

plt.show()

In [None]:
import ast
import reverse_geocoder as rg
import pycountry

# show geographic information of the tweets

# Step 1: collect a (latitude, longitude) pair for every tweet that has one,
# so the reverse geocoder can resolve all of them in a single batched call.
latlongs = []
for geodata in data['Geo']:
    # Missing values arrive as float NaN; they carry no location.
    if isinstance(geodata, float):
        continue
    geodata = ast.literal_eval(geodata)
    if 'coordinates' not in geodata:
        continue
    point = geodata['coordinates']
    # The key may be present with an empty value; skip instead of crashing.
    if not point:
        continue
    # The stored pair is read as [longitude, latitude]; swap to (lat, long).
    latlongs.append((point['coordinates'][1], point['coordinates'][0]))

# Step 2: resolve all coordinates at once and tally tweets per country.
countries = {}
if latlongs:
    for location in rg.search(latlongs):
        country_code = location['cc']
        match = pycountry.countries.get(alpha_2=country_code)
        # Fall back to the raw code when pycountry has no entry for it.
        country = match.name if match is not None else country_code
        countries[country] = countries.get(country, 0) + 1

fig1, ax1 = plt.subplots()
ax1.pie(countries.values(), labels=countries.keys(), autopct='%1.1f%%', startangle=90)

plt.show()


In [None]:
#vader tool analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score every tweet text with VADER; one result per row of data['Text'],
# kept in the same order as the column.
analyzer = SentimentIntensityAnalyzer()
vader_data = [analyzer.polarity_scores(text) for text in data['Text']]

# Ternary scatter of the scores, using their "neg", "neu" and "pos" fields
# as the three axes.
fig = px.scatter_ternary(vader_data, a="neg", b="neu", c="pos")
fig.show()

In [None]:
#construct a social graph
import networkx as nx

# Hashtag co-occurrence graph: one node per unique hashtag, one undirected
# edge between two distinct hashtags that appear together in a tweet.
G = nx.Graph()
G.add_nodes_from(individual_hashtags)

for group in groups_of_hashtags:  # groups_of_hashtags is initialized at #1
    for position, hashtag in enumerate(group):
        # Only pair with later entries: the graph is undirected, so visiting
        # each unordered pair once is enough.
        for other_hashtag in group[position + 1:]:
            if hashtag != other_hashtag:  # a tag repeated in a tweet must not self-loop
                G.add_edge(hashtag, other_hashtag)

# Fixed seed so the layout (and therefore the drawing) is reproducible.
pos = nx.spring_layout(G, k=0.06, iterations=20, seed=42)
# NOTE(review): a 256x256 inch canvas is extremely large; confirm it renders
# within the memory available in the target environment.
plt.figure(figsize=(256, 256))
nx.draw(G, pos, node_size=30, font_size=15, with_labels=True, font_color='red')

# get the top 10 in centrality for task 9
# Repeatedly take the node with the highest degree centrality and remove it
# before picking the next one. This runs on a copy so that G itself stays
# intact for the analyses in the following cells.
pruned_graph = G.copy()
max_degrees = []
for _ in range(min(10, pruned_graph.number_of_nodes())):
    remaining_centrality = nx.degree_centrality(pruned_graph)
    currentmax = max(remaining_centrality, key=remaining_centrality.get)
    max_degrees.append(currentmax)
    pruned_graph.remove_node(currentmax)

# NOTE(review): hard-coded list, presumably copied from an earlier run of the
# loop above -- verify against max_degrees.
top10 = ['climatechange', 'climatecrisis', 'climateemergency', 'globalwarming', 'climateaction', 'savetheplanet', 'climate', 'climateactionnow', 'environment', 'sustainability']


In [None]:
# Main global properties of the hashtag graph G, collected into a one-row table.

# basic size
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# per-node degree centrality and raw degree view
degree_centrality = nx.degree_centrality(G)
degree = nx.degree(G)

# mean of the per-node degree centralities
average_degree_centrality = sum(degree_centrality.values()) / num_nodes

# nx.diameter raises NetworkXError here when G is not connected;
# report that case as text instead of failing.
try:
    dia = nx.diameter(G)
except nx.NetworkXError:
    dia = "infinite (not connected)"

clustering_coefficient = nx.average_clustering(G)

# connected components and the one with the most nodes
components = list(nx.connected_components(G))
largest_component = max(components, key=len)

graph_data = {
    'Number of nodes': num_nodes,
    'Number of edges': num_edges,
    'Average degree centrality': average_degree_centrality,
    'Diameter': dia,
    'Clustering coefficient': clustering_coefficient,
    'Size of largest component': len(largest_component),
}

main_properties_dataframe = pd.DataFrame(graph_data, index=['Normal properties'])

# degrees of four of the main hashtags, printed on one line
print(*(G.degree[tag] for tag in ('climatechange', 'climatecrisis', 'globalwarming', 'climateemergency')))
print(main_properties_dataframe)

In [None]:
# Side-by-side histograms: node degrees (left) and local clustering
# coefficients (right) of G.
local_clustering_sequence = sorted(nx.clustering(G).values(), reverse=True)
degrees = sorted(node_degree for _, node_degree in G.degree())

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

# left panel: degree distribution, x axis limited to 0..600
ax1.hist(degrees, bins=100, label=None, stacked=False, density=False)
ax1.set(title="Degree Distribution", xlabel="Degree", ylabel="Frequency", xlim=(0, 600))
ax1.tick_params(axis='x', which='minor', bottom=True, labelbottom=False)

# right panel: local clustering coefficient distribution on [0, 1]
ax2.hist(local_clustering_sequence, bins=20)
ax2.set(title="Local Clustering Coefficient Distribution",
        xlabel="Local Clustering Coefficient",
        xlim=(0, 1),
        ylabel="Frequency")

plt.tight_layout()
plt.show()

In [None]:
# label propagation algorithm for finding communities
communities = nx.algorithms.community.label_propagation.label_propagation_communities(G)
communities = [list(x) for x in communities]

# graph of communities: every community becomes a clique (all of its members
# are pairwise connected); there are no edges between different communities
G_com = nx.Graph()

for community in communities:
    G_com.add_nodes_from(community)
    edges = [(a, b) for idx, a in enumerate(community) for b in community[idx + 1:]]
    G_com.add_edges_from(edges)

# number of nodes in variable
num_nodes_com = G_com.number_of_nodes()
# number of edges in variable
num_edges_com = G_com.number_of_edges()

degree_centrality_com = nx.degree_centrality(G_com)
degree_com = nx.degree(G_com)

# average degree centrality
average_degree_centrality_com = sum(degree_centrality_com.values()) / num_nodes_com

# nx.diameter raises NetworkXError when the graph is not connected
try:
    dia_com = nx.diameter(G_com)
except nx.NetworkXError:
    dia_com = "infinite (not connected)"

# Average clustering coefficient, computed from the community sizes instead of
# calling nx.average_clustering(G_com) on the (potentially very dense) clique
# graph. Label propagation gives every node exactly one label, so G_com is a
# disjoint union of cliques: a node in a community of 3 or more members has
# local clustering 1, and a node in a community of 1 or 2 members has 0.
nodes_in_triangles = sum(len(community) for community in communities if len(community) >= 3)
clustering_coefficient_com = nodes_in_triangles / num_nodes_com if num_nodes_com else 0

# size of the largest component
components_com = list(nx.connected_components(G_com))
largest_component_com = max(components_com, key=len)

com_data = {'Number of nodes': num_nodes_com,
            'Number of edges': num_edges_com,
            'Average degree centrality': average_degree_centrality_com,
            'Diameter': dia_com,
            'Clustering coefficient': clustering_coefficient_com,
            'Size of largest component': len(largest_component_com)}

com_properties_dataframe = pd.DataFrame(com_data, index=['Community properties'])
compare_properties_dataframe = pd.concat([main_properties_dataframe, com_properties_dataframe])
print(compare_properties_dataframe)

In [None]:
#Task 9
import tweepy
import botometer

# Running tally of Botometer verdicts: [bots, humans, protected accounts].
# NOTE(review): the starting counts add up to the starting `index`
# (207 + 2130 + 7 = 2344), so they look like results carried over from an
# earlier partial run -- confirm before re-running from a different position.
bots = [207, 2130, 7]
index = 2344

# find tweets connected to the top 10 hashtags
key_hashtags = {'climatechange', 'climatecrisis', 'climateemergency', 'globalwarming',
                'climateaction', 'savetheplanet', 'climate', 'climateactionnow',
                'environment', 'sustainability'}
key_players_pos = [position
                   for position, group in enumerate(groups_of_hashtags)
                   if key_hashtags.intersection(group)]
# NOTE(review): key_players_pos is not used by the request loop below, which
# walks every row from `index` onwards -- confirm whether that is intended.

# Initialize rapid API
# NOTE: replace the placeholders locally and never commit real credentials;
# prefer loading them from the environment or a secrets store.
rapidapi_key = "xxxx"
twitter_app_auth = {
    'consumer_key': 'xxxx',
    'consumer_secret': 'xxxx',
    'access_token': 'xxxx',
    'access_token_secret': 'xxxx',
}
bom = botometer.Botometer(wait_on_ratelimit=True,
                          rapidapi_key=rapidapi_key,
                          **twitter_app_auth)

# request data: classify every remaining account, stopping at the end of the
# column instead of running until an out-of-range lookup raises
user_ids = data['UserID']
while index < len(user_ids):
    ID = user_ids[index]
    print(ID)
    try:
        result = bom.check_account(ID)
    except tweepy.error.TweepError:
        # The lookup failed on the Twitter side; counted as a protected account.
        print("protected")
        bots[2] += 1
    else:
        # An account is counted as a bot when its overall universal raw score
        # reaches the universal CAP value returned alongside it.
        if result["raw_scores"]["universal"]["overall"] >= result["cap"]["universal"]:
            bots[0] += 1
        else:
            bots[1] += 1
    index += 1

# draw pie chart of the final tally (one label per entry of `bots`)
account_labels = 'bots', 'humans', 'protected'
fig1, ax1 = plt.subplots()
ax1.pie(bots, labels=account_labels, autopct='%1.1f%%', startangle=90)

plt.show()

In [None]:
# task 10: rank hashtags by the accumulated like/retweet score of the tweets
# that use them

# Score per hashtag, keyed by tag for constant-time updates. Only hashtags
# that passed the earlier cleaning (individual_hashtags) are ranked.
hashtag_scores = dict.fromkeys(individual_hashtags, 0)

for text, tags, like_count, retweet_count in zip(
        data['Text'], data['Hashtags'], data['Likes'], data['Retweets']):
    # if retweet, don't calculate score since all the data transfers from the
    # original tweet to retweets
    if text.startswith("RT"):
        continue

    # square roots dampen very popular tweets; retweets weigh three times as
    # much as likes
    likes = np.sqrt(like_count)
    retweets = np.sqrt(retweet_count)
    final_score = likes + (retweets * 3)

    # loop through the hashtags in this tweet
    for tag in tags.split():
        # clean the tag the same way as when individual_hashtags was built
        tag = tag.strip('\'.,[]').lower().replace('\\', '').replace('/', '').replace(':', '')

        # add new score to all hashtags that were included in this tweet
        if tag in hashtag_scores:
            hashtag_scores[tag] += final_score

# (tag, score) pairs in first-seen hashtag order
tweet_score_rankings = list(hashtag_scores.items())

# sort the ranking list, best score first
sorted_rankings = sorted(tweet_score_rankings, key=lambda tup: tup[1], reverse=True)

# 50 best ranked hashtags (fewer if the data has fewer than 50 hashtags)
for ranking in sorted_rankings[:50]:
    print(ranking)