The goal is to extract info about twitter users.
1. extract their past tweets
2. extract their network (followers and following)
3. extract a random sample of the tweets of the followers
4. check if the tweets include text about climate change

# Technical considerations

To compute the measures within the scope of this Sciathon, and due to Twitter API limitations, we restricted our analysis to 8 accounts. These accounts represent users who are climate change activists or deniers, are related to the Lindau Nobel Meetings, or should be agnostic to the subject. The percentage of tweets on climate change was measured on a subset of 200 tweets per account, where retweets were counted the same as regular tweets. Followers with fewer than 10 tweets in total were excluded from the analysis. For the network analysis, we extracted the first 10,000 listed followers of each account and compared them to the first 10,000 followers of Greta Thunberg, who has many followers and actively tweets about climate change.
The account metrics were calculated using the Twitter developer API, accessed through the Python package tweepy and the R package rtweet. Plots and further analyses were produced in Python (with numpy, pandas and matplotlib) and in R.

**NOTE:** As real-time data was used, the analysis cannot be replicated with identical results, though the general trends are expected to be consistent over time.

In [None]:
# alternative to conda
# !pip install tweepy


In [None]:
# libraries
import numpy as np
import pandas as pd
import tweepy
import json
import os
import time



In [None]:
ll ~/twitter-keys.json

In [None]:
# twitter auth to use

# setting variables: tokens: # NOTE: THESE ARE SENSITIVE INFORMATION AND THEREFORE NOT SHARED HERE.

# copy the keys-template.json file from the code directory to your home directory (or modify the path below), 
# and enter the respective keys and tokens. 

# Read the API credentials from ~/twitter-keys.json (see the template
# mentioned above); the file must provide the four keys used below.
with open(os.path.join(os.path.expanduser("~"), 'twitter-keys.json'), 'r') as ifile:
    keys = json.load(ifile)

auth = tweepy.OAuthHandler(keys['consumer_key'], keys['consumer_secret'])
auth.set_access_token(keys['access_token'], keys['access_token_secret'])

# `api` is the shared client used by every function further down.
api = tweepy.API(auth)

# Verify that authentication works by printing the first 5 tweets of one's
# own home timeline (and casually check twitter while at work).
public_tweets = api.home_timeline()
for n, tweet in enumerate(public_tweets):
    # Test the bound before printing so exactly 5 tweets are shown, not 6.
    if n >= 5:
        break
    print(tweet.text)

In [None]:
def limit_handled(cursor):
    """Yield the items of ``cursor``, pausing and retrying on tweepy errors.

    ``cursor`` must expose a ``next()`` method (as the tweepy item iterators
    used in this file do). Whenever fetching the next item raises
    ``tweepy.TweepError`` the error is printed, the generator sleeps for 60
    seconds and then tries again. The generator ends when the cursor raises
    ``StopIteration``.

    NOTE: every ``TweepError`` is retried, not only rate-limit errors, and
    there is no upper bound on the number of retries.
    """
    while True:
        try:
            yield cursor.next()
        except StopIteration:
            # The cursor is exhausted: finish the generator cleanly.
            return
        except tweepy.TweepError as error:
            print(f'Pausing for 1 min due to {error}.')
            time.sleep(60)

In [None]:
# Hashtags that mark a tweet as climate-change related. They are stored
# lower-case and without the leading '#', which is how they are compared
# against the hashtag entities of a tweet.
cc_hashtags = "#globalwarming #climatebrawl #climategate #climatechange #savetheplanet #environment #nature #climate #climatecrisis #ecofriendly #earth #sustainability #climatechangeisreal #climateemergency #climateaction #climatestrike #gogreen #zerowaste #gretathunberg #fridaysforfuture #green #savetheearth #plasticfree #pollution #sustainable #climatejustice #recycle #saveourplanet #globalwarmingisreal #eco #bhfyp"
cc_hashtags = cc_hashtags.split("#")
# 'gobalchange' is kept from the original list; 'globalchange' is the
# spelling that was presumably intended.
cc_hashtags.extend(['climatedenial', 'nca4', 'gobalchange', 'globalchange', 'bigoil', 'ecofraud', 'climatechangehoax', 'climateaction', 'climatechange', 'climatecrisis', 'globalwarming', 'climateaction', 'climate', 'climateemergency', 'climatehysteria', 'climatehoax', 'climatealarmism', 'climatechangealarmism', 'climatechangehoax', 'climatechangehysteria', 'globalwarminghysteria', 'climatechangefraud', 'climateemergencyhoax', 'climatechangescam', 'globalwarminghoax', 'globalwarmingalarmism', 'globalwarmingcult', 'climatechangefrenzy', 'globalwarmingfraud', 'globalwarmingscam', 'globalwarmingnonsense', 'globalwarmingbullshit', 'climatefraud', 'climatescam', 'climatecult', 'climatenonsense', 'climatebullshit', 'climatechangebullshit', 'climatechangenonsense', 'climatechangecult'])
# Strip the separating blanks and drop empty fragments (e.g. the one before
# the first '#'); the set removes the duplicates between both lists.
cc_hashtags = {x.strip() for x in cc_hashtags if x.strip()}

# Lower-case phrases searched as substrings of the lower-cased tweet text.
# 'global warming' replaces the former typo 'global warning'.
cc_keywords = {'climate change', 'global warming', 'carbon dioxide', 'greenhouse gas', 'emissions', 'weather vs climate', 'fossil fuels', 'sea-level rise', 'global average temperature', 'renewable energy', 'unfccc', 'indc', 'ipcc', 'greenhouse effect', 'the denial machine', 'clexit coalition'}
print(cc_hashtags)
print(cc_keywords)

In [None]:
def cc_tweet_percentage(user_handle, hashtag_set=cc_hashtags, word_set=cc_keywords, max_number_of_tweets=200):
    """Count how many tweets on a user's timeline are about climate change.

    The tweets are read through ``api.user_timeline``. A tweet counts as a
    climate-change tweet when one of its hashtags (lower-cased) is contained
    in ``hashtag_set`` or, failing that, when its lower-cased text contains
    one of the phrases in ``word_set``.

    Args:
        user_handle: screen name of the account to inspect.
        hashtag_set: set of lower-case hashtags without the leading '#'
            (must provide ``isdisjoint``).
        word_set: iterable of lower-case phrases matched as substrings.
        max_number_of_tweets: maximum number of tweets to inspect; a falsy
            value (``None`` or ``0``) inspects everything the cursor yields.

    Returns:
        A tuple ``(cc_tweets, tweet_count)``: the number of climate-change
        tweets and the number of tweets inspected. Despite the function name
        no percentage is computed here; callers divide the two values.
    """
    tweet_count = 0
    cc_tweets = 0

    timeline = limit_handled(tweepy.Cursor(api.user_timeline, screen_name=user_handle).items())
    for status in timeline:
        # Check the cap *before* counting: otherwise the tweet that triggers
        # the break is counted too and tweet_count ends up one too high.
        if max_number_of_tweets and tweet_count >= max_number_of_tweets:
            break
        tweet_count += 1

        hashtags = {tag['text'].lower() for tag in status._json['entities']['hashtags']}
        if not hashtag_set.isdisjoint(hashtags):
            # A listed hashtag is present; no need to look at the text.
            cc_tweets += 1
            continue

        text = status.text.lower()  # lower-case once, not once per phrase
        if any(phrase in text for phrase in word_set):
            cc_tweets += 1

    print(f'{user_handle} -> cc_tweets: {cc_tweets}, tweet_count: {tweet_count}')
    return cc_tweets, tweet_count



In [None]:

def influencer_cc_score(influencer_handle, max_followers=200, min_statuses=10, country_codes={'US'}):
    """Measure how much an account's followers tweet about climate change.

    At most ``max_followers`` followers listed by ``api.followers`` are
    examined. Followers with fewer than ``min_statuses`` statuses, protected
    followers and followers for whom no tweet could be read are skipped. For
    every remaining follower the fraction of climate-change tweets is
    computed with ``cc_tweet_percentage``.

    Args:
        influencer_handle: screen name of the account whose followers are
            scored.
        max_followers: upper bound on the number of followers examined; it
            is also forwarded to ``api.followers`` as ``count``.
        min_statuses: minimum ``statuses_count`` a follower needs in order
            to be scored.
        country_codes: currently unused; the country filter is disabled.

    Returns:
        A list with one fraction (``cc_tweets / tweet_count``) per scored
        follower.
    """
    from itertools import islice

    print(f'calculating the influencer_cc_score for {influencer_handle}.')
    cc_tweet_percentages = []
    followers = limit_handled(tweepy.Cursor(api.followers, screen_name=influencer_handle, count=max_followers).items())

    # islice enforces the bound for every follower, including those skipped
    # by the filters below, and stops without requesting a further item from
    # the rate-limited API once the bound is reached.
    for follower in islice(followers, max_followers):
        # Exclude followers with fewer than `min_statuses` tweets.
        if follower._json['statuses_count'] < min_statuses:
            continue
        # Skip protected accounts.
        if follower.protected:
            continue

        cc, total = cc_tweet_percentage(follower.screen_name)
        if total == 0:
            # No tweet could be read, so there is no fraction to compute.
            continue
        cc_tweet_percentages.append(cc / total)

    print(f'Got cc percentages: {cc_tweet_percentages}')
    return cc_tweet_percentages

In [None]:
def get_follower_count(handle):
    """Return the ``followers_count`` field of the user found via ``api.get_user``."""
    profile = api.get_user(handle)._json
    return profile['followers_count']

In [None]:
def percentage_above_threshold(values, threshold):
    """Return the share of ``values`` that is strictly greater than ``threshold``.

    The result is a fraction between 0 and 1 (not multiplied by 100). An
    empty ``values`` yields ``np.nan``.
    """
    n_values = len(values)
    if n_values == 0:
        return np.nan
    exceeds = np.asarray(values) > threshold
    # Summing the boolean mask along the first axis counts the True entries.
    return exceeds.sum(axis=0) / n_values

In [None]:
# reworked:

def calculate_influencer_df(influencer_list, influencer_dict):
    """Collect the climate-change statistics for every account in a list.

    For each handle in ``influencer_list`` an entry is created in
    ``influencer_dict`` and filled step by step, so that values already
    computed remain available in the dict if a later step raises.

    Args:
        influencer_list: iterable of screen names.
        influencer_dict: dict that receives one sub-dict per screen name; it
            is modified in place.

    Returns:
        The same ``influencer_dict`` object (a plain dict, despite the
        function name).
    """
    for influencer in influencer_list:
        stats = influencer_dict[influencer] = {}
        cc_tweets, tweet_count = cc_tweet_percentage(influencer)
        stats['FollowerCount'] = get_follower_count(influencer)

        print(f'Influencer: {influencer} -> {cc_tweets} / {tweet_count} tweets mentioning climate change')

        # Without any tweets the fraction is undefined: store NaN instead of
        # raising ZeroDivisionError and aborting the whole run.
        stats['cc_tweet_percentage'] = cc_tweets / tweet_count if tweet_count else np.nan
        stats['cc_tweetcc_tweet_absolute_percentage'] = cc_tweets
        stats['total_tweets'] = tweet_count

        follower_cc_percentages = influencer_cc_score(influencer)

        stats['follower_cc_score_raw'] = follower_cc_percentages
        stats['follower_cc_score_n_entries'] = len(follower_cc_percentages)
        # np.mean([]) would also give NaN but emits a RuntimeWarning.
        stats['follower_cc_score_mean'] = np.mean(follower_cc_percentages) if follower_cc_percentages else np.nan
        stats['follower_cc_score_any'] = percentage_above_threshold(follower_cc_percentages, 0.0)

        print(f'. Follower cc tweet percentages: {follower_cc_percentages}')

    return influencer_dict

In [None]:


# Screen names of the accounts to analyse.
influencer_list = [
    'ClimateDepot',
    'LeoDiCaprio',
    'Beyonce',
    'R_Thaler',
    'williamusa22',
    'GretaThunberg',
    'realdonaldtrump',
    'paulkrugman',
    'billieeilish',
    'bdemelle',
    'junkscience',
    'lindaunobel',
]
influencer_dict = {}

# The dict is filled in place; the bare name on the last line displays the
# result as the output of the notebook cell.
filled_influencer_dict = calculate_influencer_df(influencer_list, influencer_dict)
filled_influencer_dict


In [None]:
# Due to errors, the dict object could not be retrieved. Instead we parsed the logs with the following methods:


In [None]:
# parsing the logs

columns = ['Name', 'FollowerCount', 'CCTweets', 'TotalTweets']


def parse_logs(logs, influencers, columns):
    """Rebuild the per-influencer statistics from the printed output of a run.

    Two kinds of lines are interpreted; every other line (warnings, rate
    limit pauses, progress messages, blank or truncated lines) is ignored:

    * ``Influencer: <name> -> <cc> / <total> tweets mentioning climate change``
      starts the section of ``<name>`` and stores its own counts.
    * ``<handle> -> cc_tweets: <cc>, tweet_count: <total>`` is recorded as a
      follower of the influencer whose section is currently open. Lines whose
      handle is one of ``influencers`` are skipped (they are the summary an
      influencer prints for its own timeline), as are lines that appear
      before the first section and followers with a tweet count of zero.

    Args:
        logs: the captured output as one string.
        influencers: screen names expected in the log.
        columns: unused; kept for compatibility with existing callers.

    Returns:
        A dict mapping every name in ``influencers`` to a dict. For names
        with a section in the log that dict has the keys ``CCTweets`` and
        ``TotalTweets`` (strings as found in the log), ``CCPercentage``
        (float, NaN when the total is zero), ``FollowerCCPercentagesRaw``
        (list of ``[cc, total]``) and ``FollowerCCPercentages`` (list of
        floats). Names without a section map to an empty dict.

    Raises:
        KeyError: if the log contains a section for a name that is not in
            ``influencers``.
    """
    known_influencers = set(influencers)
    out = {influencer: {} for influencer in influencers}
    current = None  # influencer whose follower lines are being read

    for line in logs.split('\n'):
        splits = line.split()
        if not splits:
            continue

        if splits[0] == 'Influencer:':
            # Influencer: williamusa22 -> 0 / 1 tweets mentioning climate change
            current = splits[1]
            cc_tweets, total_tweets = splits[3], splits[5]
            entry = out[current]
            entry['CCTweets'] = cc_tweets
            entry['TotalTweets'] = total_tweets
            total = int(total_tweets)
            entry['CCPercentage'] = int(cc_tweets) / total if total else float('nan')
            entry['FollowerCCPercentages'] = []
            entry['FollowerCCPercentagesRaw'] = []
            # The header is fully handled; it must not be parsed a second
            # time as a follower line below.
            continue

        # <handle> -> cc_tweets: <cc>, tweet_count: <total>
        is_count_line = (
            len(splits) == 6
            and splits[1] == '->'
            and splits[2] == 'cc_tweets:'
            and splits[4] == 'tweet_count:'
        )
        if not is_count_line:
            continue
        if current is None or splits[0] in known_influencers:
            continue

        try:
            cc_tweets = int(splits[3].replace(',', ''))
            total_tweets = int(splits[5])
        except ValueError:
            continue  # damaged line, nothing usable in it
        if total_tweets == 0:
            continue  # no tweets, so no fraction can be computed

        out[current]['FollowerCCPercentagesRaw'].append([cc_tweets, total_tweets])
        out[current]['FollowerCCPercentages'].append(cc_tweets / total_tweets)
    return out


In [None]:
# Accounts whose statistics are rebuilt from the captured output below; this
# rebinds the name `influencer_list` to a shorter list of four handles.
influencer_list = ['williamusa22', 'GretaThunberg', 'realdonaldtrump', 'paulkrugman']


# Captured printed output of a run, handed to parse_logs in the next cell.
# NOTE(review): the final '...' line looks like a marker for output that was
# cut from this notebook - confirm where the full log is kept.
log = """
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:9: DeprecationWarning: generator 'limit_handled' raised StopIteration
  if __name__ == '__main__':
williamusa22 -> cc_tweets: 0, tweet_count: 1
Influencer: williamusa22 -> 0 / 1 tweets mentioning climate change
calculating the influencer_cc_score for williamusa22.
Pausing for 1 min due to [{'message': 'Rate limit exceeded', 'code': 88}].
Pausing for 1 min due to [{'message': 'Rate limit exceeded', 'code': 88}].
...
"""

In [None]:
parsed = parse_logs(log, influencer_list, columns)

print('Printing return')
print(parsed)


# Derive the follower summary metrics for every influencer found in the log.
for influencer, stats in parsed.items():
    try:
        follower_percentages = stats['FollowerCCPercentages']
    except KeyError:
        # parse_logs only creates this key when the log contains an
        # 'Influencer:' line for the account.
        print(f'Problem for {influencer}')
        continue
    # Share of followers with at least one climate-change tweet; the helper
    # returns NaN for an empty list instead of dividing by zero.
    stats['FollowerNonZeroCCTweetsPercentage'] = percentage_above_threshold(follower_percentages, 0)
    # Guard the mean as well: np.mean([]) is NaN but emits a RuntimeWarning.
    stats['FollowerCCTweetAverage'] = np.mean(follower_percentages) if follower_percentages else np.nan
df = pd.DataFrame(parsed).T


# The raw per-follower lists are left out of the CSV; errors='ignore' keeps
# the export working when no influencer produced these columns.
print(df.drop(columns=['FollowerCCPercentages', 'FollowerCCPercentagesRaw'], errors='ignore').to_csv())

In [None]:
# the CSV files generated by this function were further analysed in a google spreadsheet.