In [1]:
import json_lines
import pandas as pd
import pickle
import numpy as np

import lxml.etree as etree # to create gexl file
import datetime

import geopandas
import geopy

# Find the IDs of the most influential users (the top 5,000 are kept)

In [15]:
# Define function to extract and measure user influence

def load_jsonl(file):
    tweets = []
    with open(file, 'rb') as f:
        for tweet in json_lines.reader(f, broken=True): 
            reduced_tweet = {
                'created_at'    : tweet['created_at'],
                'id'            : tweet['id_str'],
                'text'          : tweet['full_text'],
                'retweet_count' : tweet['retweet_count'],               
                'like_count'    : tweet['favorite_count'],
                
                'reply_user_id' : tweet['in_reply_to_user_id_str'],
                'reply_id'      : tweet['in_reply_to_status_id_str'],
                'reply_username': tweet['in_reply_to_screen_name'],

                'username'    : tweet['user']['screen_name'],
                'user_joined' : tweet['user']['created_at'][-4:],
                'user_id'     : tweet['user']['id_str'],
                'location'    : tweet['user']['location'],
                'followers'   : tweet['user']['followers_count'],
                'friends'     : tweet['user']['friends_count']
            }               
            if 'retweeted_status' in tweet:
                reduced_tweet['retweets'] = {
                    'created_at'    : tweet['retweeted_status']['created_at'],
                    'id'            : tweet['retweeted_status']['id_str'],
                    'text'          : tweet['retweeted_status']['full_text'],
                    'retweet_count' : tweet['retweeted_status']['retweet_count'],               
                    'like_count'    : tweet['retweeted_status']['favorite_count'],               

                    'username'    : tweet['retweeted_status']['user']['screen_name'],
                    'user_joined' : tweet['retweeted_status']['user']['created_at'][-4:],
                    'user_id'     : tweet['retweeted_status']['user']['id_str'],
                    'location'    : tweet['retweeted_status']['user']['location'],         
                    'followers'   : tweet['retweeted_status']['user']['followers_count'],
                    'friends'     : tweet['retweeted_status']['user']['friends_count'],
                }
            if 'quoted_status' in tweet:
                reduced_tweet['quotes'] = {
                    'created_at'    : tweet['quoted_status']['created_at'],
                    'id'            : tweet['quoted_status']['id_str'],
                    'text'          : tweet['quoted_status']['full_text'],
                    'retweet_count' : tweet['quoted_status']['retweet_count'],               
                    'like_count'    : tweet['quoted_status']['favorite_count'],               

                    'username'    : tweet['quoted_status']['user']['screen_name'],
                    'user_joined' : tweet['quoted_status']['user']['created_at'][-4:],
                    'user_id'     : tweet['quoted_status']['user']['id_str'],
                    'location'    : tweet['quoted_status']['user']['location'],         
                    'followers'   : tweet['quoted_status']['user']['followers_count'],
                    'friends'     : tweet['quoted_status']['user']['friends_count'],
                }    

            tweets.append(reduced_tweet)
        return (tweets)

In [None]:
# extract and measure user influence by file
for i in range(0,4):
    
    tweets = load_jsonl(r"D:\twarc\climatetweetID\hydratefile\climate_id_0%d.jsonl" % (i))
    pdtweet = pd.DataFrame(tweets)
    
    ori_tweet = pdtweet[pdtweet['retweets'].apply(lambda x: type(x) != dict)]
    
    # user unique information: (user name, id, location, followers)
    user_info = ori_tweet.drop_duplicates(subset = ['user_id'])
    user_info = user_info[["username","user_id","location","followers","friends"]]
    user_info.to_csv(r"D:\twarc\climatetweetID\tweets_prediction\influence\allusers_info\output_2\user_%d.csv" % (i))
    
    # user tweets summary: (retweet count, like count, id)
    tweetinfluence = ori_tweet.groupby('user_id').agg({
        'retweet_count' : 'sum',
        'like_count'    : 'sum',
        'id' : 'count'
    }).sort_values(by = ['retweet_count'], ascending=False)
    tweetinfluence.to_csv(r"D:\twarc\climatetweetID\tweets_prediction\influence\allusers_info\output_2\influence_%d.csv" % (i))
  
    print(i)

In [3]:
# Merge files and select top 10,000 influential users
influence = pd.DataFrame()
for i in range(0,4):
    influence_ = pd.read_csv(r"D:\twarc\climatetweetID\tweets_prediction\influence\allusers_info\output_2\influence_%d.csv" % (i),dtype=object)
    influence = influence.append(influence_)
influence = influence.reset_index(drop=True)
influence = influence.astype({'retweet_count': 'int32',
                              'like_count': 'int32',
                              'id': 'int32'})
influence = influence.groupby('user_id', as_index=False).agg({
        'retweet_count' : 'sum',
        'like_count'    : 'sum',
        'id' : 'sum'})
influence['influence'] = influence['retweet_count'] + influence['like_count']
influence = influence.sort_values(by = ['influence'], ascending=False)

# users info
userinfo = pd.DataFrame()
for i in range(0,4):
    userinfo_ = pd.read_csv(r"D:\twarc\climatetweetID\tweets_prediction\influence\allusers_info\output_2\user_%d.csv" % (i),dtype=object)
    userinfo  = userinfo.append(userinfo_)

userinfo = userinfo.drop_duplicates(subset = ['user_id'])
userinfo = userinfo.reset_index(drop=True)
userinfo = userinfo.drop(columns=['Unnamed: 0'])

# join top users and user information
influence = influence.set_index('user_id').join(userinfo.set_index('user_id'))
influence = influence.sort_values(by = ['influence'], ascending=False).reset_index()
influence = influence.rename(columns = {'id':'total_tweet'})

In [11]:
# select the top 10,000 and save
with open(r"D:\twarc\climatetweetID\tweets_prediction\influence\allusers_info\output_3\topinfluencer.txt", "wb") as fp:   #Pickling
        pickle.dump(influence.head(5000), fp)

In [65]:
# Ad-hoc check of the geocoder: look up a single place name and display the result
locator = geopy.Nominatim(user_agent= 'myGeocoder')
location = locator.geocode("michigan")
# bare last expression so the notebook renders the returned Location
location

Location(Michigan, United States, (43.6211955, -84.6824346, 0.0))

In [12]:
# select tweets only in the US
with open(r"D:\twarc\climatetweetID\tweets_prediction\influence\allusers_info\output_3\topinfluencer.txt", "rb") as fp:   #Pickling
    user = pickle.load(fp)

In [13]:
# geocode address by geopy
locator = geopy.Nominatim(user_agent= 'myGeocoder')
from geopy.extra.rate_limiter import RateLimiter

# 1 - conveneint function to delay between geocoding calls
geocode = RateLimiter(locator.geocode, min_delay_seconds = 0.1)
# 2- - create location column
user['geocode'] = user['location'].apply(geocode)
# 3 - create longitude, laatitude and altitude from location column (returns tuple)
user['point'] = user['geocode'].apply(lambda loc: tuple(loc.point) if loc else None)
# 4 - split point column into latitude, longitude and altitude columns
user[['latitude', 'longitude', 'altitude']] = pd.DataFrame(user['point'].tolist(), index=user.index)
user = user.drop(['point','geocode','altitude'], axis=1)
user

RateLimiter caught an error, retrying (0/2 tries). Called with (*('L.A., Calif.',), **{}).
Traceback (most recent call last):
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Apps\Anaconda\lib\http\client.py", line 1332, in getresponse
    response.begin()
  File "C:\Apps\Anaconda\lib\http\client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "C:\Apps\Anaconda\lib\http\client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Apps\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Apps\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Apps\Anaconda\lib\

RateLimiter swallowed an error after 2 retries. Called with (*('L.A., Calif.',), **{}).
Traceback (most recent call last):
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Apps\Anaconda\lib\http\client.py", line 1332, in getresponse
    response.begin()
  File "C:\Apps\Anaconda\lib\http\client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "C:\Apps\Anaconda\lib\http\client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Apps\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Apps\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Apps\Anaconda\lib\ssl

RateLimiter caught an error, retrying (1/2 tries). Called with (*('San Francisco/New York',), **{}).
Traceback (most recent call last):
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Apps\Anaconda\lib\http\client.py", line 1332, in getresponse
    response.begin()
  File "C:\Apps\Anaconda\lib\http\client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "C:\Apps\Anaconda\lib\http\client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Apps\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Apps\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Apps\Ana

RateLimiter caught an error, retrying (0/2 tries). Called with (*('London / New York / Hong Kong',), **{}).
Traceback (most recent call last):
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Apps\Anaconda\lib\http\client.py", line 1332, in getresponse
    response.begin()
  File "C:\Apps\Anaconda\lib\http\client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "C:\Apps\Anaconda\lib\http\client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Apps\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Apps\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\A

RateLimiter swallowed an error after 2 retries. Called with (*('London / New York / Hong Kong',), **{}).
Traceback (most recent call last):
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Apps\Anaconda\lib\http\client.py", line 1332, in getresponse
    response.begin()
  File "C:\Apps\Anaconda\lib\http\client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "C:\Apps\Anaconda\lib\http\client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Apps\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Apps\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Apps

RateLimiter caught an error, retrying (1/2 tries). Called with (*('1420405751 ...Home',), **{}).
Traceback (most recent call last):
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Apps\Anaconda\lib\http\client.py", line 1332, in getresponse
    response.begin()
  File "C:\Apps\Anaconda\lib\http\client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "C:\Apps\Anaconda\lib\http\client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Apps\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Apps\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Apps\Anacond

RateLimiter caught an error, retrying (0/2 tries). Called with (*('🇳🇬 in Cape Town 🇿🇦',), **{}).
Traceback (most recent call last):
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Apps\Anaconda\lib\http\client.py", line 1332, in getresponse
    response.begin()
  File "C:\Apps\Anaconda\lib\http\client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "C:\Apps\Anaconda\lib\http\client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Apps\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Apps\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Apps\Anacond

RateLimiter swallowed an error after 2 retries. Called with (*('🇳🇬 in Cape Town 🇿🇦',), **{}).
Traceback (most recent call last):
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Apps\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Apps\Anaconda\lib\http\client.py", line 1332, in getresponse
    response.begin()
  File "C:\Apps\Anaconda\lib\http\client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "C:\Apps\Anaconda\lib\http\client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Apps\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Apps\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Apps\Anaconda\l

Unnamed: 0,user_id,retweet_count,like_count,total_tweet,influence,username,location,followers,friends,latitude,longitude
0,138203134,147567,706180,20,853747,AOC,"Bronx + Queens, NYC",10989494,2958,40.809458,-73.793548
1,15088481,329320,436053,1549,765373,MikeHudema,Unceded Squamish Territory,131345,38822,,
2,36711022,150998,554973,27,705971,DanRather,"New York, NY",1772936,688,40.712728,-74.006015
3,950477244,358553,323199,2706,681752,PaulEDawson,"Glasgow, Scotland",57193,22793,55.860982,-4.248879
4,19725644,129920,437552,4,567472,neiltyson,New York City,14412201,39,40.712728,-74.006015
...,...,...,...,...,...,...,...,...,...,...,...
4995,28259151,326,652,7,978,Timpmurray,"Sydney, New South Wales",4200,971,-33.854816,151.216454
4996,47739450,378,600,2,978,ByronYork,"Washington, D.C.",469012,1374,38.894992,-77.036558
4997,155240932,256,721,2,977,knightwatchman_,dev repository,8296,208,,
4998,827237152836685826,471,506,1,977,Dr_Woga,"Lübeck, Deutschland",76703,76202,53.866444,10.684738


In [16]:
# Write the geocoded user table to CSV (the row index is included as the first column)
user.to_csv(r"D:\twarc\climatetweetID\tweets_prediction\influence\allusers_info\output_3\top5000.csv")
# Next step happens outside this notebook: US users are selected in R