In [37]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
import json
import requests
import tweepy
import os

key = ""
secret = ""
token = ""

api = tweepy.Client(bearer_token=token, consumer_key=key, consumer_secret=secret, 
              return_type=requests.Response,
              wait_on_rate_limit=True)

In [254]:
def extract_uid(full_path):
    centerNode = file.replace('../','').replace('.pkl','').replace('edgelist_2_deg_','')\
    .replace('edgelist_mentions_','').replace('/','').replace('human','')\
    .replace('bot','').replace('mentions','').replace('edgelist_2_edges_from_start_', '')
    
    return int(centerNode)

In [255]:
# Returns the followers/following of the center node in the network (because these aren't in the pickle files)
def getCenterNodeFollows(file):
    centerNode = extract_uid(file)
    r = api.get_user(id=centerNode, user_fields='public_metrics')
    user = r.json()

    followers = user['data']['public_metrics']['followers_count']
    following = user['data']['public_metrics']['following_count']
    
    return followers, following, centerNode

In [256]:
def is_valid_network(df):
    if df.shape[0] == 0:
        print(f'No Network: {full_path}')
        return False
    return True

def is_valid(file, uid):
    edgelist = pd.read_pickle(file)
    edgelist.originating_id = edgelist.originating_id.astype(str)
    edgelist.receiving_id = edgelist.receiving_id.astype(str)
    t = edgelist.originating_id.append(edgelist.receiving_id).unique()
    if str(uid) in t:
        return True
    print(f'Error - Main twitter id not in edgelist: {uid} ')
    return False
    

In [257]:
# Takes in a pickle object and returns in/out degree centralities, 
# clustering coefficient, local homophily metrics, and PageRank for account ID.
def node_calculations(file):
    full_list = pd.read_pickle(file)
    
    # Check if valid
    if not is_valid_network(full_list):
        feats = [0 for i in range(8)]
        return feats
    
    if 'edge_weight' in full_list.columns:
        weightlist = full_list['edge_weight']
    else:
        weightlist = [1]*len(full_list['originating_id'])
    
    edgelist = pd.DataFrame({
        "source": full_list['originating_id'],
        "target": full_list['receiving_id'],
        "weight": weightlist,
    })
    
    # store originating ID followers
    attr = dict()
    for i, row in full_list.iterrows():
        if not pd.isna(row['originating_following_count']):
            attr[row['originating_id']] = {'in': row['originating_follower_count'], 
                                           'out': row['originating_following_count']}
        if not pd.isna(row['receiving_following_count']):
            attr[row['receiving_id']] = {'in': row['receiving_follower_count'], 
                                         'out': row['receiving_following_count']}
            
    IDfollowers, IDfollowing, ID = getCenterNodeFollows(file)
    attr[ID] = {'in': IDfollowers, 'out': IDfollowing}
    
    # NOTE: nothing in "attr" represents the degrees of nodes in OUR network; the attributes represent
    # the degrees of nodes in the FULL network
    
    #print(attr)
            
    G = nx.from_pandas_edgelist(edgelist, create_using=nx.DiGraph(), edge_attr=True)
    nx.set_node_attributes(G, attr)
    
    nodes = list(G.nodes)
    
    
    clustercoeff = nx.clustering(G, ID, weight='weight')
    pagerank = nx.pagerank(G, weight='weight')[ID]

    
    # compute "local homophily" for ID's degree structure in two different ways
    indeg = len(list(G.predecessors(ID)))
    outdeg = len(list(G.successors(ID)))
    
    outdeg_Assort1 = 0
    outdeg_Assort2 = 0
    indeg_Assort1 = 0
    indeg_Assort2 = 0
    
    if outdeg != 0:
        for node in G.neighbors(ID):
            nodeOutdeg = attr[node]['out']
            outdeg_Assort1 += nodeOutdeg
            if nodeOutdeg >= IDfollowing/2 and nodeOutdeg <= IDfollowing*2:
                outdeg_Assort2 += 1
                
        outdeg_Assort1 = outdeg_Assort1 / (outdeg*IDfollowing)
        outdeg_Assort2 = outdeg_Assort2 / outdeg

    if indeg != 0: 
        for node in G.predecessors(ID):    
            nodeIndeg = attr[node]['in']
            indeg_Assort1 += nodeIndeg
            if nodeIndeg >= IDfollowers/2 and nodeIndeg <= IDfollowers*2:
                indeg_Assort2 += 1
    
        indeg_Assort1 = indeg_Assort1 / (indeg*IDfollowers)
        indeg_Assort2 = indeg_Assort2 / indeg
    
    
    return (IDfollowers, IDfollowing, clustercoeff, pagerank, indeg_Assort1, outdeg_Assort1, \
            indeg_Assort2, outdeg_Assort2)

In [260]:
def extract_feats(full_path, df, invalid):
    # Get twitter id
    uid = extract_uid(full_path)

    # Calc metrics
    edge = pd.read_pickle(full_path)
    if not is_valid_network(edge):
        feats = [0 for i in range(8)]
    elif not is_valid(full_path, uid):
        return df, invalid
    else:
        feats = list(node_calculations(full_path))

    # Create row for df
    row = [uid]
    row.extend(feats)
    row.append(label)
    rowdf = pd.DataFrame(row).T
    rowdf.columns = cols
    df = pd.concat([df, rowdf])
    
    return df, invalid

In [261]:
# Create pd dataframe for mentions
cols = ['twitter_id', 'indeg', 'outdeg', 'clustercoeff', 'pagerank', 'indeg_A1', 
        'outdeg_A1', 'indeg_A2', 'outdeg_A2', 'label']
df = pd.DataFrame(columns=cols)

invalid = []
dirr = '../mentions/'
for label in ['bot', 'human']:
    path = dirr + label + '/'
    for file in os.listdir(dirr + label + '/'):
        if os.path.isdir(file):
            continue
        full_path = path + file
        
        # Get twitter id
        uid = extract_uid(full_path)
        
        # Another check if valid
        if not is_valid(full_path, uid):
            invalid.append(uid)
            continue
        
        # Calc metrics; create row for df
        feats = list(node_calculations(full_path))
        row = [uid]
        row.extend(feats)
        row.append(label)
        rowdf = pd.DataFrame(row).T
        rowdf.columns = cols
        df = pd.concat([df, rowdf])

KeyboardInterrupt: 

In [None]:
'mentions', 

In [271]:
pd.read_pickle('../likes/bot/2855732920')

Unnamed: 0,originating_id,receiving_id,edge_weight,originating_follower_count,originating_following_count,receiving_follower_count,receiving_following_count
0,1643218550,2494344283,4,29116,1583,37083.0,21057.0
1,23737582,2494344283,4,1315676,836,37083.0,21057.0
2,49110693,2494344283,3,1864163,588,37083.0,21057.0
3,259773507,2494344283,3,22816,202,37083.0,21057.0
4,2863774303,2494344283,3,272751,61,37083.0,21057.0
5,256659585,2494344283,2,1310915,9525,37083.0,21057.0
6,750804771750150144,2494344283,1,205,790,37083.0,21057.0
7,724989967798292480,2494344283,1,32468,647,37083.0,21057.0
8,31583882,2494344283,1,481146,42459,37083.0,21057.0
9,951934601949270016,2494344283,1,22781,5809,37083.0,21057.0


In [269]:
# Create pd dataframes for features
cols = ['twitter_id', 'indeg', 'outdeg', 'clustercoeff', 'pagerank', 'indeg_A1', 
        'outdeg_A1', 'indeg_A2', 'outdeg_A2', 'label']

networks = ['likes', 'follows']

for net in networks:
    invalid = []

    df = pd.DataFrame(columns=cols)
    dirr = '../' + net + '/'
    for label in ['bot', 'human']:
        path = dirr + label + '/'
        for file in os.listdir(dirr + label + '/'):
            if os.path.isdir(file):
                continue
            full_path = path + file
            print(full_path)

            #df, d[net] = extract_feats(full_path, df, invalid)

    #df.to_pickle('feature_data/' + net + '.pkl')
    
    
    

../likes/bot/790017240733278208
../likes/bot/2855732920
../likes/bot/4307623643
../likes/bot/709807650
../likes/bot/875984489138200577
../likes/bot/1016773517587537920
../likes/bot/2491508162
../likes/bot/913028718377005056
../likes/bot/1524730152
../likes/bot/1661404440
../likes/bot/837316817324556289
../likes/bot/844779426260791296
../likes/bot/2928556470
../likes/bot/1567185626
../likes/bot/490645569
../likes/bot/1416121214
../likes/bot/976973737
../likes/bot/352132252
../likes/bot/2385417602
../likes/bot/1004764640486854656
../likes/bot/449470627
../likes/bot/likes_bot_170568448
../likes/bot/1433472060
../likes/bot/1421859620
../likes/bot/972532340
../likes/bot/4808016715
../likes/bot/likes_bot_139954867
../likes/bot/300415996
../likes/bot/2901953015
../likes/bot/1334673584
../likes/bot/821215184660393986
../likes/human/1655904638
../likes/human/2266976480
../likes/human/757821654571425792
../likes/human/884092539418152960
../likes/human/4307623643
../likes/human/1914493741
../like

In [265]:
pd.read_pickle('feature_data/likes.pkl').shape[0]

63

In [267]:
pd.read_pickle('feature_data/follows.pkl').shape[0]

76

In [268]:
pd.read_pickle('feature_data/mentions.pkl').shape[0]

451

In [272]:
451+76+63

590

In [248]:
full_path = '../likes/bot/790017240733278208'

In [249]:
uid = extract_uid(full_path)

# Another check if valid
is_valid(full_path, uid)

Error - Main twitter id not in edgelist: 2184009647 


False

In [251]:
edgelist = pd.read_pickle(full_path)
edgelist.originating_id = edgelist.originating_id.astype(str)
edgelist.receiving_id = edgelist.receiving_id.astype(str)
t = edgelist.originating_id.append(edgelist.receiving_id).unique()


In [253]:
edgelist

Unnamed: 0,originating_id,receiving_id,originating_following_count,originating_follower_count
