In [1]:
import torch
import numpy as np
import pandas as pd
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, BatchNorm
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler

In [2]:
dtypes = {
    'text': str,
    'user_id': str,
    'in_reply_to_user_id': str,
    'in_reply_to_screen_name': str,
    'place': str,
    'retweet_count': str,
    'reply_count': str,
    'favorite_count': str,
    'possibly_sensitive': str,
    'num_hashtags': str,
    'num_urls': str,
    'num_mentions': str,
    'bot': str,
}

df = pd.read_csv('twitter_data.csv', index_col=False, dtype=dtypes)
df = df.drop(columns=['Unnamed: 0'])
df.shape

  df = pd.read_csv('twitter_data.csv', index_col=False, dtype=dtypes)


(24806824, 13)

In [3]:
df.head()

Unnamed: 0,text,user_id,in_reply_to_user_id,in_reply_to_screen_name,place,retweet_count,reply_count,favorite_count,possibly_sensitive,num_hashtags,num_urls,num_mentions,bot
0,I Pooh - In silenzio 1968 http://t.co/ahvQxUqTws,24858289.0,0,,,0,0,0,,0.0,1,0,1
1,http://t.co/HyI5EQKz6Q,24858289.0,0,,,0,0,0,,0.0,1,0,1
2,"Tutti a tavola, con il filetto di baccalà. htt...",24858289.0,0,,,0,0,0,,0.0,1,0,1
3,http://t.co/NAHQ4l2pUy,24858289.0,0,,,0,0,0,,0.0,1,0,1
4,Gold - Spandau Ballet http://t.co/o8ZJHt7Neu,24858289.0,0,,,0,0,0,,0.0,1,0,1


# Clean up the data

### Convert user_id and in_reply_to_user_id column to int. Drop rows where conversion failed.

In [4]:
df['user_id'] = pd.to_numeric(df['user_id'], errors='coerce').fillna(-1).astype(int)
df['in_reply_to_user_id'] = pd.to_numeric(df['in_reply_to_user_id'], errors='coerce').fillna(-1).astype(int)

In [5]:
rows_before = df.shape[0]

df = df.drop(df[df['user_id'] == -1].index)
df = df.drop(df[df['in_reply_to_user_id'] == -1].index)

rows_after = df.shape[0]

dropped_rows = rows_before - rows_after
print(f'Dropped {dropped_rows:,} rows')
print(f'Rows remaining: {rows_after:,}')

Dropped 11,919,554 rows
Rows remaining: 12,887,270


### Convert retweet_count, reply_count, favorite_count, num_hashtags, num_urls, num_mentions to int. Drops rows where conversion failed.

In [6]:
df['retweet_count'] = pd.to_numeric(df['retweet_count'], errors='coerce').fillna(-1).astype(int)
df[df['retweet_count'] == -1].shape

df['reply_count'] = pd.to_numeric(df['reply_count'], errors='coerce').fillna(-1).astype(int)
df[df['reply_count'] == -1].shape

df['favorite_count'] = pd.to_numeric(df['favorite_count'], errors='coerce').fillna(-1).astype(int)
df[df['favorite_count'] == -1].shape

df['num_hashtags'] = pd.to_numeric(df['num_hashtags'], errors='coerce').fillna(-1).astype(int)
df[df['num_hashtags'] == -1].shape

df['num_urls'] = pd.to_numeric(df['num_urls'], errors='coerce').fillna(-1).astype(int)
df[df['num_urls'] == -1].shape

df['num_mentions'] = pd.to_numeric(df['num_mentions'], errors='coerce').fillna(-1).astype(int)
df[df['num_mentions'] == -1].shape

(741486, 13)

In [7]:
rows_before = df.shape[0]

df = df.drop(df[df['retweet_count'] == -1].index)
df = df.drop(df[df['reply_count'] == -1].index)
df = df.drop(df[df['favorite_count'] == -1].index)
df = df.drop(df[df['num_hashtags'] == -1].index)
df = df.drop(df[df['num_urls'] == -1].index)
df = df.drop(df[df['num_mentions'] == -1].index)

rows_after = df.shape[0]

dropped_rows = rows_before - rows_after
print(f'Dropped {dropped_rows:,} rows')
print(f'Rows remaining: {rows_after:,}')

Dropped 741,488 rows
Rows remaining: 12,145,782


### Convert labels to int. Drop rows where conversion failed.

In [8]:
# convert bot labels to int
df['bot'] = pd.to_numeric(df['bot'], errors='coerce').fillna(-1).astype(int)

In [9]:
rows_before = df.shape[0]

df = df.drop(df[df['bot'] == -1].index)

rows_after = df.shape[0]

dropped_rows = rows_before - rows_after
print(f'Dropped {dropped_rows:,} rows')
print(f'Rows remaining: {rows_after:,}')

Dropped 0 rows
Rows remaining: 12,145,782


In [10]:
df.user_id.unique().shape

(12508,)

In [11]:
df

Unnamed: 0,text,user_id,in_reply_to_user_id,in_reply_to_screen_name,place,retweet_count,reply_count,favorite_count,possibly_sensitive,num_hashtags,num_urls,num_mentions,bot
0,I Pooh - In silenzio 1968 http://t.co/ahvQxUqTws,24858289,0,,,0,0,0,,0,1,0,1
1,http://t.co/HyI5EQKz6Q,24858289,0,,,0,0,0,,0,1,0,1
2,"Tutti a tavola, con il filetto di baccalà. htt...",24858289,0,,,0,0,0,,0,1,0,1
3,http://t.co/NAHQ4l2pUy,24858289,0,,,0,0,0,,0,1,0,1
4,Gold - Spandau Ballet http://t.co/o8ZJHt7Neu,24858289,0,,,0,0,0,,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24806819,RT @tyleroakley: ahhhh! I see #BringYoutubersT...,147900782,0,,,4678,0,0,,1,0,1,0
24806820,RT @9GAG: Heaven is a place on earth with you ...,147900782,0,,,9368,0,0,,0,0,1,0
24806821,WOAH. #LowerPricesForHoneymoonTourPH needs to ...,147900782,0,,,0,0,0,,1,0,0,0
24806822,Let me start off with a clean slate.,147900782,0,,,0,0,0,,0,0,0,0


Group the dataframe by user ids

In [12]:
# group by user_id
user_id_group = df.groupby('user_id').agg({
    'user_id': 'first',
    'in_reply_to_user_id': lambda x: set(x),
    'retweet_count': 'sum',
    'reply_count': 'sum',
    'favorite_count': 'sum',
    'num_hashtags': 'sum',
    'num_urls': 'sum',
    'num_mentions': 'sum',
    'bot': 'first',
}).reset_index(drop=True)
user_id_group

Unnamed: 0,user_id,in_reply_to_user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,bot
0,678033,"{0, 57917441, 837641, 9510922, 14985228, 14231...",242719,0,1674,1177,950,3554,0
1,722623,"{0, 14645160, 822237703, 23779848, 130873354, ...",908396,0,1301,1153,523,2275,0
2,755116,"{0, 5626882, 8442372, 16737797, 18040845, 4831...",105864,0,522,129,397,3078,0
3,755746,"{0, 237397505, 12538372, 95723526, 15633414, 1...",644062,0,275,872,1473,2192,0
4,785080,"{0, 2987852291, 221837829, 194008583, 2676231,...",139956,0,1661,168,207,2531,0
...,...,...,...,...,...,...,...,...,...
12503,3156622237,"{0, 318063361, 1267010562, 310359174, 24933197...",5,0,77,348,6,122,0
12504,3158349782,"{0, 407422208, 525308289, 378295046, 709345802...",7573,0,145,21,1,75,0
12505,3159993463,"{0, 368494299, 2972913323}",1932,0,8,22,41,30,0
12506,3161171948,"{0, 2966327327}",0,0,1,0,1,2,0


In [13]:
unique_user_ids = df['user_id'].unique()

# remove zeros and users that are not in the user_id in the list in the in_reply_to_user_id column
user_id_group['in_reply_to_user_id'] = user_id_group['in_reply_to_user_id'].apply(lambda x: [i for i in x if i != 0 and i in unique_user_ids])
user_id_group

Unnamed: 0,user_id,in_reply_to_user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,bot
0,678033,[678033],242719,0,1674,1177,950,3554,0
1,722623,[722623],908396,0,1301,1153,523,2275,0
2,755116,"[12922462, 50737284, 118386377, 755116]",105864,0,522,129,397,3078,0
3,755746,"[17608721, 755746]",644062,0,275,872,1473,2192,0
4,785080,[785080],139956,0,1661,168,207,2531,0
...,...,...,...,...,...,...,...,...,...
12503,3156622237,[3156622237],5,0,77,348,6,122,0
12504,3158349782,[1689431460],7573,0,145,21,1,75,0
12505,3159993463,[],1932,0,8,22,41,30,0
12506,3161171948,[],0,0,1,0,1,2,0


Remap user ids for both columns.

In [14]:
# get unique user_id from both columns
unique_user_ids = set(user_id_group['user_id'].tolist() + [uid for sublist in user_id_group['in_reply_to_user_id'].tolist() for uid in sublist])

# create a mapping dictionary from old to new ids
id_mapping = {old_id: idx for idx, old_id in enumerate(unique_user_ids)}

# remapping
user_id_group['user_id'] = user_id_group['user_id'].map(id_mapping)
user_id_group['in_reply_to_user_id'] = user_id_group['in_reply_to_user_id'].apply(lambda x: [id_mapping[uid] for uid in x])

user_id_group

Unnamed: 0,user_id,in_reply_to_user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,bot
0,8741,[8741],242719,0,1674,1177,950,3554,0
1,700,[700],908396,0,1301,1153,523,2275,0
2,587,"[4622, 4810, 10887, 587]",105864,0,522,129,397,3078,0
3,838,"[4764, 838]",644062,0,275,872,1473,2192,0
4,11997,[11997],139956,0,1661,168,207,2531,0
...,...,...,...,...,...,...,...,...,...
12503,5882,[5882],5,0,77,348,6,122,0
12504,2381,[4544],7573,0,145,21,1,75,0
12505,4429,[],1932,0,8,22,41,30,0
12506,3978,[],0,0,1,0,1,2,0


# Create the data object.

In [15]:
# node features
node_features = ['retweet_count', 'reply_count', 'favorite_count', 'num_hashtags', 'num_urls', 'num_mentions']

standard_scaler = StandardScaler()
user_id_group[node_features] = standard_scaler.fit_transform(user_id_group[node_features])

nodes = torch.tensor(user_id_group[node_features].values, dtype=torch.float)
nodes

tensor([[-0.1036, -0.0368,  0.0103,  1.8403,  1.9243,  2.8369],
        [ 0.1250, -0.0368, -0.0078,  1.7958,  0.8810,  1.6332],
        [-0.1505, -0.0368, -0.0455, -0.1018,  0.5732,  2.3889],
        ...,
        [-0.1862, -0.0368, -0.0704, -0.3001, -0.2966, -0.4796],
        [-0.1869, -0.0368, -0.0707, -0.3409, -0.3943, -0.5060],
        [-0.0121, -0.0368, -0.0707, -0.2519, -0.2917, -0.3893]])

In [16]:
# labels (human or bot)
labels = torch.tensor(user_id_group['bot'].values, dtype=torch.float)
labels

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [17]:
# edges
edges = []

for index, row in user_id_group.iterrows():
    for reply_id in row['in_reply_to_user_id']:
        edges.append([row['user_id'], reply_id])

print(len(edges))
edges

5022


[[8741, 8741],
 [700, 700],
 [587, 4622],
 [587, 4810],
 [587, 10887],
 [587, 587],
 [838, 4764],
 [838, 838],
 [11997, 11997],
 [7760, 7760],
 [7896, 7896],
 [1022, 1022],
 [2464, 2464],
 [4274, 4274],
 [6906, 6906],
 [2529, 2529],
 [8427, 8427],
 [9682, 7877],
 [9682, 9682],
 [9682, 9344],
 [2394, 5],
 [2394, 2096],
 [2394, 2394],
 [4839, 4839],
 [4839, 12053],
 [7800, 7800],
 [9070, 9070],
 [9070, 8466],
 [2084, 2084],
 [9701, 9701],
 [7065, 1862],
 [9622, 9622],
 [10869, 10869],
 [10869, 10737],
 [10869, 2904],
 [2463, 2463],
 [12052, 222],
 [12052, 12052],
 [9040, 9040],
 [1839, 1839],
 [11726, 11726],
 [7698, 7698],
 [9217, 9217],
 [4338, 9762],
 [4338, 4338],
 [4338, 11728],
 [274, 274],
 [274, 2123],
 [5464, 5464],
 [11837, 11837],
 [9807, 9807],
 [12440, 3483],
 [12440, 12440],
 [4622, 4622],
 [4622, 4810],
 [7368, 7368],
 [4858, 4858],
 [2942, 2942],
 [5571, 5571],
 [5571, 8767],
 [10046, 7825],
 [10046, 10046],
 [4442, 4442],
 [1459, 6807],
 [1459, 1459],
 [5126, 11843],
 [5

In [18]:
edges = torch.tensor(edges, dtype=torch.long).t().contiguous()
print(edges.shape)
edges

torch.Size([2, 5022])


tensor([[8741,  700,  587,  ..., 4148, 5882, 2381],
        [8741,  700, 4622,  ..., 4148, 5882, 4544]])

In [19]:
data = Data(x=nodes, edge_index=edges, y=labels)
data

Data(x=[12508, 6], edge_index=[2, 5022], y=[12508])