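TODO: Before running anything, add the shared drive to your own Google Drive as a shortcut, or make a copy of the entire drive. Then set DRIVE_PATH below to the project folder's location relative to `/content/` (the notebook later changes into `/content/<DRIVE_PATH>`).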

In [None]:
# Project folder inside the mounted Drive, given relative to /content/
# (a later cell changes into "/content/" + DRIVE_PATH). Adjust to your own Drive layout.
DRIVE_PATH = 'drive/My Drive/Colab Notebooks/SI 608'

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project files are reachable from this runtime.
drive.mount('/content/drive') # connect to Google Drive

Mounted at /content/drive


In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to helper .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Show the starting working directory.
!pwd
# Work from the project folder so the relative "./VoterFraud2020-main/..." paths used later resolve.
os.chdir("/content/" + DRIVE_PATH)
# List the folder contents as a quick sanity check.
!ls

/content/drive/My Drive/Colab Notebooks/SI 608
SI608_Group_Project.ipynb  VoterFraud2020-main


In [None]:
import pandas as pd
import numpy as np
import networkx as nx

Step 1: Loading Data from CSVs

In [None]:
"""Convenient for loading data."""
def load_chunks(directory):
    """Load every CSV chunk found under ``directory`` into one DataFrame.

    Entries of ``directory`` are visited in sorted name order. A ``.csv`` file
    sitting directly in ``directory`` is read as one chunk; for a
    sub-directory, each ``.csv`` file directly inside it is read, also in
    sorted order. Deeper nesting is not traversed.

    Anything that is not a ``.csv`` file (nested folders such as
    ``.ipynb_checkpoints``, hidden files, notes) is skipped rather than being
    handed to the CSV parser.

    Args:
        directory: Path of the folder that holds the chunks.

    Returns:
        pandas.DataFrame: The chunks concatenated in visit order. Each chunk
        keeps its own row index, so index labels repeat across chunks.

    Raises:
        ValueError: If no ``.csv`` file is found under ``directory``.
    """
    csv_paths = []
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if os.path.isdir(path):
            for filename in sorted(os.listdir(path)):
                candidate = os.path.join(path, filename)
                # Apply the same ".csv" filter the top-level branch uses, and
                # ignore nested folders, which cannot be opened as files.
                if filename.endswith(".csv") and os.path.isfile(candidate):
                    csv_paths.append(candidate)
        elif name.endswith(".csv"):
            csv_paths.append(path)

    if not csv_paths:
        # pd.concat([]) would raise a ValueError that does not say which folder was empty.
        raise ValueError(f"No .csv files found under {directory!r}")

    # Hand pandas the path in both cases so every chunk is parsed the same way.
    chunk_dfs = [pd.read_csv(path, encoding="utf-8") for path in csv_paths]
    return pd.concat(chunk_dfs)

Step 2: Network Creation

In [None]:
def create_network(df_retweets):
    """Build the directed retweet graph from the retweet table.

    Every row contributes one edge pointing from its ``user_id`` value to its
    ``retweeted_id`` value. ``nx.DiGraph`` keeps at most one edge per ordered
    node pair, so repeated pairs collapse into a single edge.

    NOTE(review): the edge is meant to run from the retweeter to the original
    tweet's author, which presumes ``retweeted_id`` holds a user id rather
    than a tweet id — confirm against the dataset schema.

    Args:
        df_retweets: DataFrame with ``user_id`` and ``retweeted_id`` columns.

    Returns:
        networkx.DiGraph: Graph whose nodes are the ids seen in either column.
    """
    G = nx.DiGraph()
    # Stream the two columns straight into the graph instead of using
    # DataFrame.iterrows(): iterrows builds a Series per row (very slow on
    # millions of rows) and does not preserve dtypes across a row, so large
    # integer ids could be upcast to float and lose precision.
    G.add_edges_from(zip(df_retweets['user_id'], df_retweets['retweeted_id']))
    return G

In [None]:
# Read every retweet chunk, then build the directed retweet graph from it.
df_retweets = load_chunks("./VoterFraud2020-main/data/retweets/")
G = create_network(df_retweets) # slow step: took about 17 minutes when this was run

Step 3: Centrality Measures

In [None]:
def calculate_centrality_and_compare_influence(G, df_users, betweenness_k=None, seed=None):
    """Compute centrality measures on ``G`` and attach them to the user table.

    Degree, betweenness and eigenvector centrality are computed per node,
    joined onto ``df_users`` by matching node ids against the ``user_id``
    column, and the top five rows per measure are printed.

    Args:
        G: networkx graph whose node ids correspond to ``user_id`` values.
        df_users: DataFrame with a ``user_id`` column. It is not modified.
        betweenness_k: Optional number of pivot nodes used to approximate
            betweenness. ``None`` (default) computes it exactly from all
            nodes, which can be very slow on a large graph.
        seed: Optional random seed for the pivot sampling; only relevant
            when ``betweenness_k`` is given.

    Returns:
        pandas.DataFrame: ``df_users`` plus ``degree``, ``betweenness`` and
        ``eigenvector`` columns (NaN for users that are not nodes of ``G``),
        ordered by the last measure sorted, i.e. eigenvector, descending.
    """
    centrality_measures = {
        # centrality metrics, we can define our own
        'degree': nx.degree_centrality(G),
        'betweenness': nx.betweenness_centrality(G, k=betweenness_k, seed=seed),
        'eigenvector': nx.eigenvector_centrality(G, max_iter=1000, tol=1e-06),
    }

    # Dict of {measure: {node: value}} -> frame indexed by node id, one column per measure.
    centrality_df = pd.DataFrame(centrality_measures)

    # Match the centrality frame's index (node ids) against the user_id column.
    df_users_centrality = df_users.join(centrality_df, on='user_id')

    # Only show follower_count when the user table actually carries it, so a
    # missing column cannot raise KeyError after the expensive computations.
    has_follower_count = 'follower_count' in df_users_centrality.columns

    # Sort by each measure in turn to identify key accounts.
    for measure in centrality_measures:
        df_users_centrality = df_users_centrality.sort_values(by=measure, ascending=False)
        columns_to_show = ['user_id', measure]
        if has_follower_count:
            columns_to_show.append('follower_count')
        print(f"Top users by {measure} centrality:")
        print(df_users_centrality[columns_to_show].head())

    return df_users_centrality

In [None]:
# Load the per-user table from its CSV chunks and print its column/dtype summary.
df_users = load_chunks("./VoterFraud2020-main/data/users/")
df_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2559018 entries, 0 to 59017
Data columns (total 17 columns):
 #   Column                                  Dtype  
---  ------                                  -----  
 0   user_id                                 int64  
 1   user_community                          float64
 2   user_active_status                      object 
 3   closeness_centrality_detractor_cluster  float64
 4   closeness_centrality_promoter_cluster   float64
 5   retweet_count_by_community_0            int64  
 6   quote_count_by_community_0              int64  
 7   retweet_count_by_community_1            int64  
 8   quote_count_by_community_1              int64  
 9   retweet_count_by_community_2            int64  
 10  quote_count_by_community_2              int64  
 11  retweet_count_by_community_3            int64  
 12  quote_count_by_community_3              int64  
 13  retweet_count_by_community_4            int64  
 14  quote_count_by_community_4          

In [None]:
# Compute the centrality measures on G and attach them to the user table;
# the call also prints the top rows for each measure.
df_users_centrality = calculate_centrality_and_compare_influence(G, df_users)