In [1]:
import pandas as pd

# Load users data

In [2]:
user_columns = ['User_Id', 'Type', 'Name']

# The users file has no header row and is pipe-delimited; only the first
# three fields are kept, and rows pandas cannot parse are skipped.
df_users = pd.read_csv(
    './data/users.csv',
    sep='|',
    header=None,
    names=user_columns,
    usecols=[0, 1, 2],
    on_bad_lines='skip',
)

df_users.shape

(665384, 3)

# Load user connectivity data

In [3]:
following_columns = ['Followed_Id', 'Follower_Id']

# The connectivity file has no header row and is tab-delimited; the first
# two fields of each row are read as (Followed_Id, Follower_Id).
df_following = pd.read_csv(
    'data/following.csv',
    sep='\t',
    header=None,
    names=following_columns,
    usecols=[0, 1],
)

df_following.shape

(19547158, 2)

# Clean up users data

In [4]:
# Clean the users table:
#   1. drop rows that have no User_Id,
#   2. lower-case User_Id,
#   3. keep the first row for each (lower-cased) User_Id.
# The lower-casing must happen BEFORE de-duplication: otherwise two IDs that
# differ only by case (e.g. "Alice" / "alice") both survive the duplicate
# check and then collapse into the same value, leaving User_Id non-unique.
df_users = (
    df_users
    .dropna(subset=['User_Id'])
    .assign(User_Id=lambda users: users['User_Id'].str.lower())
    .drop_duplicates(subset=['User_Id'])
)
df_users.shape

(645416, 3)

# Clean up connectivity data

In [5]:
# Clean the connectivity table: lower-case both ID columns, then drop
# duplicate (Followed_Id, Follower_Id) pairs.
# The IDs are normalized BEFORE de-duplication so that edges which differ
# only by the case of an ID are recognised as the same edge; de-duplicating
# the raw values first would let such case-variant duplicates through.
df_following['Followed_Id'] = df_following['Followed_Id'].str.lower()
df_following['Follower_Id'] = df_following['Follower_Id'].str.lower()
df_following = df_following.drop_duplicates()
df_following.shape

(19547158, 2)

In [6]:
# Keep only the edges whose two endpoints both appear in the cleaned
# users table.
known_user_ids = df_users['User_Id']
followed_is_known = df_following['Followed_Id'].isin(known_user_ids)
follower_is_known = df_following['Follower_Id'].isin(known_user_ids)
df_valid_followings = df_following.loc[followed_is_known & follower_is_known]

df_valid_followings.shape

(18477147, 2)

# Export data

In [7]:
# Bundle the cleaned users table and the filtered edge table for export.
data = dict(users=df_users, followings=df_valid_followings)

In [8]:
# Serialize the bundled data to a pickle file in the working directory.
pd.to_pickle(obj=data, filepath_or_buffer="community_detection_data.pkl")