In [2]:
import pandas as pd

# Load likes data

In [3]:
# Labels given to the two fields kept from the likes file.
likes_columns = ['User_Id', 'Post_Id']

# Tab-separated file; only the first two fields are read. header=None spells
# out what pandas already does when `names` is supplied without `header`:
# the first line is treated as data, not as column names.
df_likes = pd.read_csv(
    './data/likes.csv',
    sep='\t',
    header=None,
    names=likes_columns,
    usecols=[0, 1],
)

df_likes.shape

(798112, 2)

# Load posts data

In [4]:
# Labels applied to the six fields read from each entries file
# (passed as `names=` to the read_csv calls that load the posts).
post_columns = [
    'Post_Id',
    'User_Id',
    'Service_Name',
    'Text',
    'NumImg',
    'NumVdo',
]

In [5]:
# First posts file (tab-separated, no header row). Fields 0, 1, 2, 7, 8 and 10
# are kept and labelled with `post_columns`; lines pandas cannot parse are
# skipped instead of aborting the load.
df_posts_1 = pd.read_csv(
    'data/entries1.csv',
    sep='\t',
    header=None,
    names=post_columns,
    usecols=[0, 1, 2, 7, 8, 10],
    on_bad_lines='skip',
)

df_posts_1.shape

(4000000, 6)

In [6]:
# Second posts file (tab-separated, no header row). Fields 0, 1, 2, 7, 8 and 10
# are kept and labelled with `post_columns`; lines pandas cannot parse are
# skipped instead of aborting the load.
df_posts_2 = pd.read_csv(
    './data/entries2.csv',
    sep='\t',
    header=None,
    names=post_columns,
    usecols=[0, 1, 2, 7, 8, 10],
    on_bad_lines='skip',
)

df_posts_2.shape

(4000000, 6)

In [7]:
# Third posts file (tab-separated, no header row). Fields 0, 1, 2, 7, 8 and 10
# are kept and labelled with `post_columns`; lines pandas cannot parse are
# skipped instead of aborting the load.
df_posts_3 = pd.read_csv(
    './data/entries3.csv',
    sep='\t',
    header=None,
    names=post_columns,
    usecols=[0, 1, 2, 7, 8, 10],
    on_bad_lines='skip',
)

df_posts_3.shape

(4450658, 6)

# Load comments data

In [8]:
# Labels for the first four fields of the comments file.
# NOTE(review): in the preview of this frame the 'Post_Id' values look like
# 'e/<entry>/c/<...>' while 'Entry_Id' holds the bare 'e/<entry>' id, so
# 'Post_Id' appears to be the comment's own id rather than a post id — confirm.
comments_columns = ['Post_Id', 'Entry_Id', 'Posted_By', 'Source_Name']

# Tab-separated, no header row; unparsable lines are skipped.
df_comments = pd.read_csv(
    './data/comments.csv',
    sep='\t',
    header=None,
    names=comments_columns,
    usecols=[0, 1, 2, 3],
    on_bad_lines='skip',
)

df_comments.shape

(3749891, 4)

# Load valid followings data

In [9]:
# Load the pickled community-detection data from the working directory
# (presumably written by an earlier notebook — confirm).
# Unpickling can execute arbitrary code: only load this file if it comes
# from a trusted source.
community_detection_data = pd.read_pickle(
    filepath_or_buffer="community_detection_data.pkl",
)

In [10]:
# Pull out the followings table stored under the 'followings' key; its
# 'Follower_Id' and 'Followed_Id' columns are used to build the set of valid users.
df_followings = community_detection_data['followings']

In [11]:
# Every user id that appears on either side of a following relation.
# NOTE(review): posts, likes and comments are lower-cased before being matched
# against this set — confirm the followings ids are lower-case as well.
valid_users_for_activities = (
    set(df_followings['Follower_Id']) | set(df_followings['Followed_Id'])
)

# Clean up posts data

In [12]:
# Discard posts without any content: a row is removed only when Text, NumImg
# and NumVdo are all missing. Done in place so df_posts_1 stays the same
# object for the column assignments that follow.
df_posts_1.dropna(
    how='all',
    subset=['Text', 'NumImg', 'NumVdo'],
    inplace=True,
)

df_posts_1.shape

(3999954, 6)

In [13]:
# Discard posts without any content: a row is removed only when Text, NumImg
# and NumVdo are all missing. Done in place so df_posts_2 stays the same
# object for the column assignments that follow.
df_posts_2.dropna(
    how='all',
    subset=['Text', 'NumImg', 'NumVdo'],
    inplace=True,
)

df_posts_2.shape

(3999975, 6)

In [14]:
# Discard posts without any content: a row is removed only when Text, NumImg
# and NumVdo are all missing. Done in place so df_posts_3 stays the same
# object for the column assignments that follow.
df_posts_3.dropna(
    how='all',
    subset=['Text', 'NumImg', 'NumVdo'],
    inplace=True,
)

df_posts_3.shape

(4450640, 6)

In [15]:
# Lower-case the author ids; User_Id is later matched against
# valid_users_for_activities with isin(), which is case-sensitive.
df_posts_1['User_Id'] = df_posts_1['User_Id'].str.lower()

In [16]:
# Lower-case the post ids, the same normalisation applied to Post_Id in the likes data.
df_posts_1['Post_Id'] = df_posts_1['Post_Id'].str.lower()

In [17]:
# Lower-case the author ids; User_Id is later matched against
# valid_users_for_activities with isin(), which is case-sensitive.
df_posts_2['User_Id'] = df_posts_2['User_Id'].str.lower()

In [18]:
# Lower-case the post ids, the same normalisation applied to Post_Id in the likes data.
df_posts_2['Post_Id'] = df_posts_2['Post_Id'].str.lower()

In [19]:
# Lower-case the author ids; User_Id is later matched against
# valid_users_for_activities with isin(), which is case-sensitive.
df_posts_3['User_Id'] = df_posts_3['User_Id'].str.lower()

In [20]:
# Lower-case the post ids, the same normalisation applied to Post_Id in the likes data.
df_posts_3['Post_Id'] = df_posts_3['Post_Id'].str.lower()

In [21]:
# Keep only the posts whose author is one of the valid users.
df_valid_posts_1 = df_posts_1.loc[
    df_posts_1['User_Id'].isin(valid_users_for_activities)
]

In [22]:
# Preview the first five filtered posts from the first file.
df_valid_posts_1.head(n=5)

Unnamed: 0,Post_Id,User_Id,Service_Name,Text,NumImg,NumVdo
0,e/29af803d670fb8d67692095f3ee623e6,newsroom1,Ottawa Citizen - News,RIM 'will not compromise' BlackBerry security ...,1.0,0.0
1,e/9c8413a376bec6389be4d46d0812c2bb,mehmetinnet,Mehmetin.Net,Fethullah Gülen: Referandum siyasî olarak görü...,1.0,0.0
2,e/2d658d97842a466a9513f587f85b0e59,mehmetergin,Mehmetin.Net,Fethullah Gülen: Referandum siyasî olarak görü...,1.0,0.0
3,e/b269ab5d56be4e5e90c1954ecc1ef63a,afriki,,Мне тут недавно один дизайнер сказал: «Ну прот...,0.0,0.0
4,e/74f0a50c374a4ad6bddcbcc60c60cad9,hamsafar,,عرض سلام و خسته نباشيد:),0.0,0.0


In [23]:
# Keep only the posts whose author is one of the valid users.
df_valid_posts_2 = df_posts_2.loc[
    df_posts_2['User_Id'].isin(valid_users_for_activities)
]

In [24]:
# Preview the first five filtered posts from the second file.
df_valid_posts_2.head(n=5)

Unnamed: 0,Post_Id,User_Id,Service_Name,Text,NumImg,NumVdo
0,e/4ec845b38287101deb6739d0fa64566d,starya,Lolcats 'n' Funny Pictures of Cats - I Can Has...,"VIDEO: Kitteh Tricks - <a rel=""nofollow"" href=...",0.0,0.0
1,e/3ca878bcc2f5ed65c7217e4e8d1b6300,vkamutzki,Google Reader,"Scene: Torontohenge Sunrise - <a rel=""nofollow...",0.0,0.0
2,e/19130fd6b5444f9ba321c58bdf7242a2,biznetsuk,HelloTxt,autobuynow.info/cars Car Buying: How to buy ne...,0.0,0.0
3,e/62945bae3a1433880c0a681081ee8cbb,monikap19,Books Nonfiction,"Check Out Psychology for $89.28 - <a rel=""nofo...",0.0,0.0
4,e/41d2105e23fc70c1976387ab53f51c89,pendar,Google Reader,رکورد گینس تایپ سریع پیام با استفاده از گوشی گ...,0.0,0.0


In [25]:
# Keep only the posts whose author is one of the valid users.
df_valid_posts_3 = df_posts_3.loc[
    df_posts_3['User_Id'].isin(valid_users_for_activities)
]

In [26]:
# Preview the first five filtered posts from the third file.
df_valid_posts_3.head(n=5)

Unnamed: 0,Post_Id,User_Id,Service_Name,Text,NumImg,NumVdo
0,e/56dfd8c1beb24b52b33e5369a1004603,healthrockstar,Ping.fm,"HealthRockstar: weightloss,#fitness 11 News An...",0.0,0.0
1,e/6d231b54bd0289060de9be6905900961,aggregatore,Aggregatore di blog,Pop Star Coreana Cade Rovinosamente Sul Palco ...,0.0,0.0
3,e/0e38d17bf5a47b55ae5782c8a6d985bc,cleanthinking,Cleanthinking.de - Cleantech und Energie News,"Hintergrund Geothermie: Die Quelle, die nie ve...",0.0,0.0
4,e/5cb0fbb41a0567b2e3f228d0d6cec52e,gnomebeatz,Twitter,"@<a rel=""nofollow"" href=""http://twitter.com/ma...",0.0,0.0
5,e/26ae17b2b74c73542e9e763456fe9d92,fastake,Twitter,New York Fashion Week to Include Designer Sex ...,0.0,0.0


In [27]:
# Stack the three filtered post frames, keeping only the id, author and
# service columns (Text / NumImg / NumVdo are not exported).
# NOTE(review): concat keeps each frame's own row labels, so the same index
# label can occur up to three times in the result — confirm that consumers
# do not rely on a unique index.
df_valid_posts = pd.concat(
    [
        posts[['Post_Id', 'User_Id', 'Service_Name']]
        for posts in (df_valid_posts_1, df_valid_posts_2, df_valid_posts_3)
    ]
)

In [28]:
# Row/column count of the combined, filtered posts.
df_valid_posts.shape

(10864613, 3)

In [29]:
# Preview the first five rows of the combined posts frame.
df_valid_posts.head(n=5)

Unnamed: 0,Post_Id,User_Id,Service_Name
0,e/29af803d670fb8d67692095f3ee623e6,newsroom1,Ottawa Citizen - News
1,e/9c8413a376bec6389be4d46d0812c2bb,mehmetinnet,Mehmetin.Net
2,e/2d658d97842a466a9513f587f85b0e59,mehmetergin,Mehmetin.Net
3,e/b269ab5d56be4e5e90c1954ecc1ef63a,afriki,
4,e/74f0a50c374a4ad6bddcbcc60c60cad9,hamsafar,


# Clean up likes data

In [30]:
# Remove exact repeats of a (User_Id, Post_Id) pair, keeping the first
# occurrence. This compares the raw values, i.e. before any case normalisation.
df_likes.drop_duplicates(keep='first', inplace=True)

In [31]:
# Normalise both id columns to lower case so they compare equal regardless
# of the casing used in the raw file.
df_likes['User_Id'] = df_likes['User_Id'].str.lower()
df_likes['Post_Id'] = df_likes['Post_Id'].str.lower()

# Rows that differed only by letter case become identical after the
# normalisation above. A de-duplication performed on the raw values cannot
# catch them, so duplicates are removed (again) here.
df_likes.drop_duplicates(inplace=True)

In [32]:
# Row/column count of the likes after clean-up.
df_likes.shape

(797290, 2)

In [33]:
# Keep only the likes given by one of the valid users.
df_valid_likes = df_likes.loc[
    df_likes['User_Id'].isin(valid_users_for_activities)
]

In [34]:
# Preview the first five filtered likes.
df_valid_likes.head(n=5)

Unnamed: 0,User_Id,Post_Id
0,yasinde,e/2d4fcaed2d2a4ac1a36571b66da3f16c
1,socialnewsturkey,e/2d4fcaed2d2a4ac1a36571b66da3f16c
2,janzu,e/2d4fcaed2d2a4ac1a36571b66da3f16c
3,mugecerman,e/2d4fcaed2d2a4ac1a36571b66da3f16c
4,miocaro,e/2d4fcaed2d2a4ac1a36571b66da3f16c


# Clean up comments data

In [35]:
# A comment without an author cannot be matched to a user, so rows with a
# missing Posted_By are dropped (in place).
df_comments.dropna(
    subset=['Posted_By'],
    inplace=True,
)

In [36]:
# Remove rows that repeat an earlier row in all four columns, keeping the
# first occurrence. This compares the raw values, i.e. before any case
# normalisation.
df_comments.drop_duplicates(keep='first', inplace=True)

In [37]:
# Lower-case the commenter ids; Posted_By is later matched against
# valid_users_for_activities with isin(), which is case-sensitive.
# NOTE(review): the id columns of posts and likes are lower-cased too, but
# 'Post_Id' / 'Entry_Id' of the comments are left as read — confirm that is
# intended before joining comments to posts.
df_comments['Posted_By'] = df_comments['Posted_By'].str.lower()

# Rows that differed only by the casing of Posted_By become identical after
# the normalisation above. A de-duplication performed on the raw values
# cannot catch them, so duplicates are removed (again) here.
df_comments.drop_duplicates(inplace=True)

In [38]:
# Row/column count of the comments after clean-up.
df_comments.shape

(3749891, 4)

In [39]:
# Keep only the comments written by one of the valid users.
df_valid_comments = df_comments.loc[
    df_comments['Posted_By'].isin(valid_users_for_activities)
]

In [40]:
# Preview the filtered comments — the frame that is actually exported —
# in the same way the filtered posts and likes are previewed. Previewing the
# unfiltered df_comments here would not show the effect of the filter.
df_valid_comments.head()

Unnamed: 0,Post_Id,Entry_Id,Posted_By,Source_Name
0,e/ed12adf025b5491da54c4ff2c8c5377a/c/c2dbc8151...,e/ed12adf025b5491da54c4ff2c8c5377a,koenigdublin,
1,e/7f6fb13b5a99449bb9dcbb3f8693be73/c/7477fb677...,e/7f6fb13b5a99449bb9dcbb3f8693be73,ilportalinux,
2,e/624ca9226b6526ebdb69f9b46df482c7/c/32c6bf5bc...,e/624ca9226b6526ebdb69f9b46df482c7,guardianuk,
3,e/2fdf59e075094fe1847137af34eda0f7/c/eb22afe81...,e/2fdf59e075094fe1847137af34eda0f7,nahi,m.ctor.org
4,e/4d8de05f989d43a4b90bfbfc59751e1c/c/d472f7b81...,e/4d8de05f989d43a4b90bfbfc59751e1c,miocaro,


# Export data

In [41]:
# Wrap the filtered likes in a dict under the 'likes' key and pickle it
# into the working directory.
likes_data = dict(likes=df_valid_likes)

pd.to_pickle(obj=likes_data, filepath_or_buffer='user_activity_data_likes.pkl')

In [42]:
# Wrap the filtered comments in a dict under the 'comments' key and pickle it
# into the working directory.
comments_data = dict(comments=df_valid_comments)

pd.to_pickle(obj=comments_data, filepath_or_buffer='user_activity_data_comments.pkl')

In [43]:
# Wrap the combined, filtered posts in a dict under the 'posts' key and
# pickle it into the working directory.
posts_data = dict(posts=df_valid_posts)

pd.to_pickle(obj=posts_data, filepath_or_buffer='user_activity_data_posts.pkl')