In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import heapq
from collections import namedtuple

In [2]:
all_profiles = pd.read_csv('data/instagram_posts.csv', sep='\t', usecols=['profile_id', 'cts'], parse_dates=['cts'])

In [3]:
all_profiles = all_profiles[all_profiles.cts.notnull()]

In [4]:
profiles = all_profiles.profile_id.unique()

In [5]:
Profile = namedtuple('Profile', ['id', 'min_post_time', 'max_post_time', 'n_posts', 'avg_post_time'])

In [32]:
from queue import PriorityQueue
top_n = PriorityQueue()

In [25]:
top_n.put(1)
top_n.put(3)
top_n.put(5)
top_n.put(2)

In [31]:
top_n.get()

5

In [35]:
n = 3
for profile_id in tqdm(profiles):
    user_posts = all_profiles[all_profiles.profile_id == profile_id]
    min_post_time = user_posts.cts.min().timestamp()
    max_post_time = user_posts.cts.max().timestamp()
    n_posts = len(user_posts)
    avg_post_time = (max_post_time - min_post_time) / n_posts
    
    if avg_post_time == 0.0:
        continue
        
    profile = Profile(profile_id, min_post_time, max_post_time, n_posts, avg_post_time)
    
    top_n.put((-avg_post_time, profile))
    
    if top_n.qsize() == n:
        top_n.get()

  0%|                                                                      | 2310/11833766 [02:09<184:47:30, 17.78it/s]


KeyboardInterrupt: 

In [None]:
everything = pd.read_csv('data/instagram_posts.csv', sep='\t', usecols=['profile_id', ])

In [5]:
len(profiles.profile_id.unique())

11833766

In [2]:
posts = pd.read_csv('data/instagram_posts.csv', sep='\t', nrows=100000, usecols=['profile_id', 'cts'], parse_dates=['cts'])

In [17]:
unique_profiles = len(all_profiles.profile_id.unique())

post_times = np.empty((unique_profiles, 4))

profile_id_to_row = {}

len_posts = 0
min_post_time_idx = 1
max_post_time_idx = 2
n_posts_idx = 3

for posts in tqdm(pd.read_csv('data/instagram_posts.csv', sep='\t', chunksize=50000, usecols=['profile_id', 'cts'], parse_dates=['cts'])):

    posts = posts[posts.cts.notnull()]
    profiles = posts.profile_id.unique()
    
    for profile_id in profiles:
        user_posts = posts[posts.profile_id == profile_id]
        current_min_post_time = user_posts.cts.min().timestamp()
        current_max_post_time = user_posts.cts.max().timestamp()
        n_posts = len(user_posts)

        try:
            row = profile_id_to_row[profile_id]
            post_times[row][min_post_time_idx] = min(current_min_post_time, post_times[index][min_post_time_idx])
            post_times[row][max_post_time_idx] = max(current_max_post_time, post_times[index][max_post_time_idx])
            post_times[row][n_posts_idx] += n_posts

        except KeyError:
            # if no entry exists, create one
            new_row = np.array([profile_id, current_min_post_time, current_max_post_time, n_posts])
            post_times[len_posts] = new_row
            profile_id_to_row[profile_id] = len_posts
            len_posts += 1
            

2it [01:10, 35.05s/it]


KeyboardInterrupt: 

In [154]:
post_times[:len_posts, :]

array([[2.23794778e+09, 1.50205002e+09, 1.50205002e+09, 1.00000000e+00],
       [5.57933502e+09, 1.49786468e+09, 1.49786468e+09, 1.00000000e+00],
       [3.13429634e+08, 1.48926270e+09, 1.48926270e+09, 1.00000000e+00],
       ...,
       [9.17891449e+08, 1.50127174e+09, 1.50127174e+09, 1.00000000e+00],
       [1.48713668e+09, 1.52864187e+09, 1.52864187e+09, 1.00000000e+00],
       [2.47983272e+08, 1.53353762e+09, 1.53353762e+09, 1.00000000e+00]])

In [126]:
post_times = np.empty((len(posts.profile_id.unique()), 4))

len_posts = 0

min_post_time_idx = 1
max_post_time_idx = 2
n_posts_idx = 3

for profile_id in posts.profile_id.unique():
    user_posts = posts[posts.profile_id == profile_id]
    current_min_post_time = user_posts.cts.min().timestamp()
    current_max_post_time = user_posts.cts.max().timestamp()
    n_posts = len(user_posts)

    found_row = np.where(post_times[:, 0] == profile_id)[0]

    if found_row.size == 0:
        # if no entry exists, create one
        new_row = np.array([profile_id, current_min_post_time, current_max_post_time, n_posts])
        post_times[len_posts] = new_row
        len_posts += 1                               

    else:
        index = found_row[0]
        post_times[index][min_post_time_idx] = min(current_min_post_time, post_times[index][min_post_time_idx])
        post_times[index][max_post_time_idx] = max(current_max_post_time, post_times[index][max_post_time_idx])
        post_times[index][n_posts_idx] += n_posts

In [127]:
post_times = pd.DataFrame(alll, columns=['profile_id', 'min_post_time', 'max_post_time', 'n_posts'])

post_times['avg_post_interval'] = (post_times.max_post_time - post_times.min_post_time) / post_times.n_posts
post_times = post_times[post_times.avg_post_interval > 0]

Unnamed: 0,profile_id,min_post_time,max_post_time,n_posts,avg_post_interval
24991,5.567932e+08,1.554047e+09,1.554047e+09,2.0,0.5
45590,5.641679e+08,1.558413e+09,1.558413e+09,2.0,2.0
17950,1.360019e+10,1.558739e+09,1.558739e+09,2.0,2.5
44207,8.968048e+09,1.549372e+09,1.549372e+09,2.0,3.5
58204,1.398241e+10,1.559143e+09,1.559143e+09,3.0,4.0
...,...,...,...,...,...
31870,1.131132e+06,1.379091e+09,1.503439e+09,2.0,62173750.0
14392,5.484254e+07,1.404294e+09,1.531494e+09,2.0,63599684.0
58671,7.605579e+06,1.384339e+09,1.525366e+09,2.0,70513521.5
15485,7.794597e+06,1.417660e+09,1.559062e+09,2.0,70700952.5


In [141]:
top = post_times.sort_values(by=['avg_post_interval'])