In [None]:
from IPython.display import display, Markdown, Latex, HTML

import os
import pandas as pd
import csv
import subprocess

This data is available both via public download and from Toolforge. For the latter:
* Create account on Toolforge https://wikitech.wikimedia.org/wiki/Help:Toolforge/Quickstart 
* The data is in the folder `/data/project/retention/data` (provided by Danilo) and should be publicly readable.

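As for the format (both the files and this description are courtesy of user:Danilo.mac):
* Each line has the format "u1111 22344 ...", where 1111 is the user id and every following token describes one month: 22 is the two-digit year (2022), 3 is the month as a single hexadecimal digit (1-9, a, b, c) and 44 is the number of edits in that month.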

The whole sequence probably takes about an hour to run.

In [None]:
# Folder that the recursive wget download below fills with the raw per-wiki files.
raw_folder = 'pre-aggregated_editcounts/tools-static.wmflabs.org/retention/data/'
raw_extension = '.dat'
# Intermediate results: long-format csv per wiki (semi) and per-user month tables (semi2).
semi_folder = 'pre-aggregated_editcounts/semi/'
semi_folder2 = 'pre-aggregated_editcounts/semi2/'

# Later cells open files inside these folders for writing, which fails with
# FileNotFoundError on a fresh checkout unless the folders already exist.
for folder in (semi_folder, semi_folder2, 'output/'):
    os.makedirs(folder, exist_ok=True)


## Download raw files
This will be very verbose.

In [None]:
# Mirror the published data folder into pre-aggregated_editcounts/.
download_cmd = [
    'wget',
    '-N',  # timestamping: skip files that are not newer than the local copy
    '-r',  # recursive retrieval
    '-P', 'pre-aggregated_editcounts',  # directory prefix for everything downloaded
    'https://tools-static.wmflabs.org/retention/data',
]
result = subprocess.run(download_cmd)


In [None]:
# Display the CompletedProcess returned by subprocess.run above (arguments and return code of wget).
result

## Convert raw files to semi-processed
* Get all filenames in pre-aggregated_editcounts/tools-static.wmflabs.org/retention/data 
* For each file, stream it line by line and expand every line into one row per user and month
* Write those rows to a csv file in the semi-processed folder

In [None]:
# os.listdir returns entries in arbitrary order; sorting makes every run process the wikis in the same order.
filenames = sorted(f for f in os.listdir(raw_folder) if f.endswith(raw_extension))

In [None]:
def semi_process(path_in, path_out, wiki):
    '''
    Expand one raw per-wiki edit-count file into a long-format csv.

    Every raw line looks like "u1111 22344 22455 ...": a user id prefixed with one character,
    followed by one token per month. The first three characters of a token are the year-month
    code and the remainder is the edit count. Each token becomes one output row with the format
    user_id,wiki,yearmonth,count

    Input:
    path_in: string with the path to the raw file
    path_out: string with the path of the csv file to write
    wiki: string that is written into the wiki column of every row
    '''
    with open(path_in, 'r') as f, open(path_out, 'w') as w:
        w.write('user_id, wiki, yearmonth, count')
        for line in f:
            tokens = line.split()
            # Blank lines and users without any month token carry no data. Skipping them avoids
            # an IndexError here and empty rows in the output that break downstream csv parsing.
            if len(tokens) < 2:
                continue
            user_id = tokens[0][1:]  # drop the leading "u"
            for val in tokens[1:]:
                w.write('\n' + ','.join((user_id, wiki, val[:3], val[3:])))

In [None]:
# Takes a few minutes to run
for raw_name in filenames:
    # The part of the filename before the first dot doubles as the wiki name.
    wiki_name = raw_name.split('.')[0]
    source_path = raw_folder + raw_name
    target_path = semi_folder + wiki_name + '.csv'
    semi_process(path_in = source_path, path_out = target_path, wiki = wiki_name)

Some of these files are too big to ingest directly as a dataframe. Let's first convert each file into a table with one row per user and one column per month, and then keep only the users that have any edits.

In [None]:
def semi_process2(path_in, path_out, df_users, returnme = False):
    '''
    Turn a long-format csv (user_id, wiki, yearmonth, count) into a table with one row per
    username and one column per month, restricted to the users listed in df_users.

    Input:
    path_in: string with the path to the csv written by semi_process
    path_out: string with the path of the csv file to write
    df_users: pandas dataframe indexed by user id, with a 'username' column
    returnme: optional parameter to also return the resulting dataframe (it is always written to path_out)

    Every month that occurs in the input becomes a column, in order of first appearance, even if
    none of the listed users edited in it. Counts are kept as the strings read from the file.
    Users without a value in any month column are dropped.
    '''
    known_ids = set(df_users.index)
    months = {}  # dict used as an insertion-ordered set of month codes
    counts = {}  # user id -> {month code: count}
    with open(path_in, 'r') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            # Blank or truncated rows cannot be interpreted; skip them instead of raising IndexError.
            if len(row) < 4:
                continue
            months[row[2]] = None
            u_id = int(row[0])
            if u_id in known_ids:
                # A later row for the same user and month overwrites an earlier one.
                counts.setdefault(u_id, {})[row[2]] = row[3]
    df_counts = pd.DataFrame.from_dict(counts, orient='index').reindex(columns=list(months))
    # Left join on the user id index keeps every listed user; missing months stay NaN.
    df_users2 = df_users.join(df_counts, how='left')
    df_users2 = df_users2.set_index('username').dropna(axis=0, how='all')
    df_users2.to_csv(path_out)
    if returnme:
        # Already indexed by username; setting the index a second time would raise KeyError.
        return df_users2

In [None]:
# should take less than an hour to run?
df_list_ids = pd.read_csv('output/df_user_actor_id.csv')
for raw_name in filenames:
    # The part of the filename before the first dot is the wiki name used in df_list_ids.
    wiki = raw_name.split('.')[0]
    csv_name = wiki + '.csv'
    wiki_users = df_list_ids.query('wiki == @wiki')[['username', 'user_id']].set_index('user_id')
    semi_process2(
        path_in = semi_folder + csv_name,
        path_out = semi_folder2 + csv_name,
        df_users = wiki_users
    )

In [None]:
def process_usercounts(folder_in, path_out, filenames, returnme = False):
    '''
    Takes pre-processed editcounts per user per month and sums them over all wikis.
    Each input file represents one wiki, with a username per row and a month per column.
    Month columns are coded as three characters, where the first two digits represent the year
    and the third represents the month: 1,2,3,4,5,6,7,8,9,a,b,c .

    Input:
    folder_in: string with the path to the folder with the semi-processed files (including the trailing separator)
    path_out: string with the path of the output csv file
    filenames: list of strings with each string being a filename. The file extension is ignored so that the same list of filenames can be used by multiple functions.
    returnme: optional parameter to return a pandas dataframe rather than write to file.

    Output:
    A dataframe indexed by username with the month columns sorted by name if returnme is set,
    otherwise nothing (the dataframe is written to path_out).
    '''
    frames = [
        pd.read_csv(folder_in + filename.split('.')[0] + '.csv')
        for filename in filenames
    ]
    if frames:
        # Concatenate once and aggregate once, instead of re-grouping the growing result per file.
        df_out = pd.concat(frames, axis = 0).groupby('username').sum()
        df_out = df_out.reindex(sorted(df_out.columns), axis=1)
    else:
        # Without input files there is no 'username' column to group on.
        df_out = pd.DataFrame(index=pd.Index([], name='username'))
    if returnme:
        return df_out
    df_out.to_csv(path_out)

In [None]:
# this takes about 4 minutes to run
# Read from the same folder constant that the previous step wrote to, so the two cannot drift apart.
process_usercounts(
    folder_in = semi_folder2,
    path_out = 'output/df_user_editcounts.csv',
    filenames = filenames,
    returnme = False
)

In [None]:
# Load the aggregated edit counts written by the previous cell, indexed by username.
df_editcounts = pd.read_csv('output/df_user_editcounts.csv', index_col='username')
df_editcounts