In [1]:
# Move the kernel's working directory one level up; the relative 'drive/...'
# paths and the `source.lib.helpers` import used later are resolved from there
# (presumably the repository root -- TODO confirm).
# NOTE(review): the target is relative to the *current* directory, so running
# this cell again without restarting the kernel moves up one more level.
import os
os.chdir('../')

In [2]:
import pandas as pd
from pathlib import Path
import numpy as np
import sys
import glob
import warnings
import random
from glob import glob 
import datetime
import itertools
import time
from multiprocessing import pool
from source.lib.helpers import *

In [118]:
# Input/output locations, all relative to the working directory set in the first cell.
# Abandonment inputs: the scraped CSV and the per-specification parquet files read by the labelling functions.
indir_abandoned = Path('drive/output/derived/project_outcomes/abandoned_projects')
# Departure inputs: the specification summary CSVs and the departed-contributor parquet files.
indir_departures = Path('drive/output/derived/contributor_stats/departed_contributors')
# Destination for the filtered/labelled departure parquet files written by FilterSpecification.
outdir_departures_filtered = Path('drive/output/derived/contributor_stats/filtered_departed_contributors')

In [124]:
def FilterSpecification(df_specification, idx, time_period, rolling_window, indir_departures, indir_abandoned, outdir_departures_filtered):
    """Build, label and save the departure windows for one specification row.

    The row ``idx`` of ``df_specification`` selects which departed-contributor
    parquet file to load. That file is reduced to one departure window per
    (repo, contributor) by ``CleanDepartures``, labelled by
    ``LabelScrapedAbandonment`` and ``LabelDataExtractedAbandonment``, enriched
    with a few convenience columns, written to ``outdir_departures_filtered``
    and returned.

    Parameters
    ----------
    df_specification : pandas.DataFrame
        Specification table with the columns ``criteria_pct``,
        ``consecutive_periods``, ``post_period_length``, ``decline_type`` and
        ``decline_stat``.
    idx : index label
        Label of the row of ``df_specification`` to process.
    time_period, rolling_window
        Values interpolated into the input/output file names; ``time_period``
        is also forwarded to ``LabelDataExtractedAbandonment``.
    indir_departures, indir_abandoned, outdir_departures_filtered : pathlib.Path
        Input directory for departures, input directory for abandonment data,
        and output directory. The output directory is created if missing.

    Returns
    -------
    pandas.DataFrame
        The labelled departure windows that were written to disk.
    """
    # Read the specification from the frame that was passed in. The previous
    # body referenced a notebook-level global of a slightly different name, so
    # the argument was ignored and the function failed on a fresh kernel unless
    # that global happened to exist.
    criteria_pct = df_specification.loc[idx, 'criteria_pct']
    consecutive_periods = df_specification.loc[idx, 'consecutive_periods']
    post_period_length = df_specification.loc[idx, 'post_period_length']
    decline_type = df_specification.loc[idx, 'decline_type']
    decline_stat = df_specification.loc[idx, 'decline_stat']
    # An int formats as e.g. "0" instead of "0.0" in the file name below
    # (presumably matching how the upstream files were named -- TODO confirm).
    if decline_stat == 0 or decline_type == "threshold_gap_qty":
        decline_stat = int(decline_stat)

    # Input and output files share everything after their prefix.
    file_suffix = (
        f'major_months{time_period}_window{rolling_window}D_criteria_commits_{criteria_pct}pct'
        f'_consecutive{consecutive_periods}_post_period{post_period_length}_{decline_type}_{decline_stat}.parquet'
    )

    df_departed = pd.read_parquet(indir_departures / f'departed_contributors_{file_suffix}')
    df_departure_range = CleanDepartures(df_departed, decline_type)
    df_departure_range = LabelScrapedAbandonment(df_departure_range, indir_abandoned)
    df_departure_range = LabelDataExtractedAbandonment(df_departure_range, indir_abandoned, time_period)

    # Number of (non-null) departed contributors recorded for each repository.
    df_departure_range['repo_count'] = df_departure_range.groupby('repo_name')['actor_id'].transform('count')
    # Split the two-element time_range into its first and second entries.
    df_departure_range['last_pre_period'] = df_departure_range['time_range'].apply(lambda x: x[0])
    df_departure_range['treatment_period'] = df_departure_range['time_range'].apply(lambda x: x[1])

    # Writing fails if the directory does not exist yet, so create it first.
    outdir_departures_filtered.mkdir(parents=True, exist_ok=True)
    df_departure_range.to_parquet(outdir_departures_filtered / f'filtered_departed_contributors_{file_suffix}')

    return df_departure_range

def CleanDepartures(df_departed, decline_type):
    """Reduce per-period departure rows to one two-date window per contributor.

    For every (repo_name, actor_id) the row whose ``time_period`` equals
    ``final_period`` provides ``final_index``. Rows are then positioned
    relative to it (``relative_time = grouped_index - final_index - 1``) and
    the rows at relative time -1 and 0 are collected into ``time_range``, a
    list of two ``datetime.date`` values.

    When ``decline_type`` is ``"threshold_gap_qty"`` only rows flagged by
    ``below_qty_mean_gap0`` or ``below_qty_mean_gap1`` are kept, and
    ``final_index`` is moved one step later on rows where
    ``below_qty_mean_gap0`` is not 1.

    Returns a frame with the columns ``repo_name``, ``actor_id`` and
    ``time_range``.

    NOTE(review): the two dates are taken in row order, so this relies on the
    input being chronologically ordered within each contributor, and a
    contributor with fewer than two rows in the window raises ``IndexError``.
    """
    gap_based = decline_type == "threshold_gap_qty"

    if gap_based:
        df_departed = df_departed.query('below_qty_mean_gap0 == 1 | below_qty_mean_gap1 == 1')

    # Look up, per contributor, the grouped_index of the period marked as final.
    final_index_lookup = (
        df_departed.query('time_period == final_period')[['repo_name', 'actor_id', 'grouped_index']]
        .rename(columns={'grouped_index': 'final_index'})
    )
    df_departed = df_departed.merge(final_index_lookup)

    if gap_based:
        def _shift_final_index(row):
            # Keep the index when gap0 is flagged, otherwise move one step later.
            if row['below_qty_mean_gap0'] == 1:
                return row['final_index']
            return row['final_index'] + 1

        df_departed['final_index'] = df_departed.apply(_shift_final_index, axis=1)

    # Recompute final_period from the (possibly shifted) final_index.
    final_period_lookup = (
        df_departed.query('grouped_index == final_index')[['actor_id', 'repo_name', 'time_period']]
        .rename(columns={'time_period': 'final_period'})
    )
    df_departed = df_departed.drop(columns='final_period').merge(final_period_lookup)

    df_departed['first_post_period_index'] = df_departed['final_index'] + 1
    df_departed['relative_time'] = df_departed['grouped_index'] - df_departed['final_index'] - 1
    df_departed['time_period'] = pd.to_datetime(df_departed['time_period'])

    # Keep only the two periods straddling the departure and gather their dates.
    window_rows = df_departed.query('relative_time == -1 | relative_time == 0')
    df_departure_range = (
        window_rows.groupby(['repo_name', 'actor_id'])
        .agg({'time_period': list})
        .reset_index()
    )
    df_departure_range['time_range'] = df_departure_range['time_period'].apply(
        lambda periods: [periods[0].date(), periods[1].date()])
    return df_departure_range.drop(columns='time_period')

def LabelScrapedAbandonment(df_departure_range, indir_abandoned):
    """Attach scraped abandonment dates and flag those inside the departure window.

    Reads ``scraped_abandoned_repo_data.csv`` from ``indir_abandoned``, keeps
    the rows whose ``status`` is ``"abandoned"``, and left-merges their
    ``abandoned_date`` (as ``datetime.date``) onto ``df_departure_range``.
    The new boolean column ``abandoned_scraped`` is True when a date exists and
    lies between the two entries of ``time_range`` (both ends inclusive).

    Returns the merged frame; rows without a scraped date keep a missing
    ``abandoned_date`` and get ``abandoned_scraped = False``.
    """
    def _as_date(timestamp):
        return timestamp.date()

    def _abandoned_inside_window(row):
        # Rows with no scraped date can never fall inside the window.
        if pd.isnull(row['abandoned_date']):
            return False
        return row['time_range'][0] <= row['abandoned_date'] <= row['time_range'][1]

    scraped_path = indir_abandoned / 'scraped_abandoned_repo_data.csv'
    df_abandoned_scraped = pd.read_csv(scraped_path, index_col=0).query('status == "abandoned"')
    df_abandoned_scraped['abandoned_date'] = pd.to_datetime(df_abandoned_scraped['abandoned_date']).apply(_as_date)

    # The merge key is whatever columns the two frames share (no explicit `on`).
    scraped_dates = df_abandoned_scraped[['repo_name', 'abandoned_date']]
    df_departure_range = df_departure_range.merge(scraped_dates, how='left')
    df_departure_range['abandoned_scraped'] = df_departure_range.apply(_abandoned_inside_window, axis=1)
    return df_departure_range

def LabelDataExtractedAbandonment(df_departure_range, indir_abandoned, time_period):
    """Add data-derived abandonment dates and flags for every abandonment spec.

    For each combination of ``consecutive`` in (2, 3, 4) and ``permanent`` in
    (True, False) the matching ``abandoned_projects_*.parquet`` file is read
    from ``indir_abandoned`` and left-merged onto ``df_departure_range``. Per
    combination this adds:

    * ``abandoned_date_consecutive_req{c}_permanent{p}`` - the abandonment date
      as ``datetime.date`` (missing when the merge finds no match);
    * ``abandoned_consecutive_req{c}_permanent{p}`` - True when that date
      equals the second entry of ``time_range``;
    * ``abandoned_within_{k}periods_...`` for k in (1, 2, 3) - True when the
      date is on or before the second entry of ``time_range`` plus
      ``k * time_period`` months.

    Returns the frame with all added columns.
    """
    def _as_date(timestamp):
        return timestamp.date()

    def _abandoned_at_window_end(row, date_col):
        # A missing date means the project was not flagged under this spec.
        if pd.isnull(row[date_col]):
            return False
        return row[date_col] == row['time_range'][1]

    def _abandoned_by_horizon(row, date_col, months_ahead):
        if pd.isnull(row[date_col]):
            return False
        # date + DateOffset yields a Timestamp; convert back to a date to compare.
        return row[date_col] <= (row['time_range'][1] + pd.DateOffset(months=months_ahead)).date()

    spec_grid = [(req, perm) for req in [2, 3, 4] for perm in [True, False]]
    for consecutive, permanent in spec_grid:
        abandoned_date_col = f'abandoned_date_consecutive_req{consecutive}_permanent{permanent}'

        df_abandoned_data = pd.read_parquet(indir_abandoned / f'abandoned_projects_consecutive_req{consecutive}_permanent{permanent}.parquet')
        df_abandoned_data['abandoned_date'] = pd.to_datetime(df_abandoned_data['abandoned_date']).apply(_as_date)
        # Rename so every spec contributes its own date column; the merge key is
        # whatever columns the two frames share (no explicit `on`).
        renamed_dates = df_abandoned_data.rename(columns={'abandoned_date': abandoned_date_col})
        df_departure_range = df_departure_range.merge(renamed_dates, how='left')

        df_departure_range[f'abandoned_consecutive_req{consecutive}_permanent{permanent}'] = df_departure_range.apply(
            _abandoned_at_window_end, axis=1, args=(abandoned_date_col,))

        for periods_after in [1, 2, 3]:
            df_departure_range[f'abandoned_within_{periods_after}periods_consecutive_req{consecutive}_permanent{permanent}'] = df_departure_range.apply(
                _abandoned_by_horizon, axis=1, args=(abandoned_date_col, periods_after * time_period))

    return df_departure_range

In [125]:
%%time
# Driver: for every (time_period, rolling_window) pair, load the matching
# specification summary, keep the rows whose criteria_col is "commits", and run
# FilterSpecification once per remaining row (each call writes one parquet file).
for time_period in [6]: # only 6 is run here; the other values previously listed were [2,3,6]
    for rolling_window in [732, 1828]:
        # Progress marker: one line per (time_period, rolling_window) pair.
        print(time_period, rolling_window)
        df_specifications = pd.read_csv(indir_departures / f'departed_contributors_specification_summary_major_months{time_period}_window{rolling_window}D.csv').query('criteria_col == "commits"')
        # Iterate over index labels (not positions): .query() keeps the original labels.
        for idx in df_specifications.index:
            # The return value is only kept for interactive inspection; after the
            # loop it holds the result of the last specification processed.
            df_departure_range = FilterSpecification(df_specifications, idx, time_period, rolling_window, indir_departures, indir_abandoned, outdir_departures_filtered)

6 732
6 1828
CPU times: user 1min 18s, sys: 2.84 s, total: 1min 21s
Wall time: 1min 16s


In [None]:
# TODO: filter out abandoned projects
# TODO: mark projects that consist of exactly one repo

Unnamed: 0,repo_name,actor_id,time_range,abandoned_date,abandoned_scraped,abandoned_date_consecutive_req2_permanentTrue,abandoned_consecutive_req2_permanentTrue,abandoned_within_1periods_consecutive_req2_permanentTrue,abandoned_within_2periods_consecutive_req2_permanentTrue,abandoned_within_3periods_consecutive_req2_permanentTrue,...,abandoned_consecutive_req4_permanentTrue,abandoned_within_1periods_consecutive_req4_permanentTrue,abandoned_within_2periods_consecutive_req4_permanentTrue,abandoned_within_3periods_consecutive_req4_permanentTrue,abandoned_date_consecutive_req4_permanentFalse,abandoned_consecutive_req4_permanentFalse,abandoned_within_1periods_consecutive_req4_permanentFalse,abandoned_within_2periods_consecutive_req4_permanentFalse,abandoned_within_3periods_consecutive_req4_permanentFalse,repo_count
0,02strich/pykerberos,44383.0,"[2016-01-01, 2016-07-01]",,False,,False,False,False,False,...,False,False,False,False,,False,False,False,False,1
1,23andMe/Yamale,757503.0,"[2015-07-01, 2016-01-01]",,False,,False,False,False,False,...,False,False,False,False,,False,False,False,False,1
2,3-manifolds/Spherogram,5454848.0,"[2023-01-01, 2023-07-01]",,False,,False,False,False,False,...,False,False,False,False,,False,False,False,False,1
3,4teamwork/ftw.simplelayout,437933.0,"[2016-07-01, 2017-01-01]",,False,,False,False,False,False,...,False,False,False,False,,False,False,False,False,1
4,4teamwork/ftw.upgrade,7469.0,"[2018-01-01, 2018-07-01]",,False,,False,False,False,False,...,False,False,False,False,,False,False,False,False,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6147,zopefoundation/zope.testrunner,159967.0,"[2016-07-01, 2017-01-01]",,False,,False,False,False,False,...,False,False,False,False,,False,False,False,False,1
6148,zpoint/idataapi-transform,18083296.0,"[2018-07-01, 2019-01-01]",,False,,False,False,False,False,...,False,False,False,False,,False,False,False,False,1
6149,zulip/python-zulip-api,142908.0,"[2018-01-01, 2018-07-01]",,False,,False,False,False,False,...,False,False,False,False,,False,False,False,False,3
6150,zulip/python-zulip-api,7950151.0,"[2018-01-01, 2018-07-01]",,False,,False,False,False,False,...,False,False,False,False,,False,False,False,False,3
