In [9]:
import os
# Work from the parent of the notebook's folder so the relative paths used by the
# cells below ('output/...', 'drive/...', 'issue/...') resolve from there.
# The starting directory is remembered on first execution: a bare os.chdir('../')
# would climb one more level every time this cell is re-run in the same kernel.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, '../'))

In [10]:
import pandas as pd
from source.lib.helpers import *
import ast
import glob
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [11]:
# Presumably a cutoff date for the download data (inferred from the name only);
# it is not referenced by any cell visible in this part of the notebook.
download_post_date = '2018-09-01'
# Passed as the second argument to ImputeTimePeriod, ProcessScorecard and
# ProcessSoftwareDownloads below.
# NOTE(review): looks like a window length in months -- confirm against source.lib.helpers.
time_period = 6

In [17]:
# Inputs: the PyPI package -> GitHub repository mapping (CSV) and the scraped
# monthly download counts (parquet).
pypi_github_mapping_path = 'output/derived/collect_github_repos/linked_pypi_github.csv'
monthly_downloads_path = 'drive/output/scrape/pypi_monthly_downloads/pypi_monthly_downloads.parquet'

df_pypi_github_mapping = pd.read_csv(pypi_github_mapping_path)
monthly_downloads = pd.read_parquet(monthly_downloads_path)

In [18]:
# Attach monthly download counts to every PyPI package that has a known GitHub
# repository, bucket the months into time periods, and total downloads per
# (repo, time period, project).
# NOTE(review): pd.merge is called without `on=`, so it joins on every column
# name the two frames share after the rename (expected to be 'project' -- confirm).
df_github_downloads = (
    pd.merge(df_pypi_github_mapping.rename(columns={'package': 'project'}), monthly_downloads)
    .loc[lambda d: d['github repository'] != 'Unavailable']
    .rename(columns={'github repository': 'repo_name', 'month': 'created_at'})
    .pipe(ImputeTimePeriod, time_period)
    .groupby(['repo_name', 'time_period', 'project'])['num_downloads'].sum()
    .reset_index()
)

# Collapse to one row per (repo, time period). A repository can back several
# PyPI projects, so keep both the summed downloads and the largest single project.
df_github = df_github_downloads.groupby(['repo_name', 'time_period']).agg(
    total_downloads=('num_downloads', 'sum'),
    total_downloads_one_project=('num_downloads', 'max'),
    # Sorted list rather than a raw set: set iteration order for strings changes
    # between interpreter runs, which made the exported column non-reproducible.
    unique_projects=('project', lambda x: sorted(set(x))),
    project_count=('project', 'nunique')
).reset_index()
df_github.to_parquet('issue/github_downloads.parquet')

In [None]:
def FlattenScorecard(row_str):
    """
    Flatten one serialized scorecard record into a flat dict of columns.

    Parameters:
      row_str: String holding a Python-literal dict (parsed with ast.literal_eval),
               or a null value (None / NaN).

    Returns:
      None when row_str is null. Otherwise a dict with:
        - "score": the record's top-level "score" (None if absent)
        - "repo_name": the "name" entry of the record's "repo" dict
          (None if "repo" or its "name" is absent)
        - for every check with a non-empty name, four keys
          "<check>_name", "<check>_score", "<check>_reason", "<check>_details",
          where <check> is the check name lower-cased with '-' and ' ' replaced by '_'.
    """
    if pd.isnull(row_str):
        return
    row = ast.literal_eval(row_str)
    # `or {}` / `or []` cover both a missing key and an explicit None value, so a
    # record without repo/check information no longer raises.
    repo = row.get("repo") or {}
    data = {"score": row.get("score"),
            "repo_name": repo.get("name")}
    for check in row.get("checks") or []:
        raw_name = check.get("name") or ""
        name = raw_name.lower().replace("-", "_").replace(" ", "_")
        if name:
            data[f"{name}_name"] = raw_name
            data[f"{name}_score"] = check.get("score")
            data[f"{name}_reason"] = check.get("reason")
            data[f"{name}_details"] = check.get("details")
    return data

# Read every weekly scorecard CSV, flatten its `scorecard_data` payload into
# columns, and stack all files into a single frame.
dfs = []
# sorted(): glob returns paths in filesystem-dependent order; sorting keeps the
# row order of the exported parquet reproducible between runs.
for file in sorted(glob.glob("drive/output/scrape/get_weekly_scorecard_data/scorecard/*.csv")):
    # The CSV is transposed on load; the first row of the transposed frame is
    # then promoted to the header and dropped from the data.
    df = pd.read_csv(file).T
    df.columns = df.iloc[0]
    df = df[1:].reset_index(drop=True)
    # Files without a scorecard_data column are skipped entirely.
    if 'scorecard_data' in df.columns:
        flat = df["scorecard_data"].apply(FlattenScorecard).apply(pd.Series)
        dfs.append(pd.concat([df, flat], axis=1).assign(source_file=file))
        print(file)
df_scorecard = pd.concat(dfs, ignore_index=True)
df_scorecard.to_parquet('issue/github_scorecard_full.parquet')

In [None]:
# Export the detailed (per-check) scorecard data,
# and also export the aggregated per-repo score data.

In [None]:
def ProcessScorecard(df_scorecard, time_period, trend_threshold=1):
    """
    Aggregate raw scorecard rows to one row per (time_period, repo_name) and flag
    how the overall score moved relative to the repo's previous time period.

    Steps:
      - Parse 'date' into 'created_at' and bucket rows with ImputeTimePeriod.
      - Average 'overall_score' (a copy of 'score') and every column whose name
        contains '_score' within each (time_period, repo_name) group.
      - Compute 'overall_score_change' as the absolute difference (not a
        percentage) between consecutive time periods of the same repo.
      - Label the change 'increase' when it is >= trend_threshold, 'decrease'
        when it is <= -trend_threshold, otherwise 'stable'. The first period of
        each repo has no previous value and gets a trend of None.

    Parameters:
      df_scorecard: DataFrame containing raw scorecard data. It is not modified.
      time_period: Value forwarded unchanged to ImputeTimePeriod.
      trend_threshold: Minimum absolute score change that counts as an increase
        or decrease. Defaults to 1 (score points).

    Returns:
      A DataFrame aggregated by time period and repo with mean scores, the score
      change, the trend label, and 0/1 indicator columns 'overall_increase',
      'overall_decrease' and 'overall_stable'.
    """
    # .assign builds a new frame, so the caller's DataFrame does not silently
    # gain a 'created_at' column.
    scorecard = df_scorecard.assign(created_at=pd.to_datetime(df_scorecard['date']))
    scorecard = ImputeTimePeriod(scorecard, time_period)
    scorecard = scorecard.drop(columns=['time', 'commit_sha', 'date', 'week', 'year', 'scorecard_data'])
    scorecard = scorecard.assign(overall_score=scorecard['score'])

    # Average the overall score plus every per-check '<check>_score' column.
    # ('overall_score' itself also matches the '_score' test; the duplicate is harmless.)
    agg_cols = {'overall_score': 'mean'}
    agg_cols.update({col: 'mean' for col in scorecard.columns if '_score' in col})

    df_scorecard_scores = scorecard.groupby(['time_period', 'repo_name']).agg(agg_cols).reset_index()
    # Sort so that diff() compares each period with the immediately preceding one.
    df_scorecard_scores = df_scorecard_scores.sort_values(by=['repo_name', 'time_period'])
    df_scorecard_scores['overall_score_change'] = df_scorecard_scores.groupby('repo_name')['overall_score'].diff()

    def ClassifyTrend(change):
        if pd.isna(change):
            return None
        if change >= trend_threshold:
            return "increase"
        if change <= -trend_threshold:
            return "decrease"
        return "stable"

    df_scorecard_scores['overall_trend'] = df_scorecard_scores['overall_score_change'].apply(ClassifyTrend)
    # Indicator columns; rows with a None trend are 0 in all three.
    for label in ('increase', 'decrease', 'stable'):
        df_scorecard_scores[f'overall_{label}'] = (df_scorecard_scores['overall_trend'] == label).astype(int)
    return df_scorecard_scores


In [None]:
# Aggregate the flattened scorecard rows and persist the result.
scorecard_scores_path = 'issue/github_scorecard_scores.parquet'
df_scorecard_scores = ProcessScorecard(df_scorecard, time_period)
df_scorecard_scores.to_parquet(scorecard_scores_path)

In [2]:
def ReadParquetHandlingDbdate(file_path):
    """
    Load a parquet file into pandas, converting "dbdate"-typed columns to strings.

    Any column whose Arrow type name contains "dbdate" (case-insensitive) is cast
    to an Arrow string column; all other columns are passed through unchanged.
    The table is then rebuilt from the column arrays and the original column
    names before being converted to a pandas DataFrame.
    NOTE(review): "dbdate" presumably comes from a BigQuery date extension
    dtype that the plain pandas conversion cannot handle -- confirm.

    Parameters:
      file_path: Path of the parquet file to read.

    Returns:
      pandas DataFrame with the file's contents.
    """
    source_table = pq.read_table(file_path)

    def _plain_column(position, field):
        column = source_table.column(position)
        is_dbdate = "dbdate" in str(field.type).lower()
        return column.cast(pa.string()) if is_dbdate else column

    converted = [_plain_column(pos, fld) for pos, fld in enumerate(source_table.schema)]
    rebuilt = pa.Table.from_arrays(converted, source_table.schema.names)
    return rebuilt.to_pandas()

def ProcessSoftwareDownloads(file_path, time_period):
    """
    Summarise, per time period, the new releases found in one project's download file.

    Steps:
      1) Keep one record per library_version: rows are sorted by created_at and
         grouped by version, so each version is dated by its earliest appearance.
      2) Bucket those first appearances with ImputeTimePeriod and, for every time
         period except the earliest one, count new releases overall and by
         release type (major, minor, patch, other).
      3) Record num_downloads of the most recently first-seen release in the
         period, per type and for major/minor/patch combined.

    Parameters:
      file_path: Path to the input file (parquet format). It must provide
        'project', 'library_version', 'num_downloads' and either 'date' or
        'created_at'.
      time_period: Value forwarded unchanged to ImputeTimePeriod.

    Returns:
      A summary DataFrame with one row per time period (earliest period excluded)
      holding the release counts, the latest-download figures and a 'project'
      column taken from the file's first row. An empty input file yields an
      empty DataFrame with only the 'project' column.
    """
    df = ReadParquetHandlingDbdate(file_path)
    if df.empty:
        # No first row to read the project name from and nothing to summarise.
        return pd.DataFrame(columns=['project'])
    project = df['project'].iloc[0]
    if 'date' in df.columns:
        df = df.rename(columns={'date': 'created_at'})
    df['created_at'] = pd.to_datetime(df['created_at'])

    def ClassifyReleaseType(release_version):
        # Null / non-string versions cannot be parsed; classify them as "other"
        # instead of failing on .split().
        if not isinstance(release_version, str):
            return "other"
        parts = release_version.split('.')
        # Any non-numeric component (pre-releases, suffixes, empty parts) -> "other".
        if not all(p.isdigit() for p in parts):
            return "other"
        if len(parts) == 2:
            return "major" if parts[1] == "0" else "minor"
        if len(parts) >= 3:
            if parts[1] == "0" and parts[2] == "0":
                return "major"
            if parts[2] == "0":
                return "minor"
            return "patch"
        # A single numeric component (e.g. "3") has no minor part to inspect.
        return "other"

    df['release_type'] = df['library_version'].apply(ClassifyReleaseType)

    # One row per version, taken from its earliest record.
    # NOTE(review): groupby(...).first() returns the first non-null value of each
    # column, so with missing values a row can combine fields from different
    # records -- confirm this is acceptable for num_downloads.
    df_first = df.sort_values('created_at').groupby('library_version', as_index=False).first()
    df_first = ImputeTimePeriod(df_first, time_period)
    time_periods = sorted(df_first['time_period'].unique())

    summary_list = []
    # The earliest period is skipped: with nothing before it, every version seen
    # there would be counted as new.
    for tp in time_periods[1:]:
        df_tp = df_first[df_first['time_period'] == tp]
        release_types = df_tp['release_type']

        # Newest first, so the first row per release type is the latest release.
        df_sorted = df_tp.sort_values('created_at', ascending=False)
        df_valid = df_sorted[df_sorted['release_type'].isin(['major', 'minor', 'patch'])]
        latest_overall_downloads = df_valid.iloc[0]['num_downloads'] if not df_valid.empty else 0
        latest_dict = (
            df_sorted.drop_duplicates(subset=['release_type'])
            .set_index('release_type')['num_downloads']
            .to_dict()
        )

        summary_list.append({
            'time_period': tp,
            'overall_new_release_count': len(df_tp),
            'major_new_release_count': (release_types == 'major').sum(),
            'minor_new_release_count': (release_types == 'minor').sum(),
            'patch_new_release_count': (release_types == 'patch').sum(),
            'other_new_release_count': (release_types == 'other').sum(),
            'major_minor_release_count': release_types.isin(['major', 'minor']).sum(),
            'major_minor_patch_release_count': release_types.isin(['major', 'minor', 'patch']).sum(),
            'latest_major_downloads': latest_dict.get('major', 0),
            'latest_minor_downloads': latest_dict.get('minor', 0),
            'latest_mmp_downloads': latest_overall_downloads # excludes others
        })

    summary_df = pd.DataFrame(summary_list)
    summary_df['project'] = project

    return summary_df


In [16]:
# Build the per-release download summary for every scraped project file, map the
# projects to their GitHub repositories, and export the result.
indir_project_downloads = "drive/output/scrape/pypi_package_downloads"
# sorted(): glob order depends on the filesystem; sorting keeps the processing
# order reproducible between runs.
file_list = sorted(glob.glob(os.path.join(indir_project_downloads, "*.parquet")))
summary_dfs = []
for file_path in file_list:
    summary_df = ProcessSoftwareDownloads(file_path, time_period)
    summary_dfs.append(summary_df)

combined_downloads_summary = pd.concat(summary_dfs, ignore_index=True)
# NOTE(review): pd.merge without `on=` joins on every shared column name
# (expected to be 'project' -- confirm).
# The chain replaces an in-place drop on a boolean-filtered frame, which operated
# on a slice and could trigger pandas' SettingWithCopy handling.
combined_downloads_summary = (
    pd.merge(df_pypi_github_mapping.rename(columns={'package': 'project'}), combined_downloads_summary)
    .loc[lambda d: d['github repository'] != 'Unavailable']
    .drop(columns=['project', 'license'])
    .rename(columns={'github repository': 'repo_name'})
)
combined_downloads_summary.to_parquet('issue/github_downloads_detailed.parquet')