In [166]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

# Load the quarterly per-repo metrics. The CSV's first two columns have no header, so
# pandas names them 'Unnamed: 0' (quarter label, format YYYYQn) and 'Unnamed: 1'
# (repo identifier -- presumably "owner/repo"; verify against the CSV).
data = pd.read_csv('./quarterly_repo_metrics.csv')

# Extract the relevant columns (numeric ones) for correlation analysis,
# dropping anything with "Unnamed" in the name
numeric_columns = [
    col
    for col in data.select_dtypes(include=['float64', 'int64']).columns
    if 'Unnamed' not in col
]

# shift(-1) only yields "the next quarter" when each repo's rows are in chronological
# order, so sort explicitly instead of relying on the CSV's row order. The YYYYQn labels
# sort chronologically as plain strings.
data = data.sort_values(['Unnamed: 1', 'Unnamed: 0'])

# Shift the numeric columns to get the next quarter's data for each row
for col in numeric_columns:
    data[f'next_{col}'] = data.groupby('Unnamed: 1')[col].shift(-1)

# Order the columns to make the output easier to read. False sorts before True, so this
# puts columns without a special suffix first, then '_growth_rate', then '_ratio', and
# '_count' columns last; ties are broken alphabetically.
numeric_columns = sorted(
    numeric_columns,
    key=lambda col: (col.endswith('_count'), col.endswith('_ratio'), col.endswith('_growth_rate'), col),
)

# Drop rows where the next quarter's data is NaN (end of the sequence for each repo)
extended_data = data.dropna(subset=[f'next_{col}' for col in numeric_columns], how='all')
# Any remaining missing values are treated as zero
data = extended_data.fillna(0)

# format is YYYYQ1, but let's just parse only the year
data['year'] = pd.to_datetime(data['Unnamed: 0'].str[:4] + '-01-01')  # quarterly
# .copy() so the column assignments below write to an independent frame instead of a
# slice of the unfiltered one (avoids SettingWithCopyWarning)
data = data[(data['year'] >= '2013-01-01') & (data['year'] <= '2023-01-01')].copy()

# data['year'] = pd.to_datetime(data['Unnamed: 0'], format="%Y") # yearly
# data = data[(data['year'] >= '2013-01-01') & (data['year'] <= '2023-01-01')]

# take log of all growth rate columns (this includes the shifted next_* growth rates)
# NOTE(review): log1p yields -inf at -1 and NaN below -1, and +/-inf growth rates pass
# through unchanged; confirm the CSV contains no such values.
for col in data.columns:
    if col.endswith('_growth_rate'):
        data[col] = np.log1p(data[col])

# Disabled: optional filtering by repo size (kept for reference; not applied to the results below)
# SMALL_REPOS = [
#     ('joshmoody24', 'sitcom-simulator'),
#     ('nodejs', 'nodejs.org'),
#     ('spacejam', 'sled'),
#     ('Auto1111SDK', 'Auto1111SDK'),
#     ('stitionai', 'devika'),
#     ('zzzprojects', 'System.Linq.Dynamic.Core'),
#     ('elnormous', 'HTTPRequest'),
# ]

# LARGE_REPOS = [
#     ('pallets', 'flask'),
#     ('facebook', 'react'),
#     ('hotwired', 'turbo'),
#     ('rails', 'rails'),
#     ('vuejs', 'core'), #
#     ('Zulko', 'moviepy'),
#     ('withastro', 'astro'),
#     ('bigskysoftware', 'htmx'),
#     ('phoenixframework', 'phoenix'),
#     ('ethereum', 'go-ethereum'),
#     ('twbs', 'bootstrap'),
#     ('django', 'django'),
# ]

# # filter data to only contain small repos
# filtered_data = pd.DataFrame()
# for owner, repo in SMALL_REPOS:
#     filtered_data = pd.concat([filtered_data, data[data['Unnamed: 1'] == f'{owner}/{repo}']])
# data = filtered_data
        
print("Total rows", len(data))

# Coefficients with a p-value below this level are reported as significant.
SIGNIFICANCE_LEVEL = 0.1

# Every target is either a next_* column or star_growth_rate, and all next_*/star columns
# are excluded from the predictors, so the predictor matrix is the same for every model.
# Build and normalize it once instead of once per target.
predictors = data.drop(['Unnamed: 0', 'Unnamed: 1', 'code_comment_count_growth_rate', 'issue_comment_count_growth_rate'], axis=1)  # remove the raw sums
# dummy code repo (Unnamed: 1) (not useful because we're interested in generalities that apply to all repos)
# predictors = pd.get_dummies(predictors, columns=['Unnamed: 1']).astype(float)
predictors = predictors.drop([col for col in predictors.columns if 'next_' in col or 'star' in col], axis=1)
predictors = predictors.select_dtypes(include=['float64', 'int64'])

# A predictor with zero (or undefined) standard deviation would turn into an all-NaN
# column when z-scored and break the fit, so drop it before normalizing.
spread = predictors.std()
constant_columns = spread.index[~(spread > 0)]
if len(constant_columns) > 0:
    print(f"Dropping constant predictors: {list(constant_columns)}")
    predictors = predictors.drop(columns=constant_columns)
    spread = spread.drop(constant_columns)

# normalize the data (z-score) so coefficient magnitudes are comparable across predictors
predictors = (predictors - predictors.mean()) / spread

for dependent_variable in ['next_' + col for col in numeric_columns] + ['star_growth_rate']:
    print(f"Model for {dependent_variable}")

    y = data[dependent_variable]
    # add_constant returns a new frame, so X can be modified per model without
    # affecting the shared predictor matrix
    X = sm.add_constant(predictors)

    model = sm.OLS(y, X).fit()

    # p-hacking (unused)
    # while True:
    #     model = sm.OLS(y, X).fit()
    #     p_values = model.pvalues.drop('const', errors='ignore')  # ignore errors in case 'const' is not present
    #     max_p_value = p_values.max()

    #     if max_p_value < 0.5 or len(X.columns) == 1: # get rid of the super lame features
    #         break

    #     # Drop the feature with the highest p-value
    #     feature_to_drop = p_values.idxmax()
    #     X = X.drop(columns=feature_to_drop)

    print(f"R-squared: {round(model.rsquared, 4)}")
    print(f"Adjusted R-squared: {round(model.rsquared_adj, 4)}")
    print("Significant Coefficients:")

    # The intercept is never reported; the remaining p-values keep the column order of X
    p_values = model.pvalues.drop('const', errors='ignore')
    significant = p_values[p_values < SIGNIFICANCE_LEVEL]
    for col, p_value in significant.items():
        print(f"{col}: {round(model.params[col], 4)} (p-value: {round(p_value, 4)})")

    if significant.empty:
        print("(no significant coefficients found)")
    print()

Total rows 452
Model for next_star_count_end
R-squared: 0.2129
Adjusted R-squared: 0.1783
Significant Coefficients:
code_comment_deprecated_ratio: 6359.9076 (p-value: 0.0007)
code_comment_explanation_ratio: 14127.8709 (p-value: 0.0)
code_comment_future work_ratio: 7050.3998 (p-value: 0.0002)
issue_comment_bug report_count_growth_rate: -3386.6348 (p-value: 0.092)
issue_comment_bug report_ratio: 4696.279 (p-value: 0.0187)
issue_comment_conclusion_ratio: 3924.7871 (p-value: 0.0423)
issue_comment_discussion_ratio: 7750.438 (p-value: 0.0001)
issue_comment_question_ratio: 11420.0423 (p-value: 0.0)
positivity_ratio: 3502.4377 (p-value: 0.0652)

Model for next_star_count_start
R-squared: 0.208
Adjusted R-squared: 0.1731
Significant Coefficients:
code_comment_deprecated_ratio: 6165.1463 (p-value: 0.0008)
code_comment_explanation_ratio: 13620.1788 (p-value: 0.0)
code_comment_future work_ratio: 6676.1897 (p-value: 0.0003)
issue_comment_bug report_count_growth_rate: -3336.2853 (p-value: 0.0893)
is

In [165]:
# Load the comments dataset; `comments` and `data` are two names for the same frame,
# and the analysis below reads it through `data`.
data = comments = pd.read_csv('github_comments_with_future_stars.csv')

# Function to run regression analysis
import statsmodels.api as sm
def run_regression_analysis(X, y):
    """Fit an ordinary least squares model of ``y`` on ``X`` and return its summary.

    Args:
        X: Predictor columns. A constant column is added with ``sm.add_constant``
            so the model includes an intercept.
        y: Response values, one per row of ``X``.

    Returns:
        The object returned by ``model.summary()`` for the fitted model; printing
        it renders the regression results table.
    """
    # Add constant to the model for the intercept
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    return model.summary()

for interval in [3]:  # could be 1, 3, 6, 12
    stars_col = f'future_stars_{interval}_month'
    growth_col = f'{stars_col}_growth_rate'

    # Keep rows with a known future star count and a usable growth rate. np.isfinite
    # rejects NaN and +/-inf (div by zero); the > -1 bound also excludes values for
    # which log1p below would return -inf or NaN and break the fit.
    growth = data[growth_col]
    usable = data[stars_col].notna() & np.isfinite(growth) & (growth > -1)
    data_testing = data[usable]

    # One-hot encode both categorical predictors, then drop one level of each
    # (NEGATIVE sentiment, "explanation" purpose) to serve as the reference category.
    X = pd.get_dummies(data_testing[['sentiment_class', 'purpose']], drop_first=False).astype(int)
    X = X.drop(columns=['sentiment_class_NEGATIVE', 'purpose_explanation'])

    # Model the log of (1 + growth rate)
    y = np.log1p(data_testing[growth_col])

    print(run_regression_analysis(X, y))


                                   OLS Regression Results                                   
Dep. Variable:     future_stars_3_month_growth_rate   R-squared:                       0.066
Model:                                          OLS   Adj. R-squared:                  0.066
Method:                               Least Squares   F-statistic:                     189.6
Date:                              Thu, 11 Apr 2024   Prob (F-statistic):               0.00
Time:                                      14:38:31   Log-Likelihood:                -15186.
No. Observations:                             24155   AIC:                         3.039e+04
Df Residuals:                                 24145   BIC:                         3.047e+04
Df Model:                                         9                                         
Covariance Type:                          nonrobust                                         
                               coef    std err          t      P>|t|  