In [1]:
import pandas as pd

star_history_path = 'github_star_history.csv'
data = pd.read_csv('final_dataset_samplesize_1500.csv')
star_history = pd.read_csv(star_history_path)

# Apply both regex patterns to remove the timezone strings from the 'Date' column
star_history['Date'] = star_history['Date'].str.replace(r'\s+\(Mountain.*Time\)', '', regex=True)
star_history['Date'] = star_history['Date'].str.replace(r'\s+GMT-.*$', '', regex=True)

# Parse the dates
star_history['Date'] = pd.to_datetime(star_history['Date'], format='%a %b %d %Y %H:%M:%S', errors='coerce')
data['timestamp'] = pd.to_datetime(data['timestamp'], errors='coerce')

data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,type,status,text,issue_id,repo,interpolated_stars,category,purpose,sentiment_class
0,0,2024-03-01,code_comment,,if model_path is None:,,Auto1111SDK/Auto1111SDK,312,code,explanation,NEGATIVE
1,1,2024-03-05,code_comment,,"pipe = StableDiffusionPipeline(""model.safetens...",,Auto1111SDK/Auto1111SDK,316,code,explanation,NEGATIVE
2,2,2024-02-29,code_comment,,"""mask"": ""Placeholder for mask array data""",,Auto1111SDK/Auto1111SDK,312,code,explanation,NEGATIVE
3,3,2024-01-29,code_comment,,Check if the URL format is correct,,Auto1111SDK/Auto1111SDK,12,code,explanation,NEGATIVE
4,4,2024-03-05,code_comment,,"negative_prompt = ""(deformed iris, deformed pu...",,Auto1111SDK/Auto1111SDK,316,code,explanation,NEGATIVE


In [64]:
import pandas as pd
from tqdm import tqdm

def find_future_star_count(row, months_ahead):
    repo = row['repo']
    date = row['timestamp']
    future_date = date + pd.DateOffset(months=months_ahead)
    repo_history = star_history[star_history['Repository'] == repo].sort_values('Date')
    
    if repo_history.empty:
        return None

    repo_history.set_index('Date', inplace=True)

    if future_date > repo_history.index[-1]:
        return None

    idx = repo_history.index.searchsorted(future_date)
    if idx == 0:
        return repo_history.iloc[0]['Stars'] if future_date == repo_history.index[0] else None

    before = repo_history.iloc[idx - 1]
    after = repo_history.iloc[idx]

    time_diff = (after.name - before.name).total_seconds()
    before_weight = (after.name - future_date).total_seconds() / time_diff
    after_weight = (future_date - before.name).total_seconds() / time_diff

    return int(round(before['Stars'] * before_weight + after['Stars'] * after_weight))

# Add the current star count and future star counts for 1, 3, 6, 9, and 12 months
for months in tqdm([1, 3, 6, 12, 24]):
    data[f'future_stars_{months}_month'] = data.apply(lambda row: find_future_star_count(row, months), axis=1)

# now also add the growth rate for each of the future star counts
for months in [1, 3, 6, 12, 24]:
    data[f'future_stars_{months}_month_growth_rate'] = (data[f'future_stars_{months}_month'] - data['interpolated_stars']) / data['interpolated_stars']
    # normalize the growth rate to annualized growth rate
    data[f'future_stars_{months}_month_growth_rate'] = data[f'future_stars_{months}_month_growth_rate'] * 12 / months

data.to_csv('github_comments_with_future_stars.csv', index=False)
data.rename(columns={'interpolated_stars': 'stars'}, inplace=True)
data

100%|██████████| 5/5 [01:47<00:00, 21.44s/it]


Unnamed: 0.1,Unnamed: 0,timestamp,type,status,text,issue_id,repo,interpolated_stars,category,purpose,...,future_stars_6_month,future_stars_12_month,future_stars_24_month,star_count,year,future_stars_1_month_growth_rate,future_stars_3_month_growth_rate,future_stars_6_month_growth_rate,future_stars_12_month_growth_rate,future_stars_24_month_growth_rate
29,29,2013-09-16,code_comment,,Compute the audio if any,,Zulko/moviepy,48,code,explanation,...,444.0,848.0,1593.0,48,2013,16.250000,16.583333,16.500000,16.666667,16.093750
30,30,2021-01-18,code_comment,,TRY USING SCIPY AS RESIZER,,Zulko/moviepy,7013,code,explanation,...,7695.0,8713.0,9897.0,7013,2021,0.198489,0.193355,0.194496,0.242407,0.205618
31,31,2021-05-27,code_comment,,Tool tests meant to be run with pytest.\n\nDem...,,Zulko/moviepy,7502,code,explanation,...,8554.0,9107.0,10387.0,7502,2021,0.182351,0.250600,0.280459,0.213943,0.192282
32,32,2013-09-17,code_comment,,~ FOURCC = cv2.VideoWriter_fourcc,,Zulko/moviepy,51,code,explanation,...,447.0,850.0,1594.0,51,2013,15.058824,15.529412,15.529412,15.666667,15.127451
33,33,2014-02-22,code_comment,,"Similar to VideoClip.fl_image, but for ImageCl...",,Zulko/moviepy,398,code,explanation,...,793.0,1197.0,1810.0,398,2014,1.809045,1.939698,1.984925,2.007538,1.773869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27310,27310,2018-07-31,issue_comment,open,@isabekyan \r\nThe purpose is not really relev...,3.349042e+08,zzzprojects/System.Linq.Dynamic.Core,256,issue,discussion,...,363.0,474.0,690.0,256,2018,0.796875,0.812500,0.835938,0.851562,0.847656
27311,27311,2023-08-27,issue_comment,open,@StefH Addressed your comment and added test c...,1.851548e+09,zzzprojects/System.Linq.Dynamic.Core,1381,issue,discussion,...,1474.0,,,1381,2023,0.139030,0.136133,0.134685,,
27312,27312,2020-08-31,issue_comment,open,I'm not exactly sure but this answer might hel...,6.846438e+08,zzzprojects/System.Linq.Dynamic.Core,706,issue,discussion,...,815.0,934.0,1170.0,706,2020,0.254958,0.260623,0.308782,0.322946,0.328612
27313,27313,2021-08-03,issue_comment,open,We will wait for the 6.0 release for a complet...,9.572880e+08,zzzprojects/System.Linq.Dynamic.Core,916,issue,discussion,...,1039.0,1152.0,1368.0,916,2021,0.262009,0.266376,0.268559,0.257642,0.246725


In [61]:
import numpy as np

future_amount = 12
data_testing = data.dropna(subset=[f'future_stars_{future_amount}_month'])
data_testing[f'growth_rate_{future_amount}_month'] = np.log1p(data_testing[f'future_stars_{future_amount}_month'] / (data_testing['star_count'] + 1))
data_testing = data_testing.dropna(subset=[f'growth_rate_{future_amount}_month'])
# data_testing = data_testing[data_testing['star_count'] < 1000]
data_testing = data_testing[data_testing['type'].str.contains('issue')]
data_testing = data_testing[data_testing[f'growth_rate_{future_amount}_month'] < 2]

X = pd.get_dummies(data_testing[['sentiment_class']], drop_first=True).astype(int)
# X = X.drop(['purpose_explanation'], axis=1)
y = data_testing[f'growth_rate_{future_amount}_month']

# Function to run regression analysis
import statsmodels.api as sm
def run_regression_analysis(X, y):
    results = {}
    # Add constant to the model for the intercept
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()

    # Store results
    results = model.summary()
    return results

data_testing[f'growth_rate_{future_amount}_month']
run_regression_analysis(X,y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_testing[f'growth_rate_{future_amount}_month'] = np.log1p(data_testing[f'future_stars_{future_amount}_month'] / (data_testing['star_count'] + 1))


0,1,2,3
Dep. Variable:,growth_rate_12_month,R-squared:,0.002
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,11.81
Date:,"Thu, 11 Apr 2024",Prob (F-statistic):,0.000592
Time:,14:04:38,Log-Likelihood:,5346.5
No. Observations:,6204,AIC:,-10690.0
Df Residuals:,6202,BIC:,-10680.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.8052,0.001,561.568,0.000,0.802,0.808
sentiment_class_POSITIVE,-0.0116,0.003,-3.437,0.001,-0.018,-0.005

0,1,2,3
Omnibus:,4557.678,Durbin-Watson:,0.672
Prob(Omnibus):,0.0,Jarque-Bera (JB):,78276.252
Skew:,3.427,Prob(JB):,0.0
Kurtosis:,18.995,Cond. No.,2.7


In [59]:
# Convert 'timestamp' to datetime and extract year and month for grouping
# rename interpolated_stars to star_count
data['star_count'] = data['interpolated_stars']
# replace every instance of 'issue_body' with 'issue_comment' in the 'type' column
data['type'] = data['type'].str.replace('issue_body', 'issue_comment')
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['year'] = data['timestamp'].dt.to_period('Y') # weekly is: 'W', yearly is 'Y'
# only keep between 2010 and 2023 inclusive
data = data[(data['timestamp'].dt.year >= 2010) & (data['timestamp'].dt.year <= 2023)]

# Define the function to calculate the ratios and counts
def calculate_metrics(group):
    metrics = {}

    # Positivity ratio
    # first make sure star_count is sorted
    group = group.sort_values('timestamp')
    ending_star_count = max(group['star_count'])
    positive_count = group[group['sentiment_class'] == 'POSITIVE'].shape[0]
    total_count = group.shape[0]
    metrics['positivity_ratio'] = positive_count / total_count if total_count else 0
    metrics['star_count_end'] = ending_star_count
    metrics['star_count_start'] = min(group['star_count'])
    metrics['star_growth_rate'] = ((ending_star_count / metrics['star_count_start']) - 1) if metrics['star_count_start'] else 0

    # Counts for specific types and purposes
    for type_name in group['type'].unique():
        type_group = group[group['type'] == type_name]

        # Total count for this type
        metrics[f"{type_name}_count"] = type_group.shape[0]

        # Ratio calculation for this type
        if metrics[f"{type_name}_count"]:
            for purpose in type_group['purpose'].unique():
                purpose_count = type_group[type_group['purpose'] == purpose].shape[0]
                metrics[f"{type_name}_{purpose}_count"] = purpose_count
                metrics[f"{type_name}_{purpose}_ratio"] = purpose_count / metrics[f"{type_name}_count"]

    return pd.Series(metrics)

# Group by 'year_month' and 'repo', and apply the metrics calculation
result = data.groupby(['year', 'repo']).apply(calculate_metrics).reset_index()

# Pivot the result to have each metric as a column
wide_result = result.pivot(index=['year', 'repo'], columns='level_2', values=0)

# Identify count columns and calculate the percentage change
count_columns = [col for col in wide_result.columns if col.endswith('_count')]
growth_rates = wide_result[count_columns].groupby('repo').pct_change()

growth_rates = growth_rates.add_suffix('_growth_rate')

# drop columns that end with _count so the results are repo size agnostic
wide_result = wide_result.drop(columns=count_columns)

# Combine the original data with the growth rates
# Ensure to exclude original count columns as we want to rename them
combined_result = pd.concat([wide_result, growth_rates], axis=1)

# Melt the combined DataFrame back to the original format
final_result = combined_result.reset_index().melt(id_vars=['year', 'repo'], var_name='level_2', value_name='value')
result = final_result

result[result['level_2'].str.contains('_growth_rate')]

  result = data.groupby(['year', 'repo']).apply(calculate_metrics).reset_index()
  growth_rates = wide_result[count_columns].groupby('repo').pct_change()


Unnamed: 0,year,repo,level_2,value
1764,2010,django/django,star_growth_rate,0.000000
1765,2010,pallets/flask,star_growth_rate,0.000000
1766,2010,rails/rails,star_growth_rate,0.383207
1767,2011,django/django,star_growth_rate,0.000000
1768,2011,pallets/flask,star_growth_rate,0.970290
...,...,...,...,...
3523,2023,spacejam/sled,issue_comment_solution_count_growth_rate,0.000000
3524,2023,twbs/bootstrap,issue_comment_solution_count_growth_rate,0.571429
3525,2023,vuejs/core,issue_comment_solution_count_growth_rate,0.238095
3526,2023,withastro/astro,issue_comment_solution_count_growth_rate,


In [60]:
result_lr = result.pivot(index=['year', 'repo'], columns='level_2').reset_index()
result_lr = result_lr.droplevel(0, axis=1)
result_lr.to_csv('yearly_repo_metrics.csv', index=False)
result_lr

level_2,Unnamed: 1,Unnamed: 2,code_comment_count_growth_rate,code_comment_deprecated_count_growth_rate,code_comment_deprecated_ratio,code_comment_explanation_count_growth_rate,code_comment_explanation_ratio,code_comment_future work_count_growth_rate,code_comment_future work_ratio,issue_comment_bug report_count_growth_rate,...,issue_comment_feature request_count_growth_rate,issue_comment_feature request_ratio,issue_comment_question_count_growth_rate,issue_comment_question_ratio,issue_comment_solution_count_growth_rate,issue_comment_solution_ratio,positivity_ratio,star_count_end,star_count_start,star_growth_rate
0,2010,django/django,,,0.060606,,0.888889,,0.050505,,...,,,,,,,0.151515,0.0,0.0,0.000000
1,2010,pallets/flask,,,,,0.951049,,0.048951,,...,,,,,,,0.167832,914.0,0.0,0.000000
2,2010,rails/rails,,,0.071429,,0.910714,,0.017857,,...,,,,,,,0.160714,8764.0,6336.0,0.383207
3,2011,django/django,-0.424242,0.000000,,-0.375000,0.964912,-0.600000,0.035088,,...,,,,,,,0.157895,0.0,0.0,0.000000
4,2011,pallets/flask,-0.503497,,0.042254,-0.536765,0.887324,-0.285714,0.070423,,...,,,,,,,0.154930,2719.0,1380.0,0.970290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,2023,spacejam/sled,0.000000,,,0.000000,,,,0.000000,...,-0.666667,0.040000,0.727273,0.760000,0.000000,0.160000,0.200000,7463.0,6519.0,0.144807
143,2023,twbs/bootstrap,-0.800000,-0.500000,0.100000,-0.818182,0.800000,-0.750000,0.100000,6.000000,...,0.022727,0.120321,0.172840,0.508021,0.571429,0.088235,0.250000,164185.0,152755.0,0.074826
144,2023,vuejs/core,7.857143,4.000000,0.080645,11.000000,0.774194,3.500000,0.145161,0.157895,...,0.897436,0.112462,0.770833,0.516717,0.238095,0.079027,0.216667,42105.0,33698.0,0.249481
145,2023,withastro/astro,-0.969231,-0.980392,0.025000,-0.969271,0.875000,-0.963636,0.100000,,...,,0.039216,52.000000,0.519608,,0.107843,0.154930,37895.0,23520.0,0.611182


In [6]:
# pivot wider the table to have the 'positivity_ratio' as the independent variable
# (all the columns are in the 'level_2' column)
result_lr = result.pivot(index=['year', 'repo'], columns='level_2').reset_index()

# make it a flat table
result_lr = result_lr.droplevel(0, axis=1)
result_lr.fillna(0, inplace=True)
result_lr = result_lr[result_lr['star_count_start'] < 1000]
result_lr = result_lr[result_lr['star_count_end'] > 100]
raw_X = result_lr.drop(['star_count_end', 'star_count_start', 'star_growth_rate'], axis=1)
# numeric columns
raw_X = raw_X.select_dtypes(include=['float64', 'int64'])
columns_to_keep_suffix = "count"
type_to_keep = "code" # or "code" or "both"
if type_to_keep != "both":
    raw_X = raw_X.filter(like=type_to_keep)
# for counts, take log
if columns_to_keep_suffix == "count":
    raw_X = np.log1p(raw_X)
if columns_to_keep_suffix != 'both':
    raw_X = raw_X.filter(like=columns_to_keep_suffix)
# if type_to_keep == "issue":
#     raw_X = raw_X.drop([f'issue_comment_{columns_to_keep_suffix}', f'issue_body_{columns_to_keep_suffix}'], axis=1) # multicollinearity
# elif type_to_keep == "code":
#     raw_X = raw_X.drop([f'code_comment_{columns_to_keep_suffix}'], axis=1) # multicollinearity
# else:
#     # drop both
#     raw_X = raw_X.drop([f'code_comment_{columns_to_keep_suffix}', f'issue_comment_{columns_to_keep_suffix}', f'issue_body_{columns_to_keep_suffix}'], axis=1)
# do linear regression to see if there is a relationship between the positivity ratio and next month's star count
X = pd.get_dummies(raw_X, drop_first=True).astype(float)
X = X.filter(like="future")
y = result_lr['star_growth_rate']

run_regression_analysis(X, y)

NameError: name 'np' is not defined