In [1]:
import pandas as pd

# Load the dataset from its CSV file and print the first five rows as a preview.
df_logged_CDA = pd.read_csv(filepath_or_buffer='data/df_logged_CDA.csv')
print(df_logged_CDA.head(n=5))

   is_sustaining  total_forks_count  avg_annual_forks_growth_rate  \
0              1          10.159253                     -0.096791   
1              1          10.049404                      0.018223   
2              1           9.811263                     -0.046718   
3              1           9.567105                      0.294218   
4              1           9.533438                      2.041317   

   contributed_back_forks_ratio  hard_forks_ratio  avg_merged_commits_count  \
0                      0.069179          0.017083                  2.587723   
1                      0.030346          0.009633                  2.157486   
2                      0.013832          0.004486                  2.953515   
3                      0.179272          0.026115                  2.257632   
4                      0.078231          0.033950                  2.546229   

   avg_not_merged_commits_count  avg_not_contributed_back_commits_count  \
0                      1.903720    

In [3]:
# Tally how many rows carry each value of the `is_sustaining` column.
print(
    df_logged_CDA
    .loc[:, 'is_sustaining']
    .value_counts()
)

is_sustaining
1    174
0    106
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report
import statsmodels.api as sm

In [7]:
# Train/test split: `is_sustaining` is the target, every other column is a feature.
y = df_logged_CDA["is_sustaining"]
X = df_logged_CDA.drop("is_sustaining", axis=1)

# Hold out 20% of the rows; the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the per-class row counts of the held-out target.
y_test.value_counts()

is_sustaining
1    38
0    18
Name: count, dtype: int64

In [9]:
from sklearn.linear_model import LogisticRegression

# Fit scikit-learn's logistic regression (default settings) on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out split and score them against the true labels.
y_pred = model.predict(X_test)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# NOTE(review): class_report is computed here but not displayed in this cell.

print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8571


In [11]:
# statsmodels does not add an intercept on its own, so prepend a constant column.
X_train_const = sm.add_constant(X_train)

# Fit a maximum-likelihood logit on the training split to obtain coefficient-level
# statistics (std errors, z-scores, p-values).
# NOTE(review): sklearn's LogisticRegression applies L2 regularization by default,
# while sm.Logit().fit() is unpenalized, so the two fits are not the same model.
model_sm = sm.Logit(endog=y_train, exog=X_train_const).fit()

print(model_sm.summary())

Optimization terminated successfully.
         Current function value: 0.328011
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:          is_sustaining   No. Observations:                  224
Model:                          Logit   Df Residuals:                      213
Method:                           MLE   Df Model:                           10
Date:                Mon, 17 Mar 2025   Pseudo R-squ.:                  0.5104
Time:                        19:19:27   Log-Likelihood:                -73.474
converged:                       True   LL-Null:                       -150.08
Covariance Type:            nonrobust   LLR p-value:                 8.120e-28
                                                          coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------------------
const                     

In [13]:
# Compare it to a random baseline model
import numpy as np

# Seed the legacy global NumPy RNG so the baseline guesses are reproducible.
np.random.seed(42)

# Draw one uniformly random 0/1 guess per test row. The size is taken from y_test
# instead of a hard-coded 56, so the baseline stays aligned with the hold-out set
# if the split ratio or the dataset changes (accuracy_score raises on mismatched
# lengths).
random_preds = np.random.choice([0, 1], size=len(y_test), p=[0.5, 0.5])

# Accuracy of the random guesses against the true test labels.
random_accuracy = accuracy_score(y_test, random_preds)
print(f"Random Baseline Accuracy: {random_accuracy:.4f}")

Random Baseline Accuracy: 0.6071
