In [1]:
import pandas as pd

# Load the dataset from a path relative to the notebook's working directory.
df_logged_CDA = pd.read_csv(filepath_or_buffer='data/df_logged_CDA.csv')

# Plain-text preview of the first five rows (five is also head()'s default).
print(df_logged_CDA.head(n=5))

   is_sustaining  total_forks_count  avg_annual_forks_growth_rate  \
0              1          10.159253                     -0.096791   
1              1          10.049404                      0.018223   
2              1           9.811263                     -0.046718   
3              1           9.567105                      0.294218   
4              1           9.533438                      2.041317   

   contributed_back_forks_count  hard_forks_count  merged_commits_count  \
0                      7.523481          6.100319             10.032540   
1                      6.570883          5.416100              8.604288   
2                      5.541264          4.418841              8.437500   
3                      7.939515          5.937536             10.086351   
4                      7.025538          6.169611              9.489335   

   not_merged_commits_count  not_contributed_back_commits_count  \
0                  9.265397                            9.115480   


In [2]:
# Tally how many rows fall into each value of the `is_sustaining` target column.
print(
    df_logged_CDA
    .loc[:, 'is_sustaining']
    .value_counts()
)

is_sustaining
1    174
0    106
Name: count, dtype: int64


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report
import statsmodels.api as sm

In [4]:
# Train/test split: separate the target column from the predictors, then
# hold out 20% of the rows for evaluation.
y = df_logged_CDA["is_sustaining"]
X = df_logged_CDA.drop("is_sustaining", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the same rows are held out on every run
)

# Last expression of the cell: displays the class counts of the held-out set.
y_test.value_counts()

is_sustaining
1    38
0    18
Name: count, dtype: int64

In [5]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with scikit-learn's default settings;
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and score them against their true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# The per-class report is computed here but not printed in this cell.
class_report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8571


In [6]:
# Prepend an explicit constant column to the training predictors so the
# statsmodels fit includes an intercept term.
X_train_const = sm.add_constant(X_train, prepend=True)

# Maximum-likelihood logit fit on the training rows only.
model_sm = sm.Logit(endog=y_train, exog=X_train_const).fit()

# Full coefficient table with standard errors, z-scores and p-values.
print(model_sm.summary())

Optimization terminated successfully.
         Current function value: 0.337945
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:          is_sustaining   No. Observations:                  224
Model:                          Logit   Df Residuals:                      213
Method:                           MLE   Df Model:                           10
Date:                Sat, 15 Mar 2025   Pseudo R-squ.:                  0.4956
Time:                        11:38:11   Log-Likelihood:                -75.700
converged:                       True   LL-Null:                       -150.08
Covariance Type:            nonrobust   LLR p-value:                 6.691e-27
                                                          coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------------------
const                     

In [7]:
# Compare it to a random baseline model
import numpy as np

# Seed NumPy's legacy global RNG so the baseline is reproducible across runs.
np.random.seed(42)

# One fair coin-flip prediction per test row. The size is derived from y_test
# instead of being hard-coded, so the baseline stays aligned with the split
# if test_size or the dataset ever changes (accuracy_score requires equal
# lengths).
random_preds = np.random.choice([0, 1], size=len(y_test), p=[0.5, 0.5])

# NOTE(review): a 50/50 coin flip is a weak reference when the classes are
# imbalanced; consider also reporting a majority-class baseline.
random_accuracy = accuracy_score(y_test, random_preds)
print(f"Random Baseline Accuracy: {random_accuracy:.4f}")

Random Baseline Accuracy: 0.6071
