In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; the dataset CSV is later read
# from a path underneath this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

# Read the CDA dataset from the mounted Drive and print its first rows as a
# quick sanity check of the columns that were loaded.
csv_path = '/content/drive/MyDrive/df_logged_CDA.csv'
df_logged_CDA = pd.read_csv(csv_path)

head_preview = df_logged_CDA.head()
print(head_preview)

   is_sustaining  total_forks_count  contributed_back_forks_count  \
0              1          10.159253                     11.319972   
1              1          10.049404                      9.821084   
2              1           9.811263                      9.019785   
3              1           9.567105                     10.596385   
4              1           9.533438                     10.417089   

   hard_forks_count  merged_commits_count  not_merged_commits_count  \
0         10.641417             10.121377                 10.896276   
1          8.933400              9.194516                  8.981807   
2          8.151910              8.971829                  5.673323   
3          9.851510             10.106632                  9.537195   
4         10.108263              9.604070                  9.687816   

   not_contributed_back_commits_count  
0                            9.115480  
1                            8.458716  
2                            6.617403 

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score

In [4]:
# Train/test split: "is_sustaining" is the target, every other column is a feature.
target_column = "is_sustaining"
y = df_logged_CDA[target_column]
X = df_logged_CDA.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Multivariate linear regression on the training split. The continuous
# predictions are scored with MSE / R² and then thresholded into class labels
# so they can also be scored with accuracy like the classifiers in later cells.
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

# Take the feature names from X_train, the frame the model was actually fitted
# on, so names and coefficients stay aligned even if X is modified by another
# cell before this one is (re-)run.
print("\nFeature Coefficients:")
for feature, coef in zip(X_train.columns, model.coef_):
    print(f"{feature}: {coef}")

# Predictions at or above the threshold are labelled 1, everything else 0.
decision_threshold = 0.5
y_pred_binary = (y_pred >= decision_threshold).astype(int)

accuracy = accuracy_score(y_test, y_pred_binary)

print(f"Accuracy (after converting to binary): {accuracy:.4f}")

Mean Squared Error (MSE): 0.15303392574695987
R-squared (R²): 0.2983707731835291

Feature Coefficients:
total_forks_count: 0.11444071841901433
contributed_back_forks_count: 0.09012512964668627
hard_forks_count: 0.016841164969210322
merged_commits_count: -0.041154703449409184
not_merged_commits_count: -0.026334374476721233
not_contributed_back_commits_count: -0.02391612555061519
Accuracy (after converting to binary): 0.7679


In [5]:
import statsmodels.api as sm

# OLS fit on the full dataset (X, y) rather than the train split, used for the
# statsmodels summary table (coefficients, standard errors, p-values).
#
# The intercept-augmented design matrix gets its own name so the feature
# matrix X shared with the other cells is not replaced by a version that
# carries an extra constant column.
X_with_const = sm.add_constant(X)

# Likewise, keep the fitted results under a dedicated name instead of
# overwriting `model`, which holds the scikit-learn LinearRegression estimator.
ols_results = sm.OLS(y, X_with_const).fit()

print(ols_results.summary())

                            OLS Regression Results                            
Dep. Variable:          is_sustaining   R-squared:                       0.346
Model:                            OLS   Adj. R-squared:                  0.332
Method:                 Least Squares   F-statistic:                     24.11
Date:                Fri, 07 Mar 2025   Prob (F-statistic):           7.32e-23
Time:                        07:39:00   Log-Likelihood:                -135.19
No. Observations:                 280   AIC:                             284.4
Df Residuals:                     273   BIC:                             309.8
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons

In [6]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel, fitted on the training split.
# fit() returns the estimator itself, so construction and training are chained.
svm_model = SVC(kernel='linear').fit(X_train, y_train)

# Predict labels for the held-out rows and score them against the true labels.
y_pred_svm = svm_model.predict(X_test)
accuracy_SVM = accuracy_score(y_test, y_pred_svm)

print(f"SVM Accuracy: {accuracy_SVM}")

SVM Accuracy: 0.7678571428571429


In [7]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
# Random forest classifier: 100 trees, fixed seed for reproducible results.
rf_settings = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

# Score the predicted labels for the held-out split against the true labels.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf}")

Random Forest Accuracy: 0.7678571428571429


In [8]:
# Gradient-boosted trees (XGBoost) with the binary logistic objective and a
# fixed seed, fitted on the training split.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predict labels for the held-out rows, then compare them with the true labels.
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"XGBoost Accuracy: {accuracy_xgb}")

XGBoost Accuracy: 0.7678571428571429
