In [None]:
import pandas as pd
import numpy as np
from fractions import Fraction
import statsmodels.api as sm

# Import Data

In [None]:
# Read the commit log and discard every row that has at least one missing value.
commits = pd.read_csv("../inputs-outputs/commits.csv").dropna(axis=0)
commits

In [None]:
# Read the metrics table and discard every row that has at least one missing value.
metrics = pd.read_csv("../inputs-outputs/metrics.csv").dropna(axis=0)
metrics

In [None]:
# Read the repository list; its "Source" column is used further down to strip
# the per-project path prefix from the metrics data.
repo_list_csv = "../inputs-outputs/models/repo_list.csv"
projects = pd.read_csv(repo_list_csv)
projects

# Preprocess Data

Let's keep only the commits that fix issues and count how often each file appears in them.

In [None]:
# Keep the commits whose subject mentions a fix. The match is case-insensitive
# so that subjects such as "Fix ..." or "FIXED ..." are counted as well; a
# case-sensitive test would only catch the lowercase spelling.
# na=False treats a missing subject as "not a fix" instead of letting NaN leak
# into the boolean mask.
is_fix = commits["Subject"].str.contains("fix", case=False, na=False)
fix_commits = commits[is_fix]

# Count how many fix-commit rows reference each file. The resulting frame is
# indexed by the file path and has a single "FixCount" column.
fix_count = fix_commits["PathFile"].value_counts().rename("FixCount").to_frame()
fix_count

Let's modify the paths in the metrics data so that they match the paths in the commit data.

In [None]:
# Strip each project's source directory (plus the trailing "/") from the start
# of the metric paths so they can be matched against the commit paths.
relative_paths = metrics["PathFile"]
for source_root in projects["Source"]:
    relative_paths = relative_paths.str.removeprefix(source_root + '/')
metrics["PathFile"] = relative_paths

metrics

Let's make sure every file maps to a single class by dropping all files that contain more than one class. The `PathFile` column can then be used as a unique key.

In [None]:
# Keep only the files that occur exactly once in the metrics (one row per
# class), i.e. drop *every* row of a file that holds more than one class.
# `duplicated()` with its default keep="first" would leave the first class of
# such a file in the data, which contradicts the stated intent of dropping
# multi-class files, hence keep=False.
has_several_classes = metrics["PathFile"].duplicated(keep=False)

# `.copy()` detaches the result from the original frame so that later column
# assignments on `metrics` do not trigger SettingWithCopyWarning.
metrics = metrics[~has_several_classes].copy()
metrics

Let's convert the `LCOM` column to float.

In [None]:
# LCOM is stored as text, presumably a fraction wrapped in parentheses such as
# "(1/3)": strip the parentheses, parse with Fraction, then convert to float.
# `astype(str)` keeps the cell re-runnable: once the column is already float,
# its values are turned back into text that Fraction parses to the same number.
lcom_as_float = (
    metrics["LCOM"]
    .astype(str)
    .str.strip("()")
    .apply(Fraction)
    .astype("float")
)

# assign() returns a new frame instead of writing into `metrics`, which may be
# a filtered slice of an earlier frame (avoids SettingWithCopyWarning).
metrics = metrics.assign(LCOM=lcom_as_float)

# Only the last expression of a cell is rendered, so the dtypes have to be
# displayed explicitly (`display` is provided by the IPython kernel).
display(metrics.dtypes)

metrics["LCOM"].value_counts()

## Join the datasets

Let's associate each class with the number of times it has been fixed.

In [None]:
# Left-join the per-file fix counts onto the metrics, matching the metrics'
# "PathFile" column against the index of `fix_count`. Files that never appear
# in a fix commit have no match and get a count of 0.
joined_data = (
    metrics
    .join(fix_count, on="PathFile")
    .assign(FixCount=lambda frame: frame["FixCount"].fillna(0))
)
joined_data

## Create the Dataset
Let's define the independent variables (features) as `X` and the dependent variable (target) as `y`, where `0` denotes that no bug has been found and `1` denotes that at least one bug has been found.

In [None]:
# Feature matrix: the class-level metrics used as predictors.
feature_columns = ["Attributes", "Methods", "LOC", "NOC", "fanOut", "WMC", "LCOM"]
X = joined_data[feature_columns]

# Binary target: 1 if the file has at least one fix commit, 0 otherwise.
y = joined_data["FixCount"].gt(0).astype(int)

None of the independent variables (features) seems to follow a normal distribution (shown below). We must either normalize them or use a model that does not assume a normal distribution.

In [None]:
# One histogram per feature; the trailing ";" hides the array-of-Axes repr so
# that only the figure is rendered.
X.hist(figsize=(7, 7), bins=20);

Next, let's verify the assumption that the independent variables (features) are not strongly correlated with each other.

In [None]:
# Pairwise Spearman rank correlation between the features.
spearman_corr = X.corr(method="spearman")
spearman_corr

The `Attributes` and `NOC` features have a correlation above 0.75. We will drop the `NOC` feature so that the independent variables (features) are not strongly correlated with one another.

In [None]:
# Re-bind X to a new frame instead of dropping in place: X is a column subset
# of `joined_data`, and mutating it in place can trigger
# SettingWithCopyWarning. errors="ignore" keeps the cell re-runnable once
# "NOC" has already been removed.
X = X.drop(columns="NOC", errors="ignore")

# Model
We will use a logistic regression model for the following reasons:
- the independent variables (features) aren't normally distributed, and
- the dependent variable (target) is binary.

In [None]:
# statsmodels does not add an intercept on its own; without one the model is
# forced through the origin (predicted probability 0.5 when every feature is
# 0), which biases the coefficients. Prepend a constant column explicitly.
X_design = sm.add_constant(X)
logit_model = sm.Logit(y, X_design).fit()

logit_model.summary()