# Import Basics and Read Data

In [6]:
import pandas as pd
import numpy as np

data = pd.read_csv("bug_prediction_data.csv")

# Prepare Data

In [7]:
list(data.columns.values)

['classname',
 'numberOfBugsFoundUntil.',
 'numberOfNonTrivialBugsFoundUntil.',
 'numberOfMajorBugsFoundUntil.',
 'numberOfCriticalBugsFoundUntil.',
 'numberOfHighPriorityBugsFoundUntil.',
 'bugs.x',
 'numberOfVersionsUntil.',
 'numberOfFixesUntil.',
 'numberOfRefactoringsUntil.',
 'numberOfAuthorsUntil.',
 'linesAddedUntil.',
 'maxLinesAddedUntil.',
 'avgLinesAddedUntil.',
 'linesRemovedUntil.',
 'maxLinesRemovedUntil.',
 'avgLinesRemovedUntil.',
 'codeChurnUntil.',
 'maxCodeChurnUntil.',
 'avgCodeChurnUntil.',
 'ageWithRespectTo.',
 'weightedAgeWithRespectTo.',
 'CvsEntropy',
 'CvsWEntropy',
 'CvsLinEntropy',
 'CvsLogEntropy',
 'CvsExpEntropy']

In [8]:
training_set = data.sample(frac=0.8, random_state = 1)
test_set = data.loc[~data.index.isin(training_set.index)]

data_columns = ['linesAddedUntil.', 'maxLinesAddedUntil.', 'avgLinesAddedUntil.', 'linesRemovedUntil.', 'maxLinesRemovedUntil.',
                'avgLinesRemovedUntil.', 'codeChurnUntil.', 'maxCodeChurnUntil.', 'avgCodeChurnUntil.', 'ageWithRespectTo.',
                'weightedAgeWithRespectTo.', 'CvsEntropy', 'CvsWEntropy', 'CvsLinEntropy', 'CvsLogEntropy', 'CvsExpEntropy']


training_data = training_set.as_matrix(columns = data_columns)
nan_locs = np.isnan(training_data)
training_data[nan_locs] = 0

training_target = training_set['numberOfBugsFoundUntil.'].values
nan_locs = np.isnan(training_target)
training_target[nan_locs] = 0

test_data = test_set.as_matrix(columns = data_columns)
nan_locs = np.isnan(test_data)
test_data[nan_locs] = 0

test_target = test_set['numberOfBugsFoundUntil.'].values
nan_locs = np.isnan(test_target)
test_target[nan_locs] = 0

# Regression

## Multi-layer Perceptron

In [9]:
from sklearn.neural_network import MLPRegressor
reg = MLPRegressor()
reg.fit(training_data, training_target)

print('R squared: ' + str(reg.score(training_data, training_target)))

R squared: 0.640988326203


## Support vector Regression

In [10]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(training_data, training_target)

print('R squared: ' + str(svr.score(training_data, training_target)))

R squared: -0.0494088797154


## Kernel Ridge Regression

In [11]:
from sklearn.kernel_ridge import KernelRidge
kr = KernelRidge()
kr.fit(training_data, training_target)

print('R squared: ' + str(kr.score(training_data, training_target)))

R squared: 0.890861672907


## Decision Tree Regression

In [12]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(training_data, training_target)

print('R squared: ' + str(dt.score(training_data, training_target)))

R squared: 1.0


## Random Forest Regression

In [13]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 100)
rf.fit(training_data, training_target)

print('R squared: ' + str(rf.score(training_data, training_target)))

R squared: 0.979818694663


## Gradient Boosting Regression

In [14]:
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(training_data, training_target)

print('R squared: ' + str(gb.score(training_data, training_target)))

R squared: 0.989124091456


## Polynomial Regression

In [15]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

pr = Pipeline([('poly', PolynomialFeatures(degree = 3)),
              ('linear', LinearRegression(fit_intercept = False))])

pr.fit(training_data, training_target)

print('R squared: ' + str(pr.score(training_data, training_target)))

R squared: 0.997819140862


## Lasso Regression

In [16]:
from sklearn.linear_model import Lasso
lm = Lasso()
lm.fit(training_data, training_target)

print('R squared: ' + str(lm.score(training_data, training_target)))

R squared: 0.8874959113


## Elastic Net

In [17]:
from sklearn.linear_model import ElasticNet
en = ElasticNet()
en.fit(training_data, training_target)

print('R squared: ' + str(en.score(training_data, training_target)))

R squared: 0.887693033752


## Least Angle Regression

In [18]:
from sklearn.linear_model import LassoLars
ll = LassoLars()
ll.fit(training_data, training_target)

print('R squared: ' + str(ll.score(training_data, training_target)))

R squared: 0.0


## Bayesian Regression

In [19]:
from sklearn.linear_model import BayesianRidge
br = BayesianRidge()
br.fit(training_data, training_target)

print('R squared: ' + str(br.score(training_data, training_target)))

R squared: 0.887922796977


## Stochastic Gradient Descent

In [20]:
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor()
sgd.fit(training_data, training_target)

print('R squared: ' + str(sgd.score(training_data, training_target)))

R squared: -3.769246513e+29


