In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import json

In [None]:
# READ DATA
# Every field of the Mozilla dump lives in its own JSON file. The files are
# read through a helper that closes the handle as soon as parsing is done;
# previously each open() handle stayed open for the lifetime of the kernel.
def _load_json(path):
    """Parse the JSON file at ``path`` and return the resulting object.

    The file is opened as UTF-8 text (the standard JSON encoding) inside a
    ``with`` block, so it is closed even if parsing raises.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


data_assigned = _load_json('datasets/v02/mozilla/assigned_to.json')
data_status = _load_json('datasets/v02/mozilla/bug_status.json')
data_component = _load_json('datasets/v02/mozilla/component.json')
data_os = _load_json('datasets/v02/mozilla/op_sys.json')
data_priority = _load_json('datasets/v02/mozilla/priority.json')
data_product = _load_json('datasets/v02/mozilla/product.json')
data_resolution = _load_json('datasets/v02/mozilla/resolution.json')
data_severity = _load_json('datasets/v02/mozilla/severity.json')
data_version = _load_json('datasets/v02/mozilla/version.json')

# Not loaded (were already disabled): cc.json, reports.json, short_desc.json
#f_cc = open('datasets/v02/mozilla/cc.json','r')
#f_reports = open('datasets/v02/mozilla/reports.json','r')
#f_shortdesc = open('datasets/v02/mozilla/short_desc.json','r')



In [None]:
# Lists that will hold the collected information, one list per field.
# The generator yields a fresh list for every name, so none of them are aliases.
keys, bugstatus, component, os, priority, product, resolution, severity, version = (
    [] for _ in range(9)
)

In [None]:
# Collect the ids of all bugs that have severity information.
# The list is rebuilt rather than appended to, so running this cell a second
# time does not duplicate every key.
keys = list(data_severity['severity'])
print(len(keys))

In [None]:
# For every collected key, keep the 'what' value of the first entry of each field.
# Each list is rebuilt from scratch, so re-running the cell cannot append the
# same values twice and leave the lists longer than `keys`.
def _first_values(entries_by_key):
    """Return the 'what' value of the first entry for each id in ``keys``, in order."""
    return [entries_by_key[key][0]['what'] for key in keys]


component = _first_values(data_component['component'])
os = _first_values(data_os['op_sys'])
priority = _first_values(data_priority['priority'])
product = _first_values(data_product['product'])
resolution = _first_values(data_resolution['resolution'])
severity = _first_values(data_severity['severity'])
version = _first_values(data_version['version'])
# NOTE: `bugstatus` is not filled here; this cell does not read `data_status`.



In [None]:
# Sanity check: prints True only when all seven feature lists have the same length.
feature_lists = (version, severity, resolution, product, priority, os, component)
print(len({len(values) for values in feature_lists}) == 1)

In [None]:
# Assemble the collected lists into a single DataFrame, one column per field.
# Column order follows the keyword order below.
data = pd.DataFrame(dict(
    version=version,
    resolution=resolution,
    product=product,
    priority=priority,
    os=os,
    component=component,
    severity=severity,
))
data.head()

In [None]:
# The 'resolution' column carries no useful information, so leave it out.
data = data.drop(columns=['resolution'])

In [None]:
# Remove every row in which at least one column holds an empty string ('').
has_empty_value = data.isin(['']).any(axis=1)
index = data.loc[has_empty_value].index
print(index)
data.drop(index=index, inplace=True)

In [None]:
# Transform the ordinal categories to numbers (a larger number means a more
# urgent priority / a more serious severity).
data['priority'] = data['priority'].fillna(value='None')
map_prio = {'priority':{'P1':5, 'P2':4,'P3':3,'P4':2,'P5':1,'None':0,'--':0}}
# Bugzilla spells the lowest severity 'enhancement'. The map used to contain
# only the misspelling 'enchancement', so real 'enhancement' rows were never
# replaced and stayed in the column as strings. The misspelled key is kept so
# that any data containing it is still mapped.
map_sev = {'severity':{'blocker':6, 'critical':5,'major':4,'normal':3,'minor':2,'trivial':1,
                       'enhancement':0,'enchancement':0}}
# replace() is applied per column and followed by infer_objects(): newer pandas
# versions no longer downcast the result of replace() automatically, which
# would leave the numeric ranks in an object-dtype column.
data['severity'] = data['severity'].replace(map_sev['severity']).infer_objects()
data['priority'] = data['priority'].replace(map_prio['priority']).infer_objects()
data.head()

In [None]:
# One-hot encode the remaining categorical columns, one column at a time,
# in this order: os, component, product, version.
for column_name, dummy_prefix in (
    ('os', 'OS'),
    ('component', 'component'),
    ('product', 'product'),
    ('version', 'version'),
):
    data = pd.get_dummies(data, columns=[column_name], prefix=[dummy_prefix])

In [None]:
# Split into training and test set.
# The features are every column except 'severity', so 'severity' is the target.
# The target used to be data['priority'] while 'priority' was still a column
# of X, which let the model read the label straight from its input and made
# the reported accuracy meaningless.
SPLIT_SEED = 42  # fixed seed so the split is the same on every run
X=data.drop('severity',axis=1)
Y=data['severity']
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.3,random_state = SPLIT_SEED)

In [None]:
# Train the model.
# A random forest is stochastic; without a fixed seed the printed accuracy
# changes on every run and results cannot be compared or reproduced.
MODEL_SEED = 42
clf = RandomForestClassifier(n_estimators = 15, random_state = MODEL_SEED)
clf.fit(X_train,Y_train)
# Predict on the held-out test set and report the share of correct predictions.
Y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(Y_test,Y_pred))