In [None]:
#Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re
import seaborn as sb
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!ls "/content/drive/My Drive/Bug Report classification"

In [None]:
data=pd.read_csv("Input/Bug report.csv")

In [None]:
# Set the 'Issue Type' column of every row to the literal string "bug".
# NOTE(review): later cells assume seven distinct issue types (see `final_c`
# and the `range(0,7,1)` loops) — confirm this blanket overwrite is intended.
data.loc[:,'Issue Type'] = "bug"
data.head()

In [None]:
# Columns that feed the text features and the label.
cols_of_interest=["Summary", "Description", "Issue Type"]
# Work on an independent copy of the *whole* frame rather than a three-column
# slice: later cells read `Status`, `Created` and `Resolved` and rename fifteen
# columns by position, all of which fail on a fresh run if only
# `cols_of_interest` is kept. Copying also keeps later column assignments from
# being chained assignments on a view of `data`.
dataset=data.copy()
print(dataset.shape)
dataset[cols_of_interest].head()

In [None]:
# Text clean-up of the two free-text columns.
#
# Every substitution is applied to each column in the order listed and the
# result is assigned back to the frame. Calling `replace(..., inplace=True)` on
# a selected column (`dataset[col].replace(...)`) is chained assignment: it is
# not guaranteed to write through to `dataset`, and never does under pandas
# Copy-on-Write.
TEXT_COLUMNS = ['Description', 'Summary']
TEXT_SUBSTITUTIONS = [
    # Markup tags such as <p> or <br/> become a single space.
    (r'<.+?>', r' '),
    # http(s) links (tolerating stray whitespace inside them) become a space.
    (r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', r' '),
    # A value that is, as a whole, an e-mail-like string is blanked
    # (the pattern is anchored with ^ and $).
    (r'^.+@[^\.].*\.[a-z]{2,}$', r''),
    # A value that is, as a whole, a plain http URL is blanked.
    (r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', r''),
    # A 10-digit phone number at the end of the value is removed.
    (r'\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', r''),
    # Integers and decimals are removed.
    (r'\d+(\.\d+)?', r''),
    # Anything that is not a word character or whitespace is removed.
    (r'[^\w\d\s]', r''),
]

# Detach from any frame `dataset` may be a slice of before assigning columns.
dataset = dataset.copy()
for text_col in TEXT_COLUMNS:
    cleaned = dataset[text_col]
    for pattern, replacement in TEXT_SUBSTITUTIONS:
        cleaned = cleaned.replace(to_replace=pattern, value=replacement, regex=True)
    # Lower-casing is the final step, after all substitutions.
    dataset[text_col] = cleaned.str.lower()

# Removing non-english content
# dataset.drop(dataset[dataset.language!="english"].index,inplace=True)

# Drop rows with a missing Description or Summary. reset_index() is called
# without drop=True on purpose: it keeps the old row labels as an 'index'
# column, which the positional column rename further down expects to be first.
dataset = dataset.dropna(subset=['Description','Summary']).reset_index()

In [None]:
dataset.info()

In [None]:
dataset['Issue Type']

In [None]:
# Bar chart of the number of non-null `Status` values per issue type.
fig, ax = plt.subplots(figsize=(8,6))
status_per_type = dataset.groupby('Issue Type')['Status'].count()
status_per_type.plot.bar(ylim=0, ax=ax)
plt.show()

In [None]:
final_c=['Bug','Improvement','New Feature','Sub-task','Task','Test','Wish']

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the issue-type strings with integer class ids; `label` is kept so
# the ids can be mapped back to names later.
label = LabelEncoder()
label.fit(dataset["Issue Type"])
dataset["Issue Type"] = label.transform(dataset["Issue Type"])
dataset.head()


In [None]:
dataset.info()

In [None]:
# Unigram + bigram TF-IDF over the summaries, English stop words removed.
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words='english')
transformed_summary = tfidf.fit_transform(dataset.Summary)
# fit() hands back the estimator itself, so both names refer to one object.
vectorizer = tfidf
#transformed_title = vectorizer.transform(dataset.Description)

In [None]:
dataset["Created"][26]

In [None]:
# `get_feature_names()` was removed in scikit-learn 1.2. Use its replacement
# when the installed version has it and fall back on older releases. The
# result is turned into a list so its type does not depend on the version.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

In [None]:
#Returns dictionary with term names and total tfidf scores for all terms in corpus
def get_tfidf_term_scores(feature_names, matrix=None):
    """Map every term to its total TF-IDF weight over all documents.

    Parameters
    ----------
    feature_names : sequence of str
        Term names; entry ``j`` names column ``j`` of ``matrix``.
    matrix : numpy array or scipy sparse matrix, shape (n_docs, n_terms), optional
        Document-term weights. When omitted, the notebook-level
        ``transformed_summary`` is used, which is what this function always
        read before the parameter existed.

    Returns
    -------
    dict
        ``{term: summed weight}``, in the order of ``feature_names``.
    """
    if matrix is None:
        matrix = transformed_summary
    # A single column-wise reduction replaces slicing and densifying one row
    # of the transpose per term, which was very slow for large vocabularies.
    # np.asarray(...).ravel() flattens the 1 x n_terms result that sparse
    # matrices return.
    column_totals = np.asarray(matrix.sum(axis=0)).ravel()
    return {term: column_totals[col] for col, term in enumerate(feature_names)}

In [None]:
term_corpus_dict = get_tfidf_term_scores(feature_names)

In [None]:
#Returns sort words from highest score to lowest score
def get_sorted_tfidf_scores(term_corpus_dict):
    """Rank the terms of a ``{term: score}`` mapping from highest to lowest score.

    Returns a pair of numpy arrays ``(term_names, scores)`` that are aligned
    with each other and ordered by descending score.
    """
    names = np.array(list(term_corpus_dict.keys()))
    values = np.array(list(term_corpus_dict.values()))
    # argsort ranks ascending; reversing it gives the descending order.
    descending = np.argsort(values)[::-1]
    return names[descending], values[descending]

In [None]:
termNames, scores = get_sorted_tfidf_scores(term_corpus_dict)

In [None]:
def plot_tfidf_scores(scores,termNames, n_words = 20):
    """Draw a horizontal bar chart of the first ``n_words`` terms and their scores.

    Parameters
    ----------
    scores : sequence of float
        Scores aligned with ``termNames``. Only the first ``n_words`` entries
        are plotted, so callers pass them already ranked.
    termNames : sequence of str
        Term labels, one per score.
    n_words : int, default 20
        Number of leading entries to plot; also shown in the title.

    Returns
    -------
    None
        The chart is drawn on a new 15x15 figure, in the top-left cell of a
        2x2 subplot grid.
    """
    fig = plt.figure(figsize = (15, 15))
    fig.add_subplot(221)
    sb.set()
    sb.barplot(x = scores[:n_words], y = termNames[:n_words])
    # The title used to hard-code "20" and call .format() on a string without
    # a placeholder, so it was wrong for any other n_words.
    plt.title(" Top tfidf score of top {} words in Summary ".format(n_words))
    plt.xlabel("TFIDF Score")

In [None]:
plot_tfidf_scores(scores, termNames, n_words = 20)

In [None]:
# diff_corpus[k] collects the summaries of every row whose encoded issue type is k.
diff_corpus = [[] for _ in range(7)]
for row in range(dataset.shape[0]):
    issue_class = dataset["Issue Type"][row]
    diff_corpus[issue_class].append(dataset.Summary[row])

In [None]:
most_freq_w_in_class=list()

In [None]:
# For each class: refit the TF-IDF vectoriser on that class's summaries, rank
# its terms, remember the ten strongest and plot the top twenty.
for class_id in range(0,7,1):
    if not diff_corpus[class_id]:
        # Fitting a vectoriser on an empty corpus raises. Record an empty
        # column so the per-class lists stay aligned with the seven classes.
        most_freq_w_in_class.append([])
        continue
    vectorizer = tfidf.fit(diff_corpus[class_id])
    # Rebinds the notebook-level name that get_tfidf_term_scores() reads.
    transformed_summary = vectorizer.transform(diff_corpus[class_id])
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when available.
    if hasattr(tfidf, "get_feature_names_out"):
        feature_names = list(tfidf.get_feature_names_out())
    else:
        feature_names = tfidf.get_feature_names()
    term_corpus_dict = get_tfidf_term_scores(feature_names)
    termNames, scores = get_sorted_tfidf_scores(term_corpus_dict)
    most_freq_w_in_class.append(list(termNames[0:10]))
    plot_tfidf_scores(scores, termNames, n_words = 20)

In [None]:
#list(termNames[0:10])

In [None]:
most_freq_w_in_class

In [None]:
type(dataset.Summary)

In [None]:
import plotly.graph_objects as go
fig = go.Figure(data=[go.Table(header=dict(values=['Bug','Improvement','New Feature','Sub-task','Task','Test','Wish']),
                 cells=dict(values=most_freq_w_in_class))])
fig.show()
#fig.suptitle('test title', fontsize=20)

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop=stopwords.words('english')

In [None]:
# l_of_words[k] collects every summary token of class k that is not a stop word.
# The stop words are put in a set once: `stop` is a list, and testing each
# token against it scans the whole list every time.
stop_words = set(stop)
l_of_words = [[] for _ in range(7)]
for row in range(dataset.shape[0]):
    class_words = l_of_words[dataset["Issue Type"][row]]
    class_words.extend(w for w in dataset.Summary[row].split() if w not in stop_words)

In [None]:
for i in range(0,7,1):
  print("class "+str(i))
  print(l_of_words[i])

In [None]:
from collections import Counter
# word_list[k] holds the ten most frequent tokens of class k, most frequent
# first; the (token, count) pairs are printed as they are computed.
word_list = []
for class_words in l_of_words[:7]:
    most_occur = Counter(class_words).most_common(10)
    word_list.append([token for token, _count in most_occur])
    print(most_occur)


In [None]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Table(header=dict(values=['Bug','Improvement','New Feature','Sub-task','Task','Test','Wish']),
                 cells=dict(values=word_list))
                     ])
fig.show()
print("Top 10 Most Frequent Words In Each Category")

In [None]:
def difference_in_date(create,resolve):
    """Return the time elapsed from ``create`` to ``resolve``, in hours.

    Both arguments are timestamp strings that are read by fixed position:
    characters 0-1 are the day, 3-5 the English month abbreviation
    ('Jan' .. 'Dec'), 7-8 the two-digit year, 10-11 the hour and 13-14 the
    minute, e.g. ``"05/Mar/19 14:30"``. Anything from position 15 on is ignored.

    The difference is taken between real calendar dates. The previous
    arithmetic counted every month as 720 hours and every year as 8760 hours,
    so for example 31 Jan 10:00 -> 1 Feb 10:00 came out as 0 hours, not 24.

    Parameters
    ----------
    create, resolve : str
        Timestamps in the layout described above.

    Returns
    -------
    float
        Hours between the two timestamps; negative when ``resolve`` is
        earlier than ``create``.

    Raises
    ------
    ValueError
        If a numeric field is not a number or the fields do not form a valid date.
    KeyError
        If the month abbreviation is not one of 'Jan' .. 'Dec'.
    """
    from datetime import datetime

    month_number = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,
                    'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}

    def _parse(stamp):
        # NOTE(review): two-digit years are taken to be 20YY — confirm for the data set.
        # NOTE(review): an AM/PM marker, if the export has one, is ignored
        # here exactly as it was before — confirm the hour is on a 24-hour clock.
        return datetime(2000 + int(stamp[7:9]),
                        month_number[stamp[3:6]],
                        int(stamp[0:2]),
                        int(stamp[10:12]),
                        int(stamp[13:15]))

    return (_parse(resolve) - _parse(create)).total_seconds() / 3600

In [None]:
dataset.Created[3654],dataset.Resolved[3654]

In [None]:
# Month abbreviation -> month number. The mapping is deliberately not named
# `dict`: binding that name at notebook level hides the built-in, so any cell
# run afterwards that calls dict(...) would fail.
month_numbers={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
month_numbers[dataset.Created[1][3:6]]

In [None]:
# x[k]: summed resolution time of class k (as returned by difference_in_date);
# y[k]: number of issues seen for class k.
x = [0] * 7
y = [0] * 7
for row in range(dataset.shape[0]):
    issue_class = dataset["Issue Type"][row]
    x[issue_class] += difference_in_date(dataset.Created[row], dataset.Resolved[row])
    y[issue_class] += 1



In [None]:
# Mean time to repair per class: summed hours divided by the issue count.
# A class with no issues used to raise ZeroDivisionError; it is reported as
# NaN, which keeps the list seven entries long.
final_mttr = [x[k] / y[k] if y[k] else float("nan") for k in range(7)]
final_mttr

In [None]:

   # this is for plotting mean time to repair
# The statements below start at column 0. They used to be indented as a whole,
# at a different column from the comment above, and a cell body indented like
# that is not valid Python on its own.
plt.figure(figsize=(8,6))
index = np.arange(len(final_c))
plt.bar(index,final_mttr)
plt.xlabel('Issue Type', fontsize=15)
plt.ylabel('MTTR (in hours)', fontsize=15)
plt.xticks(index, final_c, fontsize=15, rotation=80)
plt.title('Mean Time To Repair for Each Issue Type')
plt.show()

In [None]:
type(dataset)

In [None]:
from google.colab import files

dataset.to_csv('final1.csv')
#files.download('final1.csv')

In [None]:
dataset.head(10)

In [None]:
print("The data-set has %d rows and %d columns"%(dataset.shape[0],dataset.shape[1]))

# Finding out which columns have missing values (not strictly needed, since we only work with Summary or Description, and neither of them has any missing values)

In [None]:
from __future__ import print_function
# List the columns, then print "<column>: <number of missing values>" for each.
print (dataset.columns)
for col_name in dataset.columns:
    missing = sum(dataset[col_name].isnull())
    print ("{}: {}".format(col_name, missing))

# To see which rows are duplicated

In [None]:
sum(dataset.duplicated()) # which is indeed very good as the result is 0.

# finding out class distribution

In [None]:
category_counter={x:0 for x in set(dataset['Issue Type'])}

In [None]:
for each_cat in dataset['Issue Type']:
    category_counter[each_cat]+=1

In [None]:
print(category_counter)

#Combining the Columns of summary and description and then applying NLP

In [None]:
dataset['Description']

In [None]:
dataset['Summary']

In [None]:
dataset["Merger"] = dataset["Summary"].str.cat(dataset["Description"], sep =" \n ") 

In [None]:
dataset['Merger']

In [None]:
dataset

In [None]:
# The corpus is the merged Summary + Description text of every issue.
corpus=dataset.Merger
# TF-IDF vectoriser with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')
# Fit on the corpus and materialise the weights as a dense ndarray.
# toarray() is used rather than todense(): todense() returns a numpy.matrix,
# which recent scikit-learn estimators refuse as input.
tfidf_matrix=vectorizer.fit_transform(corpus).toarray()
# Feature (term) names, one per matrix column. get_feature_names() was removed
# in scikit-learn 1.2, so use its replacement when available.
if hasattr(vectorizer, "get_feature_names_out"):
    tfidf_names=list(vectorizer.get_feature_names_out())
else:
    tfidf_names=vectorizer.get_feature_names()

In [None]:
print("Number of TFIDF Features: %d"%len(tfidf_names)) #same info can be gathered by using tfidf_matrix.shape

In [None]:
training_time_container={'b_naive_bayes':0,'mn_naive_bayes':0,'random_forest':0,'linear_svm':0}
prediction_time_container={'b_naive_bayes':0,'mn_naive_bayes':0,'random_forest':0,'linear_svm':0}

accuracy_container={'b_naive_bayes':0,'mn_naive_bayes':0,'random_forest':0,'linear_svm':0}

#**Learning Classifiers, Making Predictions and Validating Results**

##**Set the GPU to on before training for lesser training time.**

In [None]:
# Rename every column by position to identifier-friendly names, so that e.g.
# the label can be read as `dataset.Issue_Type`.
# NOTE(review): this assumes `dataset` has exactly these 15 columns in this
# order, with "index" first and "Merger" last. A different input layout would
# raise on a length mismatch, or mislabel columns if only the order differs —
# confirm against the input file.
dataset.columns=["index","Issue_key","Issue_id","Summary","Status","Description","Priority","Resolution","Assignee","Created","Updated","Last_Viewed","Resolved","Issue_Type","Merger"]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import sklearn.metrics

In [None]:
dataset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import sklearn.metrics
# Features are the TF-IDF weights of the merged text; the target is the
# encoded issue type.
variables = tfidf_matrix
labels = dataset.Issue_Type
# 70/30 split. The seed is fixed so that the split, and every score computed
# from it below, is the same on each run of the notebook.
variables_train, variables_test, labels_train, labels_test = train_test_split(
    variables, labels, test_size=.3, random_state=42)

In [None]:
#analyzing the shape of the training and test data-set:
print('Shape of Training Data: '+str(variables_train.shape))
print('Shape of Test Data: '+str(variables_test.shape))

#**Applying Naive Bayes**

two types:-
*   Bernoulli
*   Multinomial



In [None]:
training_time_container.keys()

#**Bernoulli**

In [None]:
from time import time
from sklearn.naive_bayes import BernoulliNB
# Bernoulli Naive Bayes classifier from scikit-learn.
bnb_classifier=BernoulliNB()
# Time only the fit call; the elapsed wall-clock seconds are stored under the
# classifier's key in training_time_container.
t0=time()
bnb_classifier=bnb_classifier.fit(variables_train,labels_train)
training_time_container['b_naive_bayes']=time()-t0

In [None]:
#after the model has been trained, we proceed to test its performance on the test data:
t0=time()
bnb_predictions=bnb_classifier.predict(variables_test)
prediction_time_container['b_naive_bayes']=time()-t0

In [None]:
prediction_time_container['b_naive_bayes']

In [None]:
nb_ascore=sklearn.metrics.accuracy_score(labels_test, bnb_predictions)
accuracy_container['b_naive_bayes']=nb_ascore

In [None]:
print("Bernoulli Naive Bayes Accuracy Score: %f"%accuracy_container['b_naive_bayes'])
print("Training Time: %f"%training_time_container['b_naive_bayes'])
print("Prediction Time: %f"%prediction_time_container['b_naive_bayes'])

In [None]:
print("Confusion Matrix of Bernoulli Naive Bayes Classifier output: ")
sklearn.metrics.confusion_matrix(labels_test,bnb_predictions)

In [None]:
print("Classification Metrics: ")
print(sklearn.metrics.classification_report(labels_test,bnb_predictions))
# Accuracy can be misleading when the classes are imbalanced. The F1-score in
# the report above is a better measure of classifier performance; the higher
# the F1-score, the better.
# NOTE(review): the remark that used to stand here said the data set has
# negligible class imbalance, while later notes in this notebook call it highly
# unbalanced — confirm which holds before reading accuracy as the headline figure.

#**Multinomial Naive Bayes**
Bernoulli Naive Bayes only uses whether a feature is present or not. If we also take into account the occurrence weight or count of each feature (in our case, its TF-IDF weight), we can hypothesize that such a classifier will perform equally well, if not better.

In [None]:
from sklearn.naive_bayes import MultinomialNB
mn_bayes=MultinomialNB()
t0=time()
mn_bayes_fit=mn_bayes.fit(variables_train,labels_train)
training_time_container['mn_naive_bayes']=time()-t0
t0=time()
prediction_mn=mn_bayes_fit.predict(variables_test)
prediction_time_container['mn_naive_bayes']=time()-t0
mn_ascore=sklearn.metrics.accuracy_score(labels_test, prediction_mn) 
accuracy_container['mn_naive_bayes']=mn_ascore

In [None]:
print("Accuracy Score of Multi-Nomial Naive Bayes: %f" %(mn_ascore))
#and its training and prediction time are:
print("Training Time: %fs"%training_time_container['mn_naive_bayes'])
print("Prediction Time: %fs"%prediction_time_container['mn_naive_bayes'])

#**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier=RandomForestClassifier(n_estimators=50)
t0=time()
rf_classifier=rf_classifier.fit(variables_train,labels_train)

training_time_container['random_forest']=time()-t0
print("Training Time: %fs"%training_time_container['random_forest'])

t0=time()
rf_predictions=rf_classifier.predict(variables_test)
prediction_time_container['random_forest']=time()-t0
print("Prediction Time: %fs"%prediction_time_container['random_forest'])

accuracy_container['random_forest']=sklearn.metrics.accuracy_score(labels_test, rf_predictions)
print ("Accuracy Score of Random Forests Classifier: ")
print(accuracy_container['random_forest'])
print(sklearn.metrics.confusion_matrix(labels_test,rf_predictions))

#**Linear SVM using Stochastic Gradient Descent**
Stochastic Gradient Descent (SGD) is one of the most efficient approaches for training linear classifiers under convex loss functions, such as (linear) Support Vector Machines. It has proven to perform well on large-scale and sparse machine learning problems.

In [None]:
from sklearn import linear_model

svm_classifier=linear_model.SGDClassifier(loss='hinge',alpha=0.0001)

t0=time()
svm_classifier=svm_classifier.fit(variables_train, labels_train)
training_time_container['linear_svm']=time()-t0
print("Training Time: %fs"%training_time_container['linear_svm'])

t0=time()
svm_predictions=svm_classifier.predict(variables_test)
prediction_time_container['linear_svm']=time()-t0
print("Prediction Time: %fs"%prediction_time_container['linear_svm'])

accuracy_container['linear_svm']=sklearn.metrics.accuracy_score(labels_test, svm_predictions)
print ("Accuracy Score of Linear SVM Classifier: %f"%accuracy_container['linear_svm'])
print(sklearn.metrics.confusion_matrix(labels_test,svm_predictions))

In [None]:
#if we train the SGD Classifier with elastic net penalty, it  brings more sparsity to the model not possible with the L2:
svm_classifier_enet=linear_model.SGDClassifier(loss='hinge',alpha=0.0001,penalty='elasticnet')
svm_classifier_enet=svm_classifier_enet.fit(variables_train, labels_train)
svm_enet_predictions=svm_classifier_enet.predict(variables_test)
print ("Accuracy Score of Linear SVM Classifier: %f"%sklearn.metrics.accuracy_score(labels_test,svm_enet_predictions))

In [None]:
import matplotlib.pyplot as plt
with plt.style.context('fivethirtyeight'):
    plt.figure(figsize=(20,8))
    plt.bar(range(4),training_time_container.values(),align='center')
    plt.xticks(range(4),training_time_container.keys(),fontsize = 15)
    plt.ylabel("Training time in seconds")
    plt.ylim(0,100)
    plt.grid(True)
    plt.title("Comparison of Training Time of different classifiers")

In [None]:
with plt.style.context('fivethirtyeight'):
    plt.figure(figsize=(20,8))
    plt.bar(range(4),prediction_time_container.values(),align='center',color='orange')
    plt.xticks(range(4),prediction_time_container.keys(),fontsize = 15)
    plt.ylabel("Prediction time in seconds")
    plt.grid(True)
    plt.ylim(0,2)
    plt.title("Comparison of Prediction Time of different classifiers")

In [None]:
with plt.style.context('fivethirtyeight'):
    plt.figure(figsize=(20,8))
    plt.bar(range(4),accuracy_container.values(),align='center',color='g')
    plt.xticks(range(4),accuracy_container.keys(),fontsize = 15)
    plt.ylabel("Accuracy Scores")
    plt.grid(True)
    plt.title("Comparison of Accuracy Scores of different classifiers")
    plt.ylim(0.5,1.0)

#**So far we are using the unbalanced dataset, which is why the accuracy of every classifier is so low.**
#**Suggestions for improving the accuracy and the readability are welcome.**


#**Ensemble Learning**

##Voting Classifier

training_time_container={'b_naive_bayes':0,'mn_naive_bayes':0,'random_forest':0,'linear_svm':0}

prediction_time_container={'b_naive_bayes':0,'mn_naive_bayes':0,'random_forest':0,'linear_svm':0}

accuracy_container={'b_naive_bayes':0,'mn_naive_bayes':0,'random_forest':0,'linear_svm':0}

#Variables used earlier for classifiers.

bnb_classifier=BernoulliNB()

mn_bayes=MultinomialNB()

rf_classifier=RandomForestClassifier(n_estimators=50)

svm_classifier=linear_model.SGDClassifier(loss='hinge',alpha=0.0001)

nl_svm_classifier=SVC(C=1000000.0, gamma='auto_deprecated', kernel='rbf')

variables_train, variables_test, labels_train, labels_test  =   train_test_split(variables, labels, test_size=.3)



#**Hard Voting / Majority Voting**

In [None]:
from sklearn.ensemble import VotingClassifier
# Hard (majority) voting over the four base classifiers; each entry is a
# (name, estimator) pair.
estimators = [
    ('Bernoulli', bnb_classifier),
    ('Multinomial', mn_bayes),
    ('RandomForest', rf_classifier),
    ('SVMLinear', svm_classifier),
    #('SVMNonLinear', nl_svm_classifier),
]
ensemble = VotingClassifier(estimators, voting='hard')

In [None]:
#fit model to training data
t0=time()
ensemble.fit(variables_train, labels_train)

ensemble_training_time=time()-t0
training_time_container['Hard'] = ensemble_training_time
print("Training Time: %fs"%ensemble_training_time)
#test our model on the test data
ensemble.score(variables_test, labels_test)

In [None]:
t0=time()
ensemble_predictions=ensemble.predict(variables_test)
ensemble_prediction_time = time() - t0
prediction_time_container['Hard'] = ensemble_prediction_time
print("Prediction Time: %fs"%ensemble_prediction_time)

accuracy_container['Ensemble_hard']=sklearn.metrics.accuracy_score(labels_test, ensemble_predictions)
print("Accuracy Score of Hard-Voting Ensemble is : %f" %accuracy_container['Ensemble_hard'])

print(sklearn.metrics.confusion_matrix(labels_test,ensemble_predictions))

##If ‘soft’, predicts the class label based on the argmax of the sums of the predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers.

So what we need are calibrated classifiers. With loss='hinge' the SVM classifier is not calibrated, so it has to be calibrated first.


SGDClassifier(loss = 'hinge') does not have probability by default.

You have to pass SGDclassifier(loss = 'hinge') to CalibratedClassifierCV() which will calculate the probability values of SGDclassifier(loss = 'hinge')

In [None]:
# Fit the linear SVM, then wrap the already-fitted model in a probability
# calibrator (cv='prefit' marks the wrapped estimator as already fitted).
# NOTE(review): the calibrator is fitted on the same rows the SVM was trained
# on; calibration is normally done on held-out data — confirm.
# NOTE(review): `svm_calibrator` is not among the estimators passed to the
# soft-voting ensemble in the following cell — confirm whether it was meant to be.
svm_classifier=svm_classifier.fit(variables_train, labels_train)
from sklearn.calibration import CalibratedClassifierCV
svm_calibrator = CalibratedClassifierCV(svm_classifier, cv='prefit')
svm_calibrator = svm_calibrator.fit(variables_train, labels_train)

In [None]:
# Soft voting combines predicted class probabilities, so the ensemble is
# built from the three probabilistic base classifiers.
estimators1 = [
    ('Bernoulli',bnb_classifier),
    ('Multinomial',mn_bayes),
    ('RandomForest',rf_classifier),
]
ensemble2 = VotingClassifier(estimators1,voting = 'soft')
#fit model to training data
t0=time()
ensemble2.fit(variables_train, labels_train)

ensemble2_training_time=time()-t0
print("Training Time: %fs"%ensemble2_training_time)
#test our model on the test data
# Accuracy is a fraction, not a duration, so it is printed without the
# trailing "s" that the timing lines carry.
print("Accuracy: %f"%ensemble2.score(variables_test, labels_test))
t0=time()
ensemble2_predictions=ensemble2.predict(variables_test)
ensemble2_prediction_time = time() - t0
training_time_container['soft'] = ensemble2_training_time
prediction_time_container['soft'] = ensemble2_prediction_time
print("Prediction Time: %fs"%ensemble2_prediction_time)

accuracy_container['Ensemble_soft']=sklearn.metrics.accuracy_score(labels_test, ensemble2_predictions)
print("Accuracy Score of Soft-Voting Ensemble is : %f" %accuracy_container['Ensemble_soft'])

print(sklearn.metrics.confusion_matrix(labels_test,ensemble2_predictions))

#**Bagged Decision Trees**
## BaggingClassifier with the Classification and Regression Trees algorithm (DecisionTreeClassifier). A total of 100 trees are created.

In [None]:
from time import time 
import random
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
cart = DecisionTreeClassifier()
num_trees = 100       #tweaking this value for accuracy increase, but keep an eye for overfitting.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, and the first
# positional slot works under both names.
# random_state is a fixed integer. The previous `random.seed()` call returns
# None, so the ensemble was in fact unseeded and differed on every run.
Bagging_classifier= BaggingClassifier(cart, n_estimators=num_trees, random_state=7)
t0=time()
Bagging_classifier.fit(variables_train, labels_train)

BaggingClassifier_training_time=time()-t0
print("Training Time: %fs"%BaggingClassifier_training_time)
training_time_container['bagging'] = BaggingClassifier_training_time

#test our model on the test data
# Accuracy is a fraction, not a duration: no trailing "s".
print("Accuracy: %f"%Bagging_classifier.score(variables_test, labels_test))


In [None]:
t0=time()
BaggingClassifier_predictions=Bagging_classifier.predict(variables_test)
BaggingClassifier_prediction_time = time() - t0
print("Prediction Time: %fs"%BaggingClassifier_prediction_time)
prediction_time_container['bagging'] = BaggingClassifier_prediction_time

accuracy_container['Bagging_classifier']=sklearn.metrics.accuracy_score(labels_test, BaggingClassifier_predictions)
print("Accuracy Score of Bagging classifier is : %f" %accuracy_container['Bagging_classifier'])

print(sklearn.metrics.confusion_matrix(labels_test,BaggingClassifier_predictions))

#**Boosting Algorithm**
##**Adaboost**

It generally works by weighting instances in the dataset by how easy or difficult they are to classify, allowing the algorithm to pay more or less attention to them in the construction of subsequent models.

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 100 boosting rounds and a fixed seed; training and prediction
# are timed separately and recorded in the shared containers.
seed = 7
num_trees = 100
Adaboost_classifier = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
t0=time()
Adaboost_classifier.fit(variables_train, labels_train)

AdaboostClassifier_training_time=time()-t0
print("Training Time: %fs"%AdaboostClassifier_training_time)

#test our model on the test data
# Accuracy is a fraction, not a duration: no trailing "s".
print("Accuracy: %f"%Adaboost_classifier.score(variables_test, labels_test))
t0=time()
AdaboostClassifier_predictions = Adaboost_classifier.predict(variables_test)
AdaboostClassifier_prediction_time = time() - t0
print("Prediction Time: %fs"%AdaboostClassifier_prediction_time)
training_time_container['adaboost'] = AdaboostClassifier_training_time
prediction_time_container['adaboost'] = AdaboostClassifier_prediction_time

accuracy_container['Adaboost_classifier']=sklearn.metrics.accuracy_score(labels_test, AdaboostClassifier_predictions)
print("Accuracy Score of Adaboost classifier is : %f" %accuracy_container['Adaboost_classifier'])

print(sklearn.metrics.confusion_matrix(labels_test,AdaboostClassifier_predictions))

##**Stochastic Gradient Boosting**


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 100 trees and a fixed seed; training and prediction
# are timed separately and recorded in the shared containers.
seed = 7
num_trees = 100
SGB_classifier = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
t0=time()
SGB_classifier.fit(variables_train, labels_train)

SGBClassifier_training_time=time()-t0
print("Training Time: %fs"%SGBClassifier_training_time)

#test our model on the test data
# Accuracy is a fraction, not a duration: no trailing "s".
print("Accuracy: %f"%SGB_classifier.score(variables_test, labels_test))
t0=time()
SGBClassifier_predictions = SGB_classifier.predict(variables_test)
SGBClassifier_prediction_time = time() - t0
print("Prediction Time: %fs"%SGBClassifier_prediction_time)
training_time_container['sgb'] = SGBClassifier_training_time
prediction_time_container['sgb'] = SGBClassifier_prediction_time

accuracy_container['SGB_classifier']=sklearn.metrics.accuracy_score(labels_test, SGBClassifier_predictions)
print("Accuracy Score of SGB_classifier is : %f" %accuracy_container['SGB_classifier'])

print(sklearn.metrics.confusion_matrix(labels_test,SGBClassifier_predictions))

## The accuracy we get is not meaningful, because the dataset is highly unbalanced and cannot give good results as it stands.
Two things for future work: cross-validation, or correcting the dataset, i.e. balancing its classes.

Cross-validation is unlikely to help much. Balancing the classes by duplicating samples class by class seems to be the only option for now.

Moreover, it is surprising that the individual classifiers perform much better than the ensembles, whereas the reverse would be expected.

In [None]:
# Accuracy of every classifier recorded so far. The bar count follows the
# container instead of a hard-coded 9, so the cell still runs when some of the
# classifier cells above were skipped.
with plt.style.context('fivethirtyeight'):
    positions = range(len(accuracy_container))
    plt.figure(figsize=(25,8))
    plt.bar(positions,list(accuracy_container.values()),align='center',color='g')
    plt.xticks(positions,list(accuracy_container.keys()),fontsize = 15)
    plt.ylabel("Accuracy Scores")
    plt.grid(True)
    plt.title("Comparison of Accuracy Scores of different classifiers")
    plt.ylim(0.3,1.0)

In [None]:
# Training time of every classifier recorded so far.
with plt.style.context('fivethirtyeight'):
    positions = range(len(training_time_container))
    plt.figure(figsize=(25,8))
    plt.bar(positions,list(training_time_container.values()),align='center',color='g')
    plt.xticks(positions,list(training_time_container.keys()),fontsize = 15)
    plt.ylabel("training time (seconds)")
    plt.grid(True)
    plt.title("Comparison of training time of different classifiers")
    # Times are not bounded by 1: the 0.3-1.0 window copied from the accuracy
    # chart hid short bars and cut off long ones. Only the lower limit is
    # pinned, at zero.
    plt.ylim(bottom=0)

In [None]:
# Prediction time of every classifier recorded so far.
with plt.style.context('fivethirtyeight'):
    positions = range(len(prediction_time_container))
    plt.figure(figsize=(25,8))
    plt.bar(positions,list(prediction_time_container.values()),align='center',color='g')
    plt.xticks(positions,list(prediction_time_container.keys()),fontsize = 15)
    plt.ylabel("prediction time (seconds)")
    plt.grid(True)
    plt.title("Comparison of prediction time of different classifiers")
    # Times are not bounded by 1: the 0.3-1.0 window copied from the accuracy
    # chart hid short bars and cut off long ones. Only the lower limit is
    # pinned, at zero.
    plt.ylim(bottom=0)