In [1]:
import pandas as pd
import numpy as np
# Load the bug-report export from the working directory and preview the first rows.
data = pd.read_csv("eclipse.csv")
data.head()

Unnamed: 0,Bug ID,Product,Component,Assignee,Status,Resolution,Summary,Changed,Assignee Real Name,Classification,...,Reporter,Reporter Real Name,Severity,Tags,Target Milestone,URL,Version,Votes,Whiteboard,Alias
0,3638,JDT,UI,aeschli,VERIFIED,FIXED,Package Viewer: order resource folders before ...,17-01-2002 07:28,Martin Aeschlimann,Eclipse,...,aeschli,Martin Aeschlimann,major,,---,,2,0,,
1,3854,JDT,UI,aeschli,VERIFIED,FIXED,Wrong execution's classpath. (1GEY0W0),18-01-2002 04:02,Martin Aeschlimann,Eclipse,...,david_audel,David Audel,normal,,---,,2,0,,
2,4188,JDT,UI,aeschli,VERIFIED,FIXED,type hierachy - typo (1GJW2XJ),28-01-2002 03:12,Martin Aeschlimann,Eclipse,...,erich_gamma,Erich Gamma,normal,,---,,2,0,,
3,5115,JDT,Debug,aeschli,VERIFIED,FIXED,Workspace source locator fails with mulitple p...,13-11-2001 10:11,Martin Aeschlimann,Eclipse,...,darin.eclipse,Darin Wright,normal,,---,,2,0,,
4,5820,JDT,UI,aeschli,VERIFIED,FIXED,Close all editors brings up hierarchy of object,20-11-2001 16:22,Martin Aeschlimann,Eclipse,...,jed.anderson,Jed Anderson,normal,,---,,2,0,,


In [2]:
# Columns that are not used by any later cell of this notebook.
unnecessary_columns = ['Bug ID', 'Changed', 'Assignee Real Name', 'Classification', 'Flags', 'Hardware', 'Keywords', 
                       'Number of Comments', 'Opened', 'OS', 'QA Contact', 'QA Contact Real Name', 'Reporter', 
                       'Reporter Real Name', 'Tags', 'Target Milestone', 'URL', 'Version', 'Votes', 'Whiteboard', 'Alias']
# Rebind `data` instead of dropping in place, and ignore columns that are
# already gone, so re-running this cell does not raise KeyError.
# Trade-off: a misspelled column name in the list above is silently ignored.
data = data.drop(columns=unnecessary_columns, errors='ignore')
data.head()

Unnamed: 0,Product,Component,Assignee,Status,Resolution,Summary,Priority,Severity
0,JDT,UI,aeschli,VERIFIED,FIXED,Package Viewer: order resource folders before ...,P1,major
1,JDT,UI,aeschli,VERIFIED,FIXED,Wrong execution's classpath. (1GEY0W0),P1,normal
2,JDT,UI,aeschli,VERIFIED,FIXED,type hierachy - typo (1GJW2XJ),P1,normal
3,JDT,Debug,aeschli,VERIFIED,FIXED,Workspace source locator fails with mulitple p...,P1,normal
4,JDT,UI,aeschli,VERIFIED,FIXED,Close all editors brings up hierarchy of object,P1,normal


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# preprocess_text needs three NLTK resources: the punkt tokenizer models
# (word_tokenize), the stop-word lists (stopwords.words) and the WordNet corpus
# (WordNetLemmatizer). Fetch all of them so the notebook runs on a fresh machine.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Identify important columns for preprocessing
important_columns = ['Summary']

# Lowercase the summaries and strip every character that is neither a word
# character nor whitespace. regex=True is passed explicitly: without it, recent
# pandas versions treat the pattern as a literal string and remove nothing.
# The raw string avoids invalid-escape warnings for \w and \s.
data['Summary'] = (
    data['Summary']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Preprocessing function
_TEXT_TOOLS = None  # lazily built (stop_words, stemmer, lemmatizer), shared by all calls


def _get_text_tools():
    """Build the stop-word set, stemmer and lemmatizer once and reuse them."""
    global _TEXT_TOOLS
    if _TEXT_TOOLS is None:
        _TEXT_TOOLS = (
            frozenset(stopwords.words('english')),
            PorterStemmer(),
            WordNetLemmatizer(),
        )
    return _TEXT_TOOLS


def preprocess_text(text):
    """Tokenize ``text`` and return its stemmed and lemmatized token lists.

    Steps: tokenize with ``word_tokenize``; keep only alphanumeric tokens,
    lowercased, that are not English stop words; then stem and lemmatize the
    kept tokens independently (lemmatization runs on the filtered tokens, not
    on the stems).

    Parameters
    ----------
    text : str
        Text to process. Any non-string value (e.g. ``None`` or a NaN from a
        missing DataFrame cell) is treated as empty text.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(stemmed_tokens, lemmatized_tokens)``; both lists have the same
        length and order.
    """
    # Missing cells arrive as float NaN / None; the tokenizer cannot handle
    # them, so treat them as "no tokens" instead of raising.
    if not isinstance(text, str):
        return [], []

    # The stop-word list, stemmer and lemmatizer do not depend on the input,
    # so they are created once rather than on every call.
    stop_words, stemmer, lemmatizer = _get_text_tools()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords and punctuation, and lowercase the words
    filtered_tokens = [
        word.lower()
        for word in tokens
        if word.isalnum() and word.lower() not in stop_words
    ]

    # Stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return stemmed_tokens, lemmatized_tokens

# Apply preprocessing to important columns
for column in important_columns:
    # preprocess_text returns a (stemmed, lemmatized) pair per row. Splitting the
    # pairs with map (instead of unpacking zip(*...)) also works when the frame
    # has no rows, where the unpacking would raise ValueError.
    token_pairs = data[column].apply(preprocess_text)
    data[column + '_stemmed'] = token_pairs.map(lambda pair: pair[0])
    data[column + '_lemmatized'] = token_pairs.map(lambda pair: pair[1])

# Every column listed in important_columns (currently only 'Summary') now has two
# companion columns holding token lists: <column>_stemmed and <column>_lemmatized.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\geeth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  if sys.path[0] == "":


In [4]:
# Preview the frame again to check the token-list columns added by the preprocessing cell.
data.head()

Unnamed: 0,Product,Component,Assignee,Status,Resolution,Summary,Priority,Severity,Summary_stemmed,Summary_lemmatized
0,JDT,UI,aeschli,VERIFIED,FIXED,package viewer order resource folders before r...,P1,major,"[packag, viewer, order, resourc, folder, resou...","[package, viewer, order, resource, folder, res..."
1,JDT,UI,aeschli,VERIFIED,FIXED,wrong executions classpath 1gey0w0,P1,normal,"[wrong, execut, classpath, 1gey0w0]","[wrong, execution, classpath, 1gey0w0]"
2,JDT,UI,aeschli,VERIFIED,FIXED,type hierachy typo 1gjw2xj,P1,normal,"[type, hierachi, typo, 1gjw2xj]","[type, hierachy, typo, 1gjw2xj]"
3,JDT,Debug,aeschli,VERIFIED,FIXED,workspace source locator fails with mulitple p...,P1,normal,"[workspac, sourc, locat, fail, mulitpl, packag...","[workspace, source, locator, fails, mulitple, ..."
4,JDT,UI,aeschli,VERIFIED,FIXED,close all editors brings up hierarchy of object,P1,normal,"[close, editor, bring, hierarchi, object]","[close, editor, brings, hierarchy, object]"


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_features(corpus, method='tfidf'):
    """Fit a text vectorizer on ``corpus`` and return the resulting features.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize.
    method : str, default 'tfidf'
        Feature-extraction method; only ``'tfidf'`` is accepted.

    Returns
    -------
    tuple
        ``(features, vectorizer)``: the output of ``fit_transform`` and the
        fitted ``TfidfVectorizer`` that produced it.

    Raises
    ------
    ValueError
        If ``method`` is anything other than ``'tfidf'``.
    """
    # Reject unsupported methods up front so the happy path stays flat.
    if method != 'tfidf':
        raise ValueError("Invalid method. Choose 'tfidf'.")

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(corpus)
    return matrix, tfidf

# Build the training corpus with the same pipeline predict_text applies at
# prediction time: product + component + summary -> preprocess_text -> stemmed
# tokens joined by spaces. Fitting the vectorizer on unstemmed text while
# predicting on stemmed text would leave most prediction tokens out of the
# vocabulary.
report_text = (
    data['Product'].fillna('') + ' '
    + data['Component'].fillna('') + ' '
    + data['Summary'].fillna('')  # fillna: a missing field must not turn the whole document into NaN
)
corpus = report_text.apply(lambda text: ' '.join(preprocess_text(text)[0]))

# Fit the TF-IDF vectorizer once on the corpus.
features, vectorizer = extract_features(corpus)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

def train_model(features, labels, evaluate=False):
    """Fit a Multinomial Naive Bayes classifier on an 80/20 train/test split.

    Parameters
    ----------
    features
        Feature matrix, passed to ``train_test_split`` together with ``labels``.
    labels
        Target labels, one per row of ``features``.
    evaluate : bool, default False
        When True, predict on the held-out 20% and print a
        ``classification_report``. Off by default so that no prediction is
        computed only to be thrown away.

    Returns
    -------
    MultinomialNB
        The classifier fitted on the training portion of the split.
    """
    # Fixed random_state keeps the split, and therefore the model, reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Train a Naive Bayes classifier
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # Evaluate the model only on request
    if evaluate:
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

    return model

def predict_text(product, component, summary, model, vectorizer):
    """Predict the label for a single bug report.

    The three text fields are joined with spaces, run through
    ``preprocess_text``, and the stemmed tokens (the lemmatized ones are
    ignored) are re-joined into one document. That document is transformed
    with ``vectorizer`` and passed to ``model.predict``.

    Parameters
    ----------
    product, component, summary : str
        Text fields of the bug report.
    model
        Object exposing ``predict``.
    vectorizer
        Object exposing ``transform``; called with a one-element list.

    Returns
    -------
    Whatever ``model.predict`` returns for the single transformed document.
    """
    report_text = ' '.join((product, component, summary))

    # Only the stemmed variant feeds the model.
    stems, _ = preprocess_text(report_text)
    document = ' '.join(stems)

    return model.predict(vectorizer.transform([document]))



# Pull the three target columns out of `data`.
severity_labels, priority_labels, assignee_labels = (
    data[target] for target in ('Severity', 'Priority', 'Assignee')
)

# Combine them into one space-separated composite label per row, so a single
# classifier predicts severity, priority and assignee together.
labels = severity_labels + ' ' + priority_labels + ' ' + assignee_labels

# Train on the vectorized corpus and the composite labels.
model = train_model(features, labels)



In [7]:
# Example: predict the composite label for one bug report, using the `model`
# and `vectorizer` created in the cells above.
product, component = "JDT", "UI"
summary = "Package Viewer: order resource folders before resources?"

prediction = predict_text(product, component, summary, model, vectorizer)

# Show the prediction exactly as returned by predict_text.
print("Predicted label:", prediction)


Predicted label: ['normal P3 daniel_megert']
