# Análisis

In [47]:
# !pip install pyvolutionary==2.4.2
# !pip install pycaret
# !pip install pycaret[full]
# !pip install scikit_learn==1.4

In [48]:
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from numpy.linalg import svd
import requests
import json


sscaler = StandardScaler()
rscaler = RobustScaler()
mmscaler = MinMaxScaler()
mascaler = MaxAbsScaler()

le = LabelEncoder()

def lencoder(df, col, lenc=False, train=True):
    if train:
        label = le.fit_transform(df[col])
        df[col] = pd.to_numeric(label)
        return df,le
    elif lenc != False:
        label = lenc.transform(df[col])
        df[col] = pd.to_numeric(label)
        return df
    else:
        print('Nothing done')
        return df



In [49]:
def run_kfold_model (X,y,K=10,graph=True):
    regr = LinearRegression()
    kf = KFold(n_splits=5)
    s=1
    for train_index, test_index in kf.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regr.fit(X_train, y_train)
        print('Slice '+str(s)+' score: '+str(regr.score(X_test, y_test)))
        y_pred = regr.predict(X_test)
        if graph:
            plt.scatter(y_test, y_pred, color = 'red', marker = 'o', s = 35, alpha = 0.5, label = 'Test data')
            plt.show()
        s+=1
    return(regr)

In [50]:
def scores (model, X_train, X_test, y_test, y_pred, graph=True):
    MAE=metrics.mean_absolute_error(y_test, y_pred)
    MSE=metrics.mean_squared_error(y_test, y_pred)
    RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    CV_RMSE= (RMSE/np.average(y_test))*100
    R2=metrics.r2_score(y_test, y_pred)
    ACC=metrics.accuracy_score
    REC=metrics.recall_score
    PREC=metrics.precision_score

    print('MAE:'+str(MAE))
    print('MSE:'+str(MSE))
    print('RMSE:'+str(RMSE))
    print('CV(RMSE): '+str(CV_RMSE))
    print('R2:'+str(R2))
    print('Accuracy:'+str(ACC))
    print('Recall:'+str(REC))
    print('Precision:'+str(PREC))

    # print('Training score: '+str(model.score(X_train, y_train))) #(Accuracy or R2)
    # print('Test score: '+str(model.score(X_test, y_train)))
    if graph == True:
        plt.scatter(y_train, y_pred, color = 'red', marker = 'o', s = 35, alpha = 0.5, label = 'Test data')
        plt.show()

In [51]:
import dask.dataframe as dd

In [52]:
from ast import literal_eval
def clean(x):
    return literal_eval(x)

In [53]:
pd.set_option("mode.copy_on_write", True)
pd.options.mode.copy_on_write = True

In [54]:
import math

def dummies(df, col):
    tDummy = pd.get_dummies(df[col]).add_prefix(col+'_')
    df = pd.concat([df,tDummy],axis=1).reindex(df.index)
    return(df)

def just_dummies(df, col):
    tDummy = pd.get_dummies(df[col]).add_prefix(col+'_')
    return(tDummy)

def just_dummies_steps(df, col, ix):
    num_chunks = math.ceil(len(df) / ix)  # Calculate number of chunks
    all_dummies = []  # Store results from each chunk

    for i in range(num_chunks):
        start = i * ix  # Starting index for the current chunk
        end = (i + 1) * ix  # Ending index for the current chunk

        chunk = df.loc[start:end-1] # Select chunk of data for the dummies
        chunk_dummies = just_dummies(chunk, col)  # Apply just_dummies to chunk
        all_dummies.append(chunk_dummies)  # Append the output to the list

    # Concatenate all dummy DataFrames horizontally
    final_dummies = pd.concat(all_dummies, axis=1)
    return final_dummies

# Checkpoint 1

In [None]:
# data = pd.read_excel('mozilla-bugs-all.xlsx')
# data.describe

In [None]:
def json_data(URL,params):
    headers = {"Accept" : "application/json"}
    resp = requests.get(URL, params= params, headers= headers)
    df = pd.DataFrame()
    if resp.status_code != 200:
        print('error: ' + str(resp.status_code))
    else:
        print('Success')
        bugs = resp.text
        data = json.loads(bugs)
        df = pd.json_normalize(data['bugs'])
    return(df)

params ={
    "include_fields" : ["id"
                        ,"summary"
                        ,"status"
                        ,"description"
                        ,"type"
                        ,"classification"
                        ,"product"
                        ,"component"
                        ,"priority"
                        ,"assigned_to"
                        ,"resolution"
                        ,"creation_time"
                        ,"last_change_time"
                        ,"severity"
                        ,"version"
                        ]
    ,"product" : "Core"
    # ,"status" : ["VERIFIED","RESOLVED","CLOSED","UNCONFIRMED","NEW"]
    ,"limit" : 10000
    ,"order": "opendate DESC"
}
URL = "https://bugzilla.mozilla.org/rest/bug"


In [None]:
params["status"] = "RESOLVED"
params.pop("status", None)

test_df = json_data(URL,params)
print(test_df.columns.to_list())
test_df.tail()

In [None]:
# for i in test_df.columns.to_list():
#     print("Column "+i+" unique values:")
#     # print(test_df[i].unique())
#     print(test_df[i].value_counts())

#     print()


In [None]:
test_df.groupby(['resolution']).sum().plot(kind='pie', y='id', autopct='%1.0f%%', labeldistance=None)
from datetime import datetime

print(test_df['creation_time'].min())
datetime.now() - pd.to_datetime(pd.to_datetime(test_df['creation_time'][test_df['resolution']=='FIXED']).values.astype(np.int64).mean())
plt.pie(x=test_df['resolution'].unique(), labels=test_df['resolution'].value_counts())
plt.show()

plt.title('Category Distribution')
plt.ylabel('')  # Hide the y-label
plt.show()

test_df.to_csv("dataset_20240907.csv")
test_df.date = pd.to_datetime(test_df.date).values.astype(np.int64)

test_df = pd.DataFrame(pd.to_datetime(test_df.groupby('column').mean().date))

# Checkpoint 2

In [None]:
data = pd.read_csv('dataset_20240827.csv')
data.describe

<bound method NDFrame.describe of       Unnamed: 0 resolution product         creation_time priority  \
0              0        NaN    Core  2024-08-27T07:10:15Z       P5   
1              1        NaN    Core  2024-08-27T07:09:56Z       --   
2              2        NaN    Core  2024-08-27T06:56:08Z       --   
3              3        NaN    Core  2024-08-27T06:13:19Z       P5   
4              4        NaN    Core  2024-08-27T05:29:11Z       --   
...          ...        ...     ...                   ...      ...   
9995        9995        NaN    Core  2024-03-23T18:04:58Z       --   
9996        9996        NaN    Core  2024-03-23T18:01:29Z       --   
9997        9997        NaN    Core  2024-03-23T17:57:21Z       P3   
9998        9998        NaN    Core  2024-03-23T17:56:41Z       P3   
9999        9999        NaN    Core  2024-03-23T17:55:13Z       P3   

     classification    type      last_change_time  \
0        Components  defect  2024-08-27T07:10:15Z   
1        Components

In [None]:
#import nltk
#nltk.download('punkt_tab')
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer
from nltk.stem import WordNetLemmatizer
import math
import re

def NLProcess (text):

    # print('Original text: ',data.iloc[1]['description'])
    #print("====================== starting ======================")
    ######################################################################### Tokenised
    mwe_tokenizer = MWETokenizer([('does', 'not'), ('in', 'spite', 'of'),('don', '’', 't')])
    tokenizer = word_tokenize
    tokenised = tokenizer(text)
    #print('Tokenised: ',tokenised)
    retokenised = mwe_tokenizer.tokenize(tokenised)
    # retokenised = [token.replace('_', '') for token in retokenised]
    #print('ReTokenised: ',retokenised)

    ######################################################################### Lower-case converted
    normalised = [word.lower() for word in retokenised]
    #print('Normalised: ',normalised)

    ######################################################################### Punctuation marks removed
    unmarked = [re.sub(r'[^\w\s]', '', token) for token in normalised if re.sub(r'[^\w\s]', '', token)]
    #print('Punctuation marks removed: ',unmarked)

    ######################################################################### Stop-words removed
    stop_words = set(stopwords.words('english'))
    filtered = [word for word in unmarked if word not in stop_words]
    #print('Stop-words removed: ',filtered)

    ######################################################################### Lemmatised
    lemmatizer = WordNetLemmatizer()
    lemmatised = [lemmatizer.lemmatize(word) for word in filtered]
    #print('Lemmatised: ',lemmatised)

    return lemmatised


In [None]:
# def dfNLProcess (df,column):
#     for index, row in df.iterrows():
#         text = row[column]
#         # print('Original text: ',data.iloc[1]['description'])
#         #print("====================== starting ======================")
#         ######################################################################### Tokenised
#         mwe_tokenizer = MWETokenizer([('does', 'not'), ('in', 'spite', 'of'),('don', '’', 't')])
#         tokenizer = word_tokenize
#         tokenised = tokenizer(text)
#         #print('Tokenised: ',tokenised)
#         retokenised = mwe_tokenizer.tokenize(tokenised)
#         # retokenised = [token.replace('_', '') for token in retokenised]
#         #print('ReTokenised: ',retokenised)

#         ######################################################################### Lower-case converted
#         normalised = [word.lower() for word in retokenised]
#         #print('Normalised: ',normalised)

#         ######################################################################### Punctuation marks removed
#         unmarked = [re.sub(r'[^\w\s]', '', token) for token in normalised if re.sub(r'[^\w\s]', '', token)]
#         #print('Punctuation marks removed: ',unmarked)

#         ######################################################################### Stop-words removed
#         stop_words = set(stopwords.words('english'))
#         filtered = [word for word in unmarked if word not in stop_words]
#         #print('Stop-words removed: ',filtered)

#         ######################################################################### Lemmatised
#         lemmatizer = WordNetLemmatizer()
#         lemmatised = [lemmatizer.lemmatize(word) for word in filtered]
#         #print('Lemmatised: ',lemmatised)

#         lemmatised
#         ncol = 'P'+col
#         df2 = pd.DataFrame({ncol:lemmatised})
#         df2 = dummies(df2)


In [None]:
# dfNLProcess(data,'summary')

In [None]:
# for i in range(10):
#     NLProcess(data.iloc[i]['description'])

data['Psummary'] = data['summary'].map(NLProcess)

# type(data['summary'])

In [None]:
data['Pdescription'] = data['description'].fillna('').map(NLProcess)

In [None]:
#data.loc[data['description'].str.contains('//crash-stats.mozilla.org/report/index/768c5c44-57c5-4746-890c-9af820240811', case=False, na=False)]

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,resolution,product,creation_time,priority,classification,type,last_change_time,component,summary,...,id,severity,version,assigned_to_detail.nick,assigned_to_detail.id,assigned_to_detail.name,assigned_to_detail.real_name,assigned_to_detail.email,Psummary,Pdescription
0,0,,Core,2024-08-27T07:10:15Z,P5,Components,defect,2024-08-27T07:10:15Z,Widget,Intermittent widget/tests/browser/browser_test...,...,1915086,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, widgettestsbrowserbrowser_test_...","[filed, nfay, mozillacom, parsed, log, http, t..."
1,1,,Core,2024-08-27T07:09:56Z,--,Components,defect,2024-08-27T07:12:28Z,Networking: DNS,"Bad name, functionality for network.trr.exclud...",...,1915085,--,Firefox 129,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[bad, name, functionality, networktrrexcludeet...","[user, agent, mozilla50, macintosh, intel, mac..."
2,2,,Core,2024-08-27T06:56:08Z,--,Components,task,2024-08-27T07:02:38Z,WebRTC: Audio/Video,Request sRGB colorspace from ScreenCapturerSck,...,1915082,,unspecified,pehrsons,489889,apehrson@mozilla.com,Andreas Pehrson [:pehrsons],apehrson@mozilla.com,"[request, srgb, colorspace, screencapturersck]","[screencapturekit, doc, http, developerappleco..."
3,3,,Core,2024-08-27T06:13:19Z,P5,Components,defect,2024-08-27T06:17:48Z,Panning and Zooming,Intermittent gfx/layers/apz/test/mochitest/tes...,...,1915078,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, gfxlayersapztestmochitesttest_g...","[filed, nfay, mozillacom, parsed, log, http, t..."
4,4,,Core,2024-08-27T05:29:11Z,--,Components,defect,2024-08-27T06:25:14Z,DOM: Core & HTML,Removing meta viewport tag has no effect,...,1915077,--,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[removing, meta, viewport, tag, effect]","[str, 1, open, site, meta, viewport, element, ..."


In [None]:
data.to_csv("dataset_20240907_plusP.csv")

# Checkpoint 3 Psummary

In [None]:
data = pd.read_csv('dataset_20240907_plusP.csv', converters={'Psummary': clean})
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,resolution,product,creation_time,priority,classification,type,last_change_time,component,...,id,severity,version,assigned_to_detail.nick,assigned_to_detail.id,assigned_to_detail.name,assigned_to_detail.real_name,assigned_to_detail.email,Psummary,Pdescription
0,0,0,,Core,2024-08-27T07:10:15Z,P5,Components,defect,2024-08-27T07:10:15Z,Widget,...,1915086,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, widgettestsbrowserbrowser_test_...","['filed', 'nfay', 'mozillacom', 'parsed', 'log..."
1,1,1,,Core,2024-08-27T07:09:56Z,--,Components,defect,2024-08-27T07:12:28Z,Networking: DNS,...,1915085,--,Firefox 129,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[bad, name, functionality, networktrrexcludeet...","['user', 'agent', 'mozilla50', 'macintosh', 'i..."
2,2,2,,Core,2024-08-27T06:56:08Z,--,Components,task,2024-08-27T07:02:38Z,WebRTC: Audio/Video,...,1915082,,unspecified,pehrsons,489889,apehrson@mozilla.com,Andreas Pehrson [:pehrsons],apehrson@mozilla.com,"[request, srgb, colorspace, screencapturersck]","['screencapturekit', 'doc', 'http', 'developer..."
3,3,3,,Core,2024-08-27T06:13:19Z,P5,Components,defect,2024-08-27T06:17:48Z,Panning and Zooming,...,1915078,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, gfxlayersapztestmochitesttest_g...","['filed', 'nfay', 'mozillacom', 'parsed', 'log..."
4,4,4,,Core,2024-08-27T05:29:11Z,--,Components,defect,2024-08-27T06:25:14Z,DOM: Core & HTML,...,1915077,--,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[removing, meta, viewport, tag, effect]","['str', '1', 'open', 'site', 'meta', 'viewport..."


In [None]:
data.drop(axis=1, columns=['Unnamed: 0.1','Unnamed: 0'], inplace=True)
data.head()

Unnamed: 0,resolution,product,creation_time,priority,classification,type,last_change_time,component,summary,description,...,id,severity,version,assigned_to_detail.nick,assigned_to_detail.id,assigned_to_detail.name,assigned_to_detail.real_name,assigned_to_detail.email,Psummary,Pdescription
0,,Core,2024-08-27T07:10:15Z,P5,Components,defect,2024-08-27T07:10:15Z,Widget,Intermittent widget/tests/browser/browser_test...,**Filed by:** nfay [at] mozilla.com\n**Parsed ...,...,1915086,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, widgettestsbrowserbrowser_test_...","['filed', 'nfay', 'mozillacom', 'parsed', 'log..."
1,,Core,2024-08-27T07:09:56Z,--,Components,defect,2024-08-27T07:12:28Z,Networking: DNS,"Bad name, functionality for network.trr.exclud...",User Agent: Mozilla/5.0 (Macintosh; Intel Mac ...,...,1915085,--,Firefox 129,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[bad, name, functionality, networktrrexcludeet...","['user', 'agent', 'mozilla50', 'macintosh', 'i..."
2,,Core,2024-08-27T06:56:08Z,--,Components,task,2024-08-27T07:02:38Z,WebRTC: Audio/Video,Request sRGB colorspace from ScreenCapturerSck,[The ScreenCaptureKit docs](https://developer....,...,1915082,,unspecified,pehrsons,489889,apehrson@mozilla.com,Andreas Pehrson [:pehrsons],apehrson@mozilla.com,"[request, srgb, colorspace, screencapturersck]","['screencapturekit', 'doc', 'http', 'developer..."
3,,Core,2024-08-27T06:13:19Z,P5,Components,defect,2024-08-27T06:17:48Z,Panning and Zooming,Intermittent gfx/layers/apz/test/mochitest/tes...,**Filed by:** nfay [at] mozilla.com\n**Parsed ...,...,1915078,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, gfxlayersapztestmochitesttest_g...","['filed', 'nfay', 'mozillacom', 'parsed', 'log..."
4,,Core,2024-08-27T05:29:11Z,--,Components,defect,2024-08-27T06:25:14Z,DOM: Core & HTML,Removing meta viewport tag has no effect,STR;\n\n1. Open any sites having a meta viewpo...,...,1915077,--,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[removing, meta, viewport, tag, effect]","['str', '1', 'open', 'site', 'meta', 'viewport..."


In [None]:
# from sklearn.feature_selection import mutual_info_classif

In [None]:
type(data.iloc[0]['Psummary'])
# type(data['Psummary'])

list

In [None]:
exploded = data.explode('Psummary')

In [None]:
filter='^\d+$'
notfilter='^(?!'+filter+').*$'
exploded_filtered = exploded.loc[exploded['Psummary'].str.contains(notfilter)]

In [None]:
checkit = just_dummies(exploded_filtered,'Psummary')

In [None]:
checkit.index.name = 'index'

In [None]:
checkit = dd.from_pandas(checkit, chunksize=100)

In [None]:
checkit.head()

Unnamed: 0_level_0,Psummary_00f,Psummary_01f,Psummary_04347e67c6f87ee7a33c8ed8103aecebac6c3888,Psummary_0a1,Psummary_0async,Psummary_0cad754da2ee3ececcfa1aad8b858a0286c24e16,Psummary_0dc559f060db0d62d95f424e3fd26a5f673b2f6b,Psummary_0e30966b198ad28943799eaf5b3b08100b6f70c3,Psummary_0kb,Psummary_0px,...,Psummary_zoomteams,Psummary_zoomus,Psummary_zstd,Psummary_zstd_dctx_setparameter,Psummary_zwp_tablet_tool_v2_set_cursor,Psummary_zwsetevent,Psummary_zwusermsgwaitformultipleobjectsex,Psummary_zydis,Psummary_ñ,Psummary_㜱㜸㤱㠰㤴㜶㔰㔰
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# prefix='Psummary_'
# filter=prefix+'\d+$'
# notfilter='^(?!'+filter+').*$'
# print(len(checkit.filter(regex=(notfilter)).columns.tolist()))
# # print(checkit.filter(regex=(notfilter)).columns.tolist())

In [None]:
checkot = checkit.groupby(['index'], observed=True).sum()

In [None]:
checkot['nindex'] = checkot.index

In [None]:
checkot.head()

Unnamed: 0_level_0,Psummary_00f,Psummary_01f,Psummary_04347e67c6f87ee7a33c8ed8103aecebac6c3888,Psummary_0a1,Psummary_0async,Psummary_0cad754da2ee3ececcfa1aad8b858a0286c24e16,Psummary_0dc559f060db0d62d95f424e3fd26a5f673b2f6b,Psummary_0e30966b198ad28943799eaf5b3b08100b6f70c3,Psummary_0kb,Psummary_0px,...,Psummary_zoomus,Psummary_zstd,Psummary_zstd_dctx_setparameter,Psummary_zwp_tablet_tool_v2_set_cursor,Psummary_zwsetevent,Psummary_zwusermsgwaitformultipleobjectsex,Psummary_zydis,Psummary_ñ,Psummary_㜱㜸㤱㠰㤴㜶㔰㔰,nindex
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2733,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2733
2786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2786
8923,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8923
8985,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8985
1775,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1775


In [None]:
checket = checkot.compute()

KeyboardInterrupt: 

In [None]:
checkit = checket.reset_index()

In [None]:
checkit.head()

In [None]:
# def rowgroupbyidnex(df):
#     rows = []
#     for i in len(df):
#         rows.append(df.iloc[i].groupby(checkit.index, observed=True).sum())





In [None]:
checkit.to_csv('dataset_20240907_plusP_Psummary.csv')

# Checkpoint 4 Out of order

In [None]:
data = pd.read_csv('dataset_20240907_plusP.csv', converters={'Pdescription': clean,'Psummary': clean})

In [None]:
exploded = data.explode('Pdescription')
filter='^\d+$'
notfilter='^(?!'+filter+').*$'

exploded = exploded.loc[exploded['Pdescription'].notna()]

exploded_filtered = exploded.loc[exploded['Pdescription'].str.contains(notfilter)]

In [None]:
exploded_filtered['nindex'] = exploded_filtered.index

In [None]:
exploded_filtered = exploded_filtered.reset_index()

In [None]:
checkit = just_dummies_steps(exploded_filtered[['nindex','Pdescription']],'Pdescription',1000)

In [None]:
checkit = dd.from_pandas(checkit, chunksize=100)

In [None]:
checkot = checkit.groupby([checkit.index], observed=True).sum()
checkot['nindex'] = checkot.index
checket = checkot.compute()
checkit = checket.reset_index()
checkit.to_csv('dataset_20240907_plusP_Pdescription.csv')

# Checkpoint 5 TFIDF

In [None]:
data = pd.read_csv('dataset_20240907_plusP.csv', converters={'Pdescription': clean,'Psummary': clean})

KeyboardInterrupt: 

In [None]:
summary = pd.read_csv('dataset_20240907_plusP_Psummary.csv')

In [None]:
summary.drop(axis=1, columns=['Unnamed: 0','nindex','index'], inplace=True)
summary.head()

In [None]:
data['Psummary']=data['Psummary'].tolist()

In [None]:
data['Pdescription']=data['Pdescription'].apply(lambda x: ' '.join(x))
data['Psummary']=data['Psummary'].apply(lambda x: ' '.join(x))

In [None]:
summary.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(df,col):
  vectorizer = TfidfVectorizer()
  vectors = vectorizer.fit_transform(df[col])

  # Select the first five documents from the data set
  tf_idf = pd.DataFrame(vectors.todense())#.iloc[:5]
  tf_idf.columns = vectorizer.get_feature_names_out()
  tfidf_matrix = tf_idf.T
  tfidf_matrix.columns = ['bug'+ str(i) for i in range(1, 10001)]
  tfidf_matrix['count'] = tfidf_matrix.sum(axis=1)

  # Top words
  tfidf_matrix = tfidf_matrix.sort_values(by ='count', ascending=False)

  # Print the first 10 words
  # print(tfidf_matrix.drop(columns=['count']).head(10))

  return tfidf_matrix


In [None]:
# tfidf_Pdescription = tfidf(data,'Pdescription')
tfidf_Psummary = tfidf(data,'Psummary')
over1 = tfidf_Psummary.loc[tfidf_Psummary['count']>10]

In [None]:
filter='^\d+$'
notfilter='^(?!'+filter+').*$'
wordlst = over1.loc[over1.index.str.contains(notfilter)]

In [None]:
wordlst['words'] = 'Psummary_' + wordlst.index.astype(str)
wordlst.head()

In [None]:
summary.head()

In [None]:
summary = summary[wordlst['words']]
summary.head()

In [None]:
summary.tail()

In [None]:
mergeddata = pd.concat([data,summary],axis=1)

mergeddata.to_csv('dataset_20240907_plusP_Psummary_tfidf_10.csv')

# Checkpoint 6

In [55]:
mergeddata = pd.read_csv('dataset_20240907_plusP_Psummary_tfidf_10.csv')

In [56]:
mergeddata.head()
mergeddata['resolution'] = mergeddata['resolution'].fillna('')
mergeddata['severity'] = mergeddata['severity'].fillna('SN/A')
# mergeddata['resolution'] = mergeddata['resolution'].astype(str)

In [57]:
mergeddata.drop([
'Unnamed: 0.2',
'Unnamed: 0.1',
'Unnamed: 0',
'Unnamed: 0.2',
'Unnamed: 0.1',
'Unnamed: 0',
'summary',
'description',
'Psummary',
'Pdescription',
'assigned_to_detail.real_name'
],axis=1,inplace=True)

mergeddata.head()

Unnamed: 0,resolution,product,creation_time,priority,classification,type,last_change_time,component,status,assigned_to,...,Psummary_doc,Psummary_shadow,Psummary_specific,Psummary_exit,Psummary_unexpected,Psummary_wb,Psummary_cpu,Psummary_wasm,Psummary_webrender,Psummary_start
0,,Core,2024-08-27T07:10:15Z,P5,Components,defect,2024-08-27T07:10:15Z,Widget,NEW,nobody@mozilla.org,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,Core,2024-08-27T07:09:56Z,--,Components,defect,2024-08-27T07:12:28Z,Networking: DNS,UNCONFIRMED,nobody@mozilla.org,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,Core,2024-08-27T06:56:08Z,--,Components,task,2024-08-27T07:02:38Z,WebRTC: Audio/Video,ASSIGNED,apehrson@mozilla.com,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,Core,2024-08-27T06:13:19Z,P5,Components,defect,2024-08-27T06:17:48Z,Panning and Zooming,NEW,nobody@mozilla.org,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,Core,2024-08-27T05:29:11Z,--,Components,defect,2024-08-27T06:25:14Z,DOM: Core & HTML,NEW,nobody@mozilla.org,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
mergeddata = mergeddata[mergeddata['priority']!= '--']
# mergeddata.loc[mergeddata['priority']=='--'].head()


mergeddata = mergeddata[mergeddata['severity']!= '--']
mergeddata['severity'].replace('normal','S3')
# mergeddata.loc[mergeddata['severity']=='--'].head()

len(mergeddata)

4110

In [59]:
mergeddata['severity'].unique()

array(['S4', 'SN/A', 'S3', 'S2', 'S1'], dtype=object)

In [60]:
tipos = mergeddata.dtypes
# tipos['columna']=tipos.index
# tipos.rename({'0':'tipo'})
tipos = tipos.to_frame()

# columnas = tipos.loc[tipos['tipo']=='object'].index
columnas = tipos.loc[tipos[0]!='float'].index.to_list()
columnas

for columna in columnas:
    print(columna)
    print(mergeddata[columna].apply(type).unique())
    print('--------------------------------------------')

resolution
[<class 'str'>]
--------------------------------------------
product
[<class 'str'>]
--------------------------------------------
creation_time
[<class 'str'>]
--------------------------------------------
priority
[<class 'str'>]
--------------------------------------------
classification
[<class 'str'>]
--------------------------------------------
type
[<class 'str'>]
--------------------------------------------
last_change_time
[<class 'str'>]
--------------------------------------------
component
[<class 'str'>]
--------------------------------------------
status
[<class 'str'>]
--------------------------------------------
assigned_to
[<class 'str'>]
--------------------------------------------
id
[<class 'int'>]
--------------------------------------------
severity
[<class 'str'>]
--------------------------------------------
version
[<class 'str'>]
--------------------------------------------
assigned_to_detail.nick
[<class 'str'>]
---------------------------------------

# Prepare

In [61]:
mergeddata = dummies(mergeddata,'resolution')
mergeddata = dummies(mergeddata,'product')
mergeddata, lemodel = lencoder(mergeddata,'priority')
mergeddata = dummies(mergeddata,'classification')
mergeddata = dummies(mergeddata,'type')
mergeddata = dummies(mergeddata,'component')
mergeddata = dummies(mergeddata,'assigned_to')
mergeddata, lemodel = lencoder(mergeddata,'severity')
mergeddata, lemodel = lencoder(mergeddata,'version')
mergeddata = dummies(mergeddata,'assigned_to_detail.nick')
mergeddata = dummies(mergeddata,'assigned_to_detail.id')
mergeddata = dummies(mergeddata,'assigned_to_detail.name')
mergeddata = dummies(mergeddata,'assigned_to_detail.email')
mergeddata['creation_time'] = pd.to_datetime(mergeddata['creation_time'])
mergeddata['last_change_time'] = pd.to_datetime(mergeddata['last_change_time'])

In [62]:
def columns_with_nan(df):
    # Obtener una lista de los nombres de las columnas
    columns = df.columns.to_list()

    # Filtrar las columnas que tienen valores NaN
    columns_with_nan = [col for col in columns if df[col].isna().any()]

    return columns_with_nan

In [63]:
columns_with_nan(mergeddata)

['Psummary_intermittent',
 'Psummary_bug',
 'Psummary_tracking',
 'Psummary_single',
 'Psummary_sync',
 'Psummary_pr',
 'Psummary_wptsync',
 'Psummary_test',
 'Psummary_wpt',
 'Psummary_tier',
 'Psummary_failure',
 'Psummary_add',
 'Psummary_crash',
 'Psummary_firefox',
 'Psummary_code',
 'Psummary_new',
 'Psummary_remove',
 'Psummary_fix',
 'Psummary_update',
 'Psummary_failing',
 'Psummary_args',
 'Psummary_assertion',
 'Psummary_nt',
 'Psummary_http',
 'Psummary_implement',
 'Psummary_support',
 'Psummary_unknown',
 'Psummary_window',
 'Psummary_mozilla',
 'Psummary_use',
 'Psummary_video',
 'Psummary_page',
 'Psummary_error',
 'Psummary_set',
 'Psummary_meta',
 'Psummary_text',
 'Psummary_perma',
 'Psummary_using',
 'Psummary_element',
 'Psummary_file',
 'Psummary_work',
 'Psummary_does_not',
 'Psummary_event',
 'Psummary_etp',
 'Psummary_export',
 'Psummary_strict',
 'Psummary_webkit',
 'Psummary_enable',
 'Psummary_make',
 'Psummary_dom',
 'Psummary_bugswebkitorgshow_bugcgi',
 'P

In [64]:
def rows_with_nan(df, column_name):
    # Filtrar las filas donde la columna especificada tiene valores NaN
    rows_with_nan = df[df[column_name].isna()]
    return rows_with_nan
rows_with_nan(mergeddata,'Psummary_intermittent')
print(rows_with_nan(mergeddata,'Psummary_intermittent').index)
mergeddata.drop(index=9999, inplace=True)

Index([9999], dtype='int64')


# PyCaret

In [None]:
from pycaret.classification import *

dataset = mergeddata

data = dataset.drop(['priority'
,'resolution'
,'product'
,'creation_time'
,'classification'
,'type'
,'last_change_time'
,'component'
,'status'
,'assigned_to'
,'assigned_to_detail.nick'
,'assigned_to_detail.name'
,'assigned_to_detail.email'
                ],axis=1)

clf1 = setup(data, target='severity')

# Compare models
best_model = compare_models()

ModuleNotFoundError: No module named 'pycaret'

In [None]:
tuned_model = tune_model(best_model)

NameError: name 'tune_model' is not defined

In [None]:
evaluate_model(tuned_model)

# XGBoost

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, cohen_kappa_score, matthews_corrcoef

def scores(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Recall
    recall = recall_score(y_test, y_pred,average='weighted')
    print("Recall:", recall)

    # Precision
    precision = precision_score(y_test, y_pred,average='weighted')
    print("Precision:", precision)

    # F1 Score
    f1 = f1_score(y_test, y_pred,average='weighted')
    print("F1 Score:", f1)

    # Cohen's Kappa
    kappa = cohen_kappa_score(y_test, y_pred)
    print("Cohen's Kappa:", kappa)

    # Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(y_test, y_pred)
    print("MCC:", mcc)


In [None]:
def run_kfold_model (model, X,y,K=10,graph=True):
    regr = LinearRegression()
    kf = KFold(n_splits=5)
    s=1
    for train_index, test_index in kf.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        print('Slice '+str(s)+' score: '+str(regr.score(X_test, y_test)))
        y_pred = model.predict(X_test)
        scores(y_test, y_pred)
        if graph:
            plt.scatter(y_test, y_pred, color = 'red', marker = 'o', s = 35, alpha = 0.5, label = 'Test data')
            plt.show()
        s+=1
    return(regr)

In [None]:
from numpy import loadtxt
import xgboost as xgb
from xgboost import DMatrix, XGBClassifier

dataset = mergeddata

X = dataset.drop(['severity','priority'
,'resolution'
,'product'
,'creation_time'
,'classification'
,'type'
,'last_change_time'
,'component'
,'status'
,'assigned_to'
,'assigned_to_detail.nick'
,'assigned_to_detail.name'
,'assigned_to_detail.email'
                ],axis=1)

Y = dataset['severity']

# split data into train and test sets
seed = 7
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# dtrain = DMatrix(X_train, label=y_train, enable_categorical=True)

# params = {
#     # 'objective': 'binary:logistic',
#     'objective': 'multi:softmax',
#     'num_class': 4,
#     'tree_method': 'hist'  # 'hist' or 'approx' are recommended for categorical data
# }

# fit model on training data
model = XGBClassifier(objective= 'multi:softmax', num_class= 4)
model.fit(X_train, y_train)
# model = xgb.train(params, dtrain, num_boost_round=10)

# make predictions for test data
# dpredict = xgb.DMatrix(X_test, enable_categorical=True)

# y_pred = model.predict(dpredict)


# predictions = [round(value) for value in y_pred]

# scores(model, X_train, X_test, y_train, y_test)

# run_kfold_model(model, X_train, y_train)

y_pred = model.predict(X_test)
scores(y_test, y_pred)

Accuracy: 0.8572587185725872
Recall: 0.8572587185725872
Precision: 0.8462048189113937
F1 Score: 0.8503490062200572
Cohen's Kappa: 0.7705153801901797
MCC: 0.7711123744607169


# PyCaret top 7

In [65]:
from pycaret.classification import *

dataset = mergeddata

data = dataset.drop(['priority'
,'resolution'
,'product'
,'creation_time'
,'classification'
,'type'
,'last_change_time'
,'component'
,'status'
,'assigned_to'
,'assigned_to_detail.nick'
,'assigned_to_detail.name'
,'assigned_to_detail.email'
                ],axis=1)

clf1 = setup(data, target='severity')

model = [create_model(i) for i in [
    'rf',
    'gbc',
    'et',
    'xgboost',
    'catboost',
    'lightgbm',
    'dt']]

for i in (range(7)):
    print(i)
    model[i].get_params()


Unnamed: 0,Description,Value
0,Session id,8853
1,Target,severity
2,Target type,Multiclass
3,Original data shape,"(4109, 1389)"
4,Transformed data shape,"(4109, 1389)"
5,Transformed train set shape,"(2876, 1389)"
6,Transformed test set shape,"(1233, 1389)"
7,Numeric features,337
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8438,0.0,0.8438,0.8383,0.8391,0.7502,0.7525
1,0.8715,0.0,0.8715,0.8562,0.8626,0.7921,0.7932
2,0.8507,0.0,0.8507,0.8413,0.8428,0.762,0.7656
3,0.8507,0.9522,0.8507,0.84,0.8429,0.7616,0.7638
4,0.8646,0.9571,0.8646,0.8617,0.8565,0.7819,0.7852
5,0.8785,0.9615,0.8785,0.883,0.8719,0.8052,0.8065
6,0.8676,0.0,0.8676,0.8452,0.8549,0.7839,0.7856
7,0.8293,0.0,0.8293,0.8257,0.8237,0.73,0.7336
8,0.8362,0.0,0.8362,0.8236,0.8285,0.7368,0.7386
9,0.8328,0.0,0.8328,0.8262,0.8271,0.7338,0.7361


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8472,0.0,0.8472,0.8539,0.8467,0.7564,0.7588
1,0.8646,0.0,0.8646,0.8562,0.8575,0.7831,0.7851
2,0.8438,0.0,0.8438,0.8405,0.8389,0.7523,0.7556
3,0.8646,0.0,0.8646,0.8506,0.8551,0.7833,0.7854
4,0.8472,0.0,0.8472,0.8479,0.8417,0.756,0.7597
5,0.8611,0.0,0.8611,0.8552,0.8544,0.7778,0.7788
6,0.8641,0.0,0.8641,0.8459,0.8546,0.779,0.7796
7,0.8188,0.0,0.8188,0.8245,0.8165,0.7161,0.7209
8,0.8746,0.0,0.8746,0.8615,0.867,0.7985,0.7993
9,0.8467,0.0,0.8467,0.8416,0.8411,0.7569,0.7594


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8507,0.0,0.8507,0.8545,0.851,0.7619,0.7634
1,0.8576,0.0,0.8576,0.8417,0.8488,0.7688,0.7696
2,0.8299,0.0,0.8299,0.8229,0.8245,0.7284,0.7304
3,0.8542,0.9419,0.8542,0.8389,0.8444,0.7658,0.7675
4,0.8333,0.9527,0.8333,0.8266,0.8221,0.729,0.7345
5,0.8854,0.9593,0.8854,0.8869,0.8804,0.8162,0.817
6,0.8711,0.0,0.8711,0.8514,0.8599,0.7887,0.7905
7,0.8293,0.0,0.8293,0.8206,0.8221,0.7287,0.7311
8,0.8432,0.0,0.8432,0.8299,0.8363,0.7468,0.7472
9,0.8014,0.0,0.8014,0.7992,0.7996,0.6825,0.6832


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8229,0.0,0.8229,0.815,0.8173,0.7147,0.7167
1,0.8854,0.0,0.8854,0.875,0.8785,0.8143,0.8157
2,0.8576,0.0,0.8576,0.8464,0.8495,0.7723,0.7753
3,0.8611,0.9573,0.8611,0.8459,0.8523,0.7765,0.7779
4,0.8542,0.9616,0.8542,0.8492,0.8474,0.766,0.7688
5,0.8576,0.9668,0.8576,0.8495,0.8521,0.7717,0.7722
6,0.8606,0.0,0.8606,0.8382,0.849,0.7715,0.7725
7,0.8188,0.0,0.8188,0.829,0.8144,0.7106,0.7122
8,0.8537,0.0,0.8537,0.8527,0.8525,0.7659,0.7667
9,0.8432,0.0,0.8432,0.8465,0.8422,0.7523,0.7545


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8542,0.0,0.8542,0.8408,0.8461,0.7641,0.766
1,0.8681,0.0,0.8681,0.8543,0.8593,0.7871,0.7885
2,0.8507,0.0,0.8507,0.8412,0.8425,0.7626,0.7661
3,0.8507,0.9571,0.8507,0.839,0.8431,0.7609,0.7623
4,0.8576,0.9595,0.8576,0.8414,0.8453,0.7713,0.7747
5,0.8785,0.965,0.8785,0.882,0.8711,0.8053,0.8065
6,0.8537,0.0,0.8537,0.8327,0.8417,0.762,0.7634
7,0.8502,0.0,0.8502,0.8611,0.8453,0.7614,0.7636
8,0.8676,0.0,0.8676,0.849,0.8574,0.7864,0.7874
9,0.8467,0.0,0.8467,0.8385,0.8387,0.7566,0.76


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.816,0.0,0.816,0.8128,0.8115,0.7036,0.7071
1,0.8542,0.0,0.8542,0.8441,0.8473,0.7618,0.7628
2,0.8646,0.0,0.8646,0.861,0.8599,0.7816,0.7822
3,0.8299,0.9518,0.8299,0.8107,0.8192,0.7246,0.7255
4,0.8611,0.9646,0.8611,0.8479,0.8527,0.7765,0.7778
5,0.8646,0.9639,0.8646,0.8576,0.858,0.7824,0.783
6,0.878,0.0,0.878,0.8569,0.8673,0.8004,0.8011
7,0.8223,0.0,0.8223,0.8129,0.8154,0.716,0.7184
8,0.8432,0.0,0.8432,0.8426,0.8409,0.749,0.7502
9,0.8223,0.0,0.8223,0.8224,0.8213,0.7171,0.718


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.809,0.0,0.809,0.8151,0.8117,0.6929,0.6931
1,0.8403,0.0,0.8403,0.8315,0.835,0.7402,0.7411
2,0.809,0.0,0.809,0.8164,0.8117,0.6954,0.6965
3,0.816,0.8655,0.816,0.8156,0.8152,0.7053,0.7058
4,0.8229,0.8656,0.8229,0.8207,0.8155,0.7133,0.718
5,0.8299,0.8694,0.8299,0.8203,0.8238,0.7259,0.7262
6,0.8328,0.0,0.8328,0.8276,0.8279,0.7274,0.7295
7,0.8293,0.0,0.8293,0.8279,0.8272,0.7262,0.7267
8,0.8293,0.0,0.8293,0.8209,0.8251,0.7233,0.7235
9,0.7979,0.0,0.7979,0.8,0.7973,0.6783,0.6795


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

0
1
2
3
4
5
6


In [66]:
import pickle as pk
i=0
for m in [
          'rf',
          'gbc',
          'et',
          'xgboost',
          'catboost',
          'lightgbm',
          'dt']:
  print(model[i].get_params())
  name=m+'.pkl'
  with open(name, "wb") as file: # file is a variable for storing the newly created file, it can be anything.
    pk.dump(model, file) # Dump function is used to write the object into the created file in byte format.
  i+=1

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 8853, 'verbose': 0, 'warm_start': False}
{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 8853, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


# PyCaret Optimise

In [67]:
tuned_model = tune_model(model[3])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8507,0.0,0.8507,0.8392,0.843,0.7603,0.7628
1,0.8542,0.0,0.8542,0.8426,0.8456,0.7657,0.7678
2,0.8438,0.0,0.8438,0.8336,0.8354,0.7514,0.7548
3,0.8681,0.9543,0.8681,0.8584,0.8597,0.7902,0.7935
4,0.8542,0.9521,0.8542,0.8389,0.8424,0.7657,0.7691
5,0.8681,0.9567,0.8681,0.8497,0.857,0.7892,0.791
6,0.8537,0.0,0.8537,0.8324,0.8417,0.7623,0.7635
7,0.8188,0.0,0.8188,0.8407,0.8164,0.7155,0.7206
8,0.8571,0.0,0.8571,0.8413,0.848,0.7704,0.7717
9,0.8397,0.0,0.8397,0.8315,0.8316,0.7454,0.7489


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [69]:
tuned_model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 6,
 'min_child_weight': 0.001,
 'min_split_gain': 0.6,
 'n_estimators': 40,
 'n_jobs': -1,
 'num_leaves': 6,
 'objective': None,
 'random_state': 8853,
 'reg_alpha': 0.3,
 'reg_lambda': 0.0005,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'feature_fraction': 0.5,
 'bagging_freq': 2,
 'bagging_fraction': 1.0}

In [68]:
# evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# CUDA

In [None]:
# Remove existing CUDA installation (use with caution)
!apt-get --purge remove nvidia*

# Download and install CUDA 11
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
!mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
!apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
!apt-get update
!apt-get -y install cuda-11-0

# Verify CUDA 11 installation
!nvcc --version

# Install cuDF 23.08 (compatible with CUDA 11)
!pip install cudf-cu11==23.08 --extra-index-url=https://pypi.nvidia.com

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'nvidia-driver-550-server' for glob 'nvidia*'
Note, selecting 'nvidia-firmware-535-535.154.05' for glob 'nvidia*'
Note, selecting 'nvidia-docker2' for glob 'nvidia*'
Note, selecting 'nvidia-firmware-560-server-560.28.03' for glob 'nvidia*'
Note, selecting 'nvidia-cuda-toolkit-doc' for glob 'nvidia*'
Note, selecting 'nvidia-imex' for glob 'nvidia*'
Note, selecting 'nvidia-dkms-450-server' for glob 'nvidia*'
Note, selecting 'nvidia-firmware-535-server-535.154.05' for glob 'nvidia*'
Note, selecting 'nvidia-headless-390' for glob 'nvidia*'
Note, selecting 'nvidia-cuda-toolkit-gcc' for glob 'nvidia*'
Note, selecting 'nvidia-headless-418' for glob 'nvidia*'
Note, selecting 'nvidia-headless-430' for glob 'nvidia*'
Note, selecting 'nvidia-headless-435' for glob 'nvidia*'
Note, selecting 'nvidia-headless-440' for glob 'nvidia*'
Note, selecting 'nvidia-headless-450' for glob 'nvidia*'

In [None]:
!nvidia-smi
!nvcc --version

Wed Oct  2 02:44:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0              31W /  70W |    103MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!pip install --upgrade pip

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
[0mInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2


In [None]:
!ls -la /usr/local/lib/python3.10/dist-packages | grep -i ylibraft
!rm -rf /usr/local/lib/python3.10/dist-packages/-ylibraft-cu11*

drwxr-xr-x   9 root root     4096 Oct  2 02:20 pylibraft
drwxr-xr-x   2 root root     4096 Oct  2 02:20 pylibraft_cu11-23.8.0.dist-info
drwxr-xr-x   2 root root     4096 Oct  2 02:20 pylibraft_cu11.libs
drwxr-xr-x   3 root root     4096 Oct  2 02:06 pylibraft_cu12-24.8.1.dist-info
drwxr-xr-x   2 root root     4096 Oct  2 02:06 pylibraft_cu12.libs
drwxr-xr-x  10 root root     4096 Oct  2 02:06 ~ylibraft
drwxr-xr-x   3 root root     4096 Oct  2 01:12 ~ylibraft_cu11-24.8.1.dist-info
drwxr-xr-x   2 root root     4096 Oct  2 01:12 ~ylibraft_cu11.libs


In [None]:
!pip install cudf-cu11==23.08
!pip install cuml-cu11==23.08

[0m

In [None]:
!pip install cudf-cu11==23.08 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu11==23.08 --extra-index-url=https://pypi.nvidia.com

[0mLooking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
[0mLooking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cuml-cu11==23.08
  Using cached https://pypi.nvidia.com/cuml-cu11/cuml_cu11-23.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1081.6 MB)
Collecting dask-cuda==23.8.* (from cuml-cu11==23.08)
  Using cached dask_cuda-23.8.0-py3-none-any.whl.metadata (2.3 kB)
Collecting dask-cudf-cu11==23.8.* (from cuml-cu11==23.08)
  Using cached https://pypi.nvidia.com/dask-cudf-cu11/dask_cudf_cu11-23.8.0-py3-none-any.whl (81 kB)
Collecting distributed==2023.7.1 (from cuml-cu11==23.08)
  Using cached distributed-2023.7.1-py3-none-any.whl.metadata (3.3 kB)
Collecting raft-dask-cu11==23.8.* (from cuml-cu11==23.08)
  Using cached https://pypi.nvidia.com/raft-dask-cu11/raft_dask_cu11-23.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (214.7 MB)
Collecting pylibraft-cu11==23.8.* (from raft-dask-cu11==23.8.*->cuml-cu11==23.

In [None]:
import os
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64:' + os.environ.get('LD_LIBRARY_PATH', '')

In [None]:
import cudf
import cuml

mergeddata = cudf.from_pandas(mergeddata)

from cuml.model_selection import GridSearchCV

ImportError: libcublas.so.11: cannot open shared object file: No such file or directory

# Optimising

In [None]:
dataset = mergeddata

X = dataset.drop(['severity','priority'
,'resolution'
,'product'
,'creation_time'
,'classification'
,'type'
,'last_change_time'
,'component'
,'status'
,'assigned_to'
,'assigned_to_detail.nick'
,'assigned_to_detail.name'
,'assigned_to_detail.email'
                ],axis=1)

Y = dataset['severity']

# split data into train and test sets
seed = 7
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import pickle

# model[3].save_model('xgb_model.model')

# modelxgb = cuml.xgboost.XGBClassifier()

# modelxgb.load_model('xgb_model.model')

# with open('xgboost.pkl', 'rb') as f:
#   modelxgb = pickle.load(f)

modelxgb = model[0]

param_grid = {
    # 'max_depth': [3, 4, 5, 10],
    'max_depth': [3],
    # 'subsample': [0.1, 0.6, 0.8, 1.0]
    'subsample': [1.0],
    # 'colsample_bylevel': [0.6, 0.8, 1.0]
    'colsample_bylevel': [0.6],
    # 'colsample_bytree': [0.6, 0.8, 1.0]
    'colsample_bytree': [0.8],
    # 'min_child_weight': [1, 5, 100]
    'min_child_weight': [0.5],
    # 'reg_alpha': [1, 5, 100],
    # 'reg_lambda': [1, 5, 100]
    'reg_alpha': [1],
    'reg_lambda': [5],
    'gamma': [0.1],
    # 'gamma': [0, 0.1, 0.2, 0.3]
    # 'n_estimators': [50, 100, 250, 500],
    # 'learning_rate': [0.01, 0.2, 0.5]
    'n_estimators': [250],
    'learning_rate': [0.2],
    'scale_pos_weight': [1],
    # 'scale_pos_weight': [1, 5, 10],
    'max_delta_step': [5],
    'tree_method': ['hist']

}

# Set up the GridSearchCV
grid_search = GridSearchCV(
    estimator=modelxgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and evaluate the model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Best Parameters: {best_params}')
print(f'Best Model Accuracy: {accuracy:.2f}')

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best Parameters: {'colsample_bylevel': 0.6, 'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.2, 'max_delta_step': 5, 'max_depth': 3, 'min_child_weight': 0.5, 'n_estimators': 250, 'reg_alpha': 1, 'reg_lambda': 5, 'scale_pos_weight': 1, 'subsample': 1.0, 'tree_method': 'hist'}
Best Model Accuracy: 0.85


# Other balancing techniques

In [None]:
# !pip install imbalanced-learn==0.11.0

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

In [None]:
mergeddata['severity'].value_counts()


Unnamed: 0_level_0,count
severity,Unnamed: 1_level_1
3,2122
2,1185
4,700
1,98
0,4


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

dataset = mergeddata.drop(mergeddata[mergeddata['severity'] == 0].index)

# Assuming 'severity' is your target variable
mapping = {1: 0, 2: 1, 3: 2, 4: 3}  # Create a mapping dictionary
dataset['severity'] = dataset['severity'].map(mapping)

X = dataset.drop(['severity','priority'
,'resolution'
,'product'
,'creation_time'
,'classification'
,'type'
,'last_change_time'
,'component'
,'status'
,'assigned_to'
,'assigned_to_detail.nick'
,'assigned_to_detail.name'
,'assigned_to_detail.email'
                ],axis=1)

Y = dataset['severity']

# split data into train and test sets
seed = 7
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

params = {
    'objective': 'multi:softmax',
    'max_depth': 3,
    'subsample': 1.0,
    'colsample_bylevel': 0.6,
    'colsample_bytree': 0.8,
    'min_child_weight': 0.5,
    'reg_alpha': 1,
    'reg_lambda': 5,
    'gamma': 0.1,
    'n_estimators': 250,
    'learning_rate': 0.2,
    'scale_pos_weight': 1,
    'max_delta_step': 5,
    'tree_method': 'hist',
    'random_state': 42

}

modelxgb = XGBClassifier(params)

# Resampling techniques
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)
print('ROS')
modelxgb.fit(X_ros, y_ros) # Use resampled data
y_pred = modelxgb.predict(X_test)
print(accuracy_score(y_test, y_pred))

rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)
print('RUS')
modelxgb.fit(X_rus, y_rus) # Use resampled data
y_pred = modelxgb.predict(X_test)
print(accuracy_score(y_test, y_pred))

# smote = SMOTE(random_state=42, k_neighbors=2)
# X_smote, y_smote = smote.fit_resample(X_train, y_train)
# print('SMOTE')
# modelxgb.fit(X_smote, y_smote) # Use resampled data
# y_pred = modelxgb.predict(X_test)
# print(accuracy_score(y_test, y_pred))

adasyn = ADASYN(random_state=42, )
X_adasyn, y_adasyn = adasyn.fit_resample(X_train, y_train)
print('ADASYN')
modelxgb.fit(X_adasyn, y_adasyn) # Use resampled data
y_pred = modelxgb.predict(X_test)
print(accuracy_score(y_test, y_pred))
# Apply resampling to training data

# smoteenn = SMOTEENN(random_state=42)
# X_smoteenn, y_smoteenn = smoteenn.fit_resample(X_train, y_train)
# print('SMOTENN')
# modelxgb.fit(X_smoteenn, y_smoteenn) # Use resampled data
# y_pred = modelxgb.predict(X_test)
# print(accuracy_score(y_test, y_pred))


# Example: Train and evaluate with SMOTE





ROS
0.8165584415584416
RUS
0.6972402597402597


ValueError: Input X contains NaN.
ADASYN does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values