# Análisis

In [20]:
# !pip install pyvolutionary==2.4.2
# !pip install pycaret
# !pip install pycaret[full]
# !pip install scikit_learn==1.4

In [21]:
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from numpy.linalg import svd
import requests
import json


sscaler = StandardScaler()
rscaler = RobustScaler()
mmscaler = MinMaxScaler()
mascaler = MaxAbsScaler()

le = LabelEncoder()

def lencoder(df, col, lenc=False, train=True):
    if train:
        label = le.fit_transform(df[col])
        df[col] = pd.to_numeric(label)
        return df,le
    elif lenc != False:
        label = lenc.transform(df[col])
        df[col] = pd.to_numeric(label)
        return df
    else:
        print('Nothing done')
        return df



In [22]:
def run_kfold_model (X,y,K=10,graph=True):
    regr = LinearRegression()
    kf = KFold(n_splits=5)
    s=1
    for train_index, test_index in kf.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regr.fit(X_train, y_train)
        print('Slice '+str(s)+' score: '+str(regr.score(X_test, y_test)))
        y_pred = regr.predict(X_test)
        if graph:
            plt.scatter(y_test, y_pred, color = 'red', marker = 'o', s = 35, alpha = 0.5, label = 'Test data')
            plt.show()
        s+=1
    return(regr)

In [23]:
def scores (model, X_train, X_test, y_test, y_pred, graph=True):
    MAE=metrics.mean_absolute_error(y_test, y_pred)
    MSE=metrics.mean_squared_error(y_test, y_pred)
    RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    CV_RMSE= (RMSE/np.average(y_test))*100
    R2=metrics.r2_score(y_test, y_pred)
    ACC=metrics.accuracy_score
    REC=metrics.recall_score
    PREC=metrics.precision_score

    print('MAE:'+str(MAE))
    print('MSE:'+str(MSE))
    print('RMSE:'+str(RMSE))
    print('CV(RMSE): '+str(CV_RMSE))
    print('R2:'+str(R2))
    print('Accuracy:'+str(ACC))
    print('Recall:'+str(REC))
    print('Precision:'+str(PREC))

    # print('Training score: '+str(model.score(X_train, y_train))) #(Accuracy or R2)
    # print('Test score: '+str(model.score(X_test, y_train)))
    if graph == True:
        plt.scatter(y_train, y_pred, color = 'red', marker = 'o', s = 35, alpha = 0.5, label = 'Test data')
        plt.show()

In [24]:
import dask.dataframe as dd

In [25]:
from ast import literal_eval
def clean(x):
    return literal_eval(x)

In [26]:
pd.set_option("mode.copy_on_write", True)
pd.options.mode.copy_on_write = True

In [27]:
def dummies(df, col):
    tDummy = pd.get_dummies(df[col]).add_prefix(col+'_')
    df = pd.concat([df,tDummy],axis=1).reindex(df.index)
    return(df)

def just_dummies(df, col):
    tDummy = pd.get_dummies(df[col]).add_prefix(col+'_')
    return(tDummy)

def just_dummies_steps(df, col, ix):
    for i in range(len(df)):
        dfr = df.loc[df[ix]==i]
        if i==0:
            tDummy0 = dummies(dfr,col)
        else:
            tDummy = dummies(dfr,col)
            tDummy0 = pd.concat([tDummy0,tDummy],axis=1)
    return(tDummy)

# Checkpoint 1

In [None]:
# data = pd.read_excel('mozilla-bugs-all.xlsx')
# data.describe

In [None]:
def json_data(URL,params):
    headers = {"Accept" : "application/json"}
    resp = requests.get(URL, params= params, headers= headers)
    df = pd.DataFrame()
    if resp.status_code != 200:
        print('error: ' + str(resp.status_code))
    else:
        print('Success')
        bugs = resp.text
        data = json.loads(bugs)
        df = pd.json_normalize(data['bugs'])
    return(df)

params ={
    "include_fields" : ["id"
                        ,"summary"
                        ,"status"
                        ,"description"
                        ,"type"
                        ,"classification"
                        ,"product"
                        ,"component"
                        ,"priority"
                        ,"assigned_to"
                        ,"resolution"
                        ,"creation_time"
                        ,"last_change_time"
                        ,"severity"
                        ,"version"
                        ]
    ,"product" : "Core"
    # ,"status" : ["VERIFIED","RESOLVED","CLOSED","UNCONFIRMED","NEW"]
    ,"limit" : 10000
    ,"order": "opendate DESC"
}
URL = "https://bugzilla.mozilla.org/rest/bug"


In [None]:
params["status"] = "RESOLVED"
params.pop("status", None)

test_df = json_data(URL,params)
print(test_df.columns.to_list())
test_df.tail()

In [None]:
# for i in test_df.columns.to_list():
#     print("Column "+i+" unique values:")
#     # print(test_df[i].unique())
#     print(test_df[i].value_counts())

#     print()


In [None]:
test_df.groupby(['resolution']).sum().plot(kind='pie', y='id', autopct='%1.0f%%', labeldistance=None)
from datetime import datetime

print(test_df['creation_time'].min())
datetime.now() - pd.to_datetime(pd.to_datetime(test_df['creation_time'][test_df['resolution']=='FIXED']).values.astype(np.int64).mean())
plt.pie(x=test_df['resolution'].unique(), labels=test_df['resolution'].value_counts())
plt.show()

plt.title('Category Distribution')
plt.ylabel('')  # Hide the y-label
plt.show()

test_df.to_csv("dataset_20240907.csv")
test_df.date = pd.to_datetime(test_df.date).values.astype(np.int64)

test_df = pd.DataFrame(pd.to_datetime(test_df.groupby('column').mean().date))

# Checkpoint 2

In [None]:
data = pd.read_csv('dataset_20240827.csv')
data.describe

<bound method NDFrame.describe of       Unnamed: 0 resolution product         creation_time priority  \
0              0        NaN    Core  2024-08-27T07:10:15Z       P5   
1              1        NaN    Core  2024-08-27T07:09:56Z       --   
2              2        NaN    Core  2024-08-27T06:56:08Z       --   
3              3        NaN    Core  2024-08-27T06:13:19Z       P5   
4              4        NaN    Core  2024-08-27T05:29:11Z       --   
...          ...        ...     ...                   ...      ...   
9995        9995        NaN    Core  2024-03-23T18:04:58Z       --   
9996        9996        NaN    Core  2024-03-23T18:01:29Z       --   
9997        9997        NaN    Core  2024-03-23T17:57:21Z       P3   
9998        9998        NaN    Core  2024-03-23T17:56:41Z       P3   
9999        9999        NaN    Core  2024-03-23T17:55:13Z       P3   

     classification    type      last_change_time  \
0        Components  defect  2024-08-27T07:10:15Z   
1        Components

In [None]:
#import nltk
#nltk.download('punkt_tab')
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer
from nltk.stem import WordNetLemmatizer
import math
import re

def NLProcess (text):

    # print('Original text: ',data.iloc[1]['description'])
    #print("====================== starting ======================")
    ######################################################################### Tokenised
    mwe_tokenizer = MWETokenizer([('does', 'not'), ('in', 'spite', 'of'),('don', '’', 't')])
    tokenizer = word_tokenize
    tokenised = tokenizer(text)
    #print('Tokenised: ',tokenised)
    retokenised = mwe_tokenizer.tokenize(tokenised)
    # retokenised = [token.replace('_', '') for token in retokenised]
    #print('ReTokenised: ',retokenised)

    ######################################################################### Lower-case converted
    normalised = [word.lower() for word in retokenised]
    #print('Normalised: ',normalised)

    ######################################################################### Punctuation marks removed
    unmarked = [re.sub(r'[^\w\s]', '', token) for token in normalised if re.sub(r'[^\w\s]', '', token)]
    #print('Punctuation marks removed: ',unmarked)

    ######################################################################### Stop-words removed
    stop_words = set(stopwords.words('english'))
    filtered = [word for word in unmarked if word not in stop_words]
    #print('Stop-words removed: ',filtered)

    ######################################################################### Lemmatised
    lemmatizer = WordNetLemmatizer()
    lemmatised = [lemmatizer.lemmatize(word) for word in filtered]
    #print('Lemmatised: ',lemmatised)

    return lemmatised


In [None]:
# def dfNLProcess (df,column):
#     for index, row in df.iterrows():
#         text = row[column]
#         # print('Original text: ',data.iloc[1]['description'])
#         #print("====================== starting ======================")
#         ######################################################################### Tokenised
#         mwe_tokenizer = MWETokenizer([('does', 'not'), ('in', 'spite', 'of'),('don', '’', 't')])
#         tokenizer = word_tokenize
#         tokenised = tokenizer(text)
#         #print('Tokenised: ',tokenised)
#         retokenised = mwe_tokenizer.tokenize(tokenised)
#         # retokenised = [token.replace('_', '') for token in retokenised]
#         #print('ReTokenised: ',retokenised)

#         ######################################################################### Lower-case converted
#         normalised = [word.lower() for word in retokenised]
#         #print('Normalised: ',normalised)

#         ######################################################################### Punctuation marks removed
#         unmarked = [re.sub(r'[^\w\s]', '', token) for token in normalised if re.sub(r'[^\w\s]', '', token)]
#         #print('Punctuation marks removed: ',unmarked)

#         ######################################################################### Stop-words removed
#         stop_words = set(stopwords.words('english'))
#         filtered = [word for word in unmarked if word not in stop_words]
#         #print('Stop-words removed: ',filtered)

#         ######################################################################### Lemmatised
#         lemmatizer = WordNetLemmatizer()
#         lemmatised = [lemmatizer.lemmatize(word) for word in filtered]
#         #print('Lemmatised: ',lemmatised)

#         lemmatised
#         ncol = 'P'+col
#         df2 = pd.DataFrame({ncol:lemmatised})
#         df2 = dummies(df2)


In [None]:
# dfNLProcess(data,'summary')

In [None]:
# for i in range(10):
#     NLProcess(data.iloc[i]['description'])

data['Psummary'] = data['summary'].map(NLProcess)

# type(data['summary'])

In [None]:
data['Pdescription'] = data['description'].fillna('').map(NLProcess)

In [None]:
#data.loc[data['description'].str.contains('//crash-stats.mozilla.org/report/index/768c5c44-57c5-4746-890c-9af820240811', case=False, na=False)]

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,resolution,product,creation_time,priority,classification,type,last_change_time,component,summary,...,id,severity,version,assigned_to_detail.nick,assigned_to_detail.id,assigned_to_detail.name,assigned_to_detail.real_name,assigned_to_detail.email,Psummary,Pdescription
0,0,,Core,2024-08-27T07:10:15Z,P5,Components,defect,2024-08-27T07:10:15Z,Widget,Intermittent widget/tests/browser/browser_test...,...,1915086,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, widgettestsbrowserbrowser_test_...","[filed, nfay, mozillacom, parsed, log, http, t..."
1,1,,Core,2024-08-27T07:09:56Z,--,Components,defect,2024-08-27T07:12:28Z,Networking: DNS,"Bad name, functionality for network.trr.exclud...",...,1915085,--,Firefox 129,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[bad, name, functionality, networktrrexcludeet...","[user, agent, mozilla50, macintosh, intel, mac..."
2,2,,Core,2024-08-27T06:56:08Z,--,Components,task,2024-08-27T07:02:38Z,WebRTC: Audio/Video,Request sRGB colorspace from ScreenCapturerSck,...,1915082,,unspecified,pehrsons,489889,apehrson@mozilla.com,Andreas Pehrson [:pehrsons],apehrson@mozilla.com,"[request, srgb, colorspace, screencapturersck]","[screencapturekit, doc, http, developerappleco..."
3,3,,Core,2024-08-27T06:13:19Z,P5,Components,defect,2024-08-27T06:17:48Z,Panning and Zooming,Intermittent gfx/layers/apz/test/mochitest/tes...,...,1915078,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, gfxlayersapztestmochitesttest_g...","[filed, nfay, mozillacom, parsed, log, http, t..."
4,4,,Core,2024-08-27T05:29:11Z,--,Components,defect,2024-08-27T06:25:14Z,DOM: Core & HTML,Removing meta viewport tag has no effect,...,1915077,--,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[removing, meta, viewport, tag, effect]","[str, 1, open, site, meta, viewport, element, ..."


In [None]:
data.to_csv("dataset_20240907_plusP.csv")

# Checkpoint 3

In [None]:
data = pd.read_csv('dataset_20240907_plusP.csv', converters={'Psummary': clean})
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,resolution,product,creation_time,priority,classification,type,last_change_time,component,...,id,severity,version,assigned_to_detail.nick,assigned_to_detail.id,assigned_to_detail.name,assigned_to_detail.real_name,assigned_to_detail.email,Psummary,Pdescription
0,0,0,,Core,2024-08-27T07:10:15Z,P5,Components,defect,2024-08-27T07:10:15Z,Widget,...,1915086,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, widgettestsbrowserbrowser_test_...","['filed', 'nfay', 'mozillacom', 'parsed', 'log..."
1,1,1,,Core,2024-08-27T07:09:56Z,--,Components,defect,2024-08-27T07:12:28Z,Networking: DNS,...,1915085,--,Firefox 129,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[bad, name, functionality, networktrrexcludeet...","['user', 'agent', 'mozilla50', 'macintosh', 'i..."
2,2,2,,Core,2024-08-27T06:56:08Z,--,Components,task,2024-08-27T07:02:38Z,WebRTC: Audio/Video,...,1915082,,unspecified,pehrsons,489889,apehrson@mozilla.com,Andreas Pehrson [:pehrsons],apehrson@mozilla.com,"[request, srgb, colorspace, screencapturersck]","['screencapturekit', 'doc', 'http', 'developer..."
3,3,3,,Core,2024-08-27T06:13:19Z,P5,Components,defect,2024-08-27T06:17:48Z,Panning and Zooming,...,1915078,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, gfxlayersapztestmochitesttest_g...","['filed', 'nfay', 'mozillacom', 'parsed', 'log..."
4,4,4,,Core,2024-08-27T05:29:11Z,--,Components,defect,2024-08-27T06:25:14Z,DOM: Core & HTML,...,1915077,--,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[removing, meta, viewport, tag, effect]","['str', '1', 'open', 'site', 'meta', 'viewport..."


In [None]:
data.drop(axis=1, columns=['Unnamed: 0.1','Unnamed: 0'], inplace=True)
data.head()

Unnamed: 0,resolution,product,creation_time,priority,classification,type,last_change_time,component,summary,description,...,id,severity,version,assigned_to_detail.nick,assigned_to_detail.id,assigned_to_detail.name,assigned_to_detail.real_name,assigned_to_detail.email,Psummary,Pdescription
0,,Core,2024-08-27T07:10:15Z,P5,Components,defect,2024-08-27T07:10:15Z,Widget,Intermittent widget/tests/browser/browser_test...,**Filed by:** nfay [at] mozilla.com\r\n**Parse...,...,1915086,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, widgettestsbrowserbrowser_test_...","['filed', 'nfay', 'mozillacom', 'parsed', 'log..."
1,,Core,2024-08-27T07:09:56Z,--,Components,defect,2024-08-27T07:12:28Z,Networking: DNS,"Bad name, functionality for network.trr.exclud...",User Agent: Mozilla/5.0 (Macintosh; Intel Mac ...,...,1915085,--,Firefox 129,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[bad, name, functionality, networktrrexcludeet...","['user', 'agent', 'mozilla50', 'macintosh', 'i..."
2,,Core,2024-08-27T06:56:08Z,--,Components,task,2024-08-27T07:02:38Z,WebRTC: Audio/Video,Request sRGB colorspace from ScreenCapturerSck,[The ScreenCaptureKit docs](https://developer....,...,1915082,,unspecified,pehrsons,489889,apehrson@mozilla.com,Andreas Pehrson [:pehrsons],apehrson@mozilla.com,"[request, srgb, colorspace, screencapturersck]","['screencapturekit', 'doc', 'http', 'developer..."
3,,Core,2024-08-27T06:13:19Z,P5,Components,defect,2024-08-27T06:17:48Z,Panning and Zooming,Intermittent gfx/layers/apz/test/mochitest/tes...,**Filed by:** nfay [at] mozilla.com\r\n**Parse...,...,1915078,S4,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[intermittent, gfxlayersapztestmochitesttest_g...","['filed', 'nfay', 'mozillacom', 'parsed', 'log..."
4,,Core,2024-08-27T05:29:11Z,--,Components,defect,2024-08-27T06:25:14Z,DOM: Core & HTML,Removing meta viewport tag has no effect,STR;\r\n\r\n1. Open any sites having a meta vi...,...,1915077,--,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[removing, meta, viewport, tag, effect]","['str', '1', 'open', 'site', 'meta', 'viewport..."


In [None]:
# from sklearn.feature_selection import mutual_info_classif

In [None]:
type(data.iloc[0]['Psummary'])
# type(data['Psummary'])

list

In [None]:
exploded = data.explode('Psummary')

In [None]:
filter='^\d+$'
notfilter='^(?!'+filter+').*$'
exploded_filtered = exploded.loc[exploded['Psummary'].str.contains(notfilter)]

In [None]:
checkit = just_dummies(exploded_filtered,'Psummary')

In [None]:
checkit.index.name = 'index'

In [None]:
checkit = dd.from_pandas(checkit, chunksize=100)

In [None]:
checkit.head()

Unnamed: 0_level_0,Psummary_00f,Psummary_01f,Psummary_04347e67c6f87ee7a33c8ed8103aecebac6c3888,Psummary_0a1,Psummary_0async,Psummary_0cad754da2ee3ececcfa1aad8b858a0286c24e16,Psummary_0dc559f060db0d62d95f424e3fd26a5f673b2f6b,Psummary_0e30966b198ad28943799eaf5b3b08100b6f70c3,Psummary_0kb,Psummary_0px,...,Psummary_zoomteams,Psummary_zoomus,Psummary_zstd,Psummary_zstd_dctx_setparameter,Psummary_zwp_tablet_tool_v2_set_cursor,Psummary_zwsetevent,Psummary_zwusermsgwaitformultipleobjectsex,Psummary_zydis,Psummary_ñ,Psummary_㜱㜸㤱㠰㤴㜶㔰㔰
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# prefix='Psummary_'
# filter=prefix+'\d+$'
# notfilter='^(?!'+filter+').*$'
# print(len(checkit.filter(regex=(notfilter)).columns.tolist()))
# # print(checkit.filter(regex=(notfilter)).columns.tolist())

In [None]:
checkot = checkit.groupby(['index'], observed=True).sum()

In [None]:
checkot['nindex'] = checkot.index

In [None]:
checkot.head()

Unnamed: 0_level_0,Psummary_00f,Psummary_01f,Psummary_04347e67c6f87ee7a33c8ed8103aecebac6c3888,Psummary_0a1,Psummary_0async,Psummary_0cad754da2ee3ececcfa1aad8b858a0286c24e16,Psummary_0dc559f060db0d62d95f424e3fd26a5f673b2f6b,Psummary_0e30966b198ad28943799eaf5b3b08100b6f70c3,Psummary_0kb,Psummary_0px,...,Psummary_zoomus,Psummary_zstd,Psummary_zstd_dctx_setparameter,Psummary_zwp_tablet_tool_v2_set_cursor,Psummary_zwsetevent,Psummary_zwusermsgwaitformultipleobjectsex,Psummary_zydis,Psummary_ñ,Psummary_㜱㜸㤱㠰㤴㜶㔰㔰,nindex
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [None]:
checket = checkot.compute()

In [None]:
checkit = checket.reset_index()

In [None]:
checkit.head()

Unnamed: 0,index,Psummary_00f,Psummary_01f,Psummary_04347e67c6f87ee7a33c8ed8103aecebac6c3888,Psummary_0a1,Psummary_0async,Psummary_0cad754da2ee3ececcfa1aad8b858a0286c24e16,Psummary_0dc559f060db0d62d95f424e3fd26a5f673b2f6b,Psummary_0e30966b198ad28943799eaf5b3b08100b6f70c3,Psummary_0kb,...,Psummary_zoomus,Psummary_zstd,Psummary_zstd_dctx_setparameter,Psummary_zwp_tablet_tool_v2_set_cursor,Psummary_zwsetevent,Psummary_zwusermsgwaitformultipleobjectsex,Psummary_zydis,Psummary_ñ,Psummary_㜱㜸㤱㠰㤴㜶㔰㔰,nindex
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [None]:
# def rowgroupbyidnex(df):
#     rows = []
#     for i in len(df):
#         rows.append(df.iloc[i].groupby(checkit.index, observed=True).sum())





In [None]:
checkit.to_csv('dataset_20240907_plusP_Psummary.csv')

# Checkpoint 4

In [None]:
data = pd.read_csv('dataset_20240907_plusP.csv', converters={'Pdescription': clean,'Psummary': clean})

In [None]:
summary = pd.read_csv('dataset_20240907_plusP_Psummary.csv')

In [None]:
exploded = data.explode('Pdescription')
filter='^\d+$'
notfilter='^(?!'+filter+').*$'

exploded = exploded.loc[exploded['Pdescription'].notna()]

exploded_filtered = exploded.loc[exploded['Pdescription'].str.contains(notfilter)]

In [None]:
exploded_filtered['nindex'] = exploded_filtered.index

In [None]:
exploded_filtered = exploded_filtered.reset_index()
checkit = just_dummies_steps(exploded_filtered[['nindex','Pdescription']],'Pdescription','nindex')

In [None]:
checkit = dd.from_pandas(checkit, chunksize=100)

In [None]:
checkot = checkit.groupby([checkit.index], observed=True).sum()
checkot['nindex'] = checkot.index
checket = checkot.compute()
checkit = checket.reset_index()
checkit.to_csv('dataset_20240907_plusP_Pdescription.csv')

In [None]:
summary.drop(axis=1, columns=['Unnamed: 0','nindex','index'], inplace=True)
summary.head()

Unnamed: 0,Psummary_00f,Psummary_01f,Psummary_04347e67c6f87ee7a33c8ed8103aecebac6c3888,Psummary_0a1,Psummary_0async,Psummary_0cad754da2ee3ececcfa1aad8b858a0286c24e16,Psummary_0dc559f060db0d62d95f424e3fd26a5f673b2f6b,Psummary_0e30966b198ad28943799eaf5b3b08100b6f70c3,Psummary_0kb,Psummary_0px,...,Psummary_zoomteams,Psummary_zoomus,Psummary_zstd,Psummary_zstd_dctx_setparameter,Psummary_zwp_tablet_tool_v2_set_cursor,Psummary_zwsetevent,Psummary_zwusermsgwaitformultipleobjectsex,Psummary_zydis,Psummary_ñ,Psummary_㜱㜸㤱㠰㤴㜶㔰㔰
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
data['Psummary']=data['Psummary'].tolist()

In [None]:
data['Pdescription']=data['Pdescription'].apply(lambda x: ' '.join(x))
data['Psummary']=data['Psummary'].apply(lambda x: ' '.join(x))

In [None]:
summary.head()

Unnamed: 0,Psummary_00f,Psummary_01f,Psummary_04347e67c6f87ee7a33c8ed8103aecebac6c3888,Psummary_0a1,Psummary_0async,Psummary_0cad754da2ee3ececcfa1aad8b858a0286c24e16,Psummary_0dc559f060db0d62d95f424e3fd26a5f673b2f6b,Psummary_0e30966b198ad28943799eaf5b3b08100b6f70c3,Psummary_0kb,Psummary_0px,...,Psummary_zoomteams,Psummary_zoomus,Psummary_zstd,Psummary_zstd_dctx_setparameter,Psummary_zwp_tablet_tool_v2_set_cursor,Psummary_zwsetevent,Psummary_zwusermsgwaitformultipleobjectsex,Psummary_zydis,Psummary_ñ,Psummary_㜱㜸㤱㠰㤴㜶㔰㔰
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(df,col):
  vectorizer = TfidfVectorizer()
  vectors = vectorizer.fit_transform(df[col])

  # Select the first five documents from the data set
  tf_idf = pd.DataFrame(vectors.todense())#.iloc[:5]
  tf_idf.columns = vectorizer.get_feature_names_out()
  tfidf_matrix = tf_idf.T
  tfidf_matrix.columns = ['bug'+ str(i) for i in range(1, 10001)]
  tfidf_matrix['count'] = tfidf_matrix.sum(axis=1)

  # Top words
  tfidf_matrix = tfidf_matrix.sort_values(by ='count', ascending=False)

  # Print the first 10 words
  # print(tfidf_matrix.drop(columns=['count']).head(10))

  return tfidf_matrix


In [None]:
# tfidf_Pdescription = tfidf(data,'Pdescription')
tfidf_Psummary = tfidf(data,'Psummary')
over1 = tfidf_Psummary.loc[tfidf_Psummary['count']>1]

In [None]:
filter='^\d+$'
notfilter='^(?!'+filter+').*$'
wordlst = over1.loc[over1.index.str.contains(notfilter)]

In [None]:
wordlst['words'] = 'Psummary_' + wordlst.index.astype(str)
wordlst.head()

Unnamed: 0,bug1,bug2,bug3,bug4,bug5,bug6,bug7,bug8,bug9,bug10,...,bug9993,bug9994,bug9995,bug9996,bug9997,bug9998,bug9999,bug10000,count,words
intermittent,0.239473,0.0,0.0,0.185069,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,379.11454,Psummary_intermittent
bug,0.260809,0.0,0.0,0.201557,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,354.356753,Psummary_bug
tracking,0.262563,0.0,0.0,0.202912,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,352.168633,Psummary_tracking
single,0.264538,0.0,0.0,0.204439,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,349.34071,Psummary_single
sync,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,213.736062,Psummary_sync


In [None]:
summary.head()

Unnamed: 0,Psummary_00f,Psummary_01f,Psummary_04347e67c6f87ee7a33c8ed8103aecebac6c3888,Psummary_0a1,Psummary_0async,Psummary_0cad754da2ee3ececcfa1aad8b858a0286c24e16,Psummary_0dc559f060db0d62d95f424e3fd26a5f673b2f6b,Psummary_0e30966b198ad28943799eaf5b3b08100b6f70c3,Psummary_0kb,Psummary_0px,...,Psummary_zoomteams,Psummary_zoomus,Psummary_zstd,Psummary_zstd_dctx_setparameter,Psummary_zwp_tablet_tool_v2_set_cursor,Psummary_zwsetevent,Psummary_zwusermsgwaitformultipleobjectsex,Psummary_zydis,Psummary_ñ,Psummary_㜱㜸㤱㠰㤴㜶㔰㔰
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
summary = summary[wordlst['words']]
summary.head()

Unnamed: 0,Psummary_intermittent,Psummary_bug,Psummary_tracking,Psummary_single,Psummary_sync,Psummary_pr,Psummary_wptsync,Psummary_test,Psummary_wpt,Psummary_tier,...,Psummary_loopffsh,Psummary_unified,Psummary_loong64,Psummary_imagetestreftesticoicobmp8bppicosize1x18bppico,Psummary_downloads,Psummary_mitigate,Psummary_testsjittestjittesttestsgcbug1517158js,Psummary_fuse,Psummary_lighter,Psummary_eventsource
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
summary.tail()

Unnamed: 0,Psummary_intermittent,Psummary_bug,Psummary_tracking,Psummary_single,Psummary_sync,Psummary_pr,Psummary_wptsync,Psummary_test,Psummary_wpt,Psummary_tier,...,Psummary_loopffsh,Psummary_unified,Psummary_loong64,Psummary_imagetestreftesticoicobmp8bppicosize1x18bppico,Psummary_downloads,Psummary_mitigate,Psummary_testsjittestjittesttestsgcbug1517158js,Psummary_fuse,Psummary_lighter,Psummary_eventsource
9994,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9995,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
mergeddata = pd.concat([data,summary],axis=1)

mergeddata.to_csv('dataset_20240907_plusP_Psummary_tfidf.csv')

# Checkpoint 5

In [28]:
mergeddata = pd.read_csv('dataset_20240907_plusP_Psummary_tfidf.csv')

  mergeddata = pd.read_csv('dataset_20240907_plusP_Psummary_tfidf.csv')


In [29]:
mergeddata.head()
mergeddata['resolution'] = mergeddata['resolution'].fillna('')
mergeddata['severity'] = mergeddata['severity'].fillna('SN/A')
# mergeddata['resolution'] = mergeddata['resolution'].astype(str)

In [30]:
mergeddata.drop([
'Unnamed: 0.2',
'Unnamed: 0.1',
'Unnamed: 0',
'Unnamed: 0.2',
'Unnamed: 0.1',
'Unnamed: 0',
'summary',
'description',
'Psummary',
'Pdescription',
'assigned_to_detail.real_name'
],axis=1,inplace=True)

mergeddata.head()

Unnamed: 0,resolution,product,creation_time,priority,classification,type,last_change_time,component,status,assigned_to,...,Psummary_loopffsh,Psummary_unified,Psummary_loong64,Psummary_imagetestreftesticoicobmp8bppicosize1x18bppico,Psummary_downloads,Psummary_mitigate,Psummary_testsjittestjittesttestsgcbug1517158js,Psummary_fuse,Psummary_lighter,Psummary_eventsource
0,,Core,2024-08-27T07:10:15Z,P5,Components,defect,2024-08-27T07:10:15Z,Widget,NEW,nobody@mozilla.org,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,Core,2024-08-27T07:09:56Z,--,Components,defect,2024-08-27T07:12:28Z,Networking: DNS,UNCONFIRMED,nobody@mozilla.org,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,Core,2024-08-27T06:56:08Z,--,Components,task,2024-08-27T07:02:38Z,WebRTC: Audio/Video,ASSIGNED,apehrson@mozilla.com,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,Core,2024-08-27T06:13:19Z,P5,Components,defect,2024-08-27T06:17:48Z,Panning and Zooming,NEW,nobody@mozilla.org,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,Core,2024-08-27T05:29:11Z,--,Components,defect,2024-08-27T06:25:14Z,DOM: Core & HTML,NEW,nobody@mozilla.org,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
mergeddata = mergeddata[mergeddata['priority']!= '--']
# mergeddata.loc[mergeddata['priority']=='--'].head()


mergeddata = mergeddata[mergeddata['severity']!= '--']
mergeddata['severity'].replace('normal','S3')
# mergeddata.loc[mergeddata['severity']=='--'].head()

len(mergeddata)

4110

In [32]:
mergeddata['severity'].unique()

array(['S4', 'SN/A', 'S3', 'S2', 'S1'], dtype=object)

In [33]:
tipos = mergeddata.dtypes
# tipos['columna']=tipos.index
# tipos.rename({'0':'tipo'})
tipos = tipos.to_frame()

# columnas = tipos.loc[tipos['tipo']=='object'].index
columnas = tipos.loc[tipos[0]!='float'].index.to_list()
columnas

for columna in columnas:
    print(columna)
    print(mergeddata[columna].apply(type).unique())
    print('--------------------------------------------')

resolution
[<class 'str'>]
--------------------------------------------
product
[<class 'str'>]
--------------------------------------------
creation_time
[<class 'str'>]
--------------------------------------------
priority
[<class 'str'>]
--------------------------------------------
classification
[<class 'str'>]
--------------------------------------------
type
[<class 'str'>]
--------------------------------------------
last_change_time
[<class 'str'>]
--------------------------------------------
component
[<class 'str'>]
--------------------------------------------
status
[<class 'str'>]
--------------------------------------------
assigned_to
[<class 'str'>]
--------------------------------------------
id
[<class 'int'>]
--------------------------------------------
severity
[<class 'str'>]
--------------------------------------------
version
[<class 'str'>]
--------------------------------------------
assigned_to_detail.nick
[<class 'str'>]
---------------------------------------

# Prepare

In [34]:
mergeddata = dummies(mergeddata,'resolution')
mergeddata = dummies(mergeddata,'product')
mergeddata, lemodel = lencoder(mergeddata,'priority')
mergeddata = dummies(mergeddata,'classification')
mergeddata = dummies(mergeddata,'type')
mergeddata = dummies(mergeddata,'component')
mergeddata = dummies(mergeddata,'assigned_to')
mergeddata, lemodel = lencoder(mergeddata,'severity')
mergeddata, lemodel = lencoder(mergeddata,'version')
mergeddata = dummies(mergeddata,'assigned_to_detail.nick')
mergeddata = dummies(mergeddata,'assigned_to_detail.id')
mergeddata = dummies(mergeddata,'assigned_to_detail.name')
mergeddata = dummies(mergeddata,'assigned_to_detail.email')
mergeddata['creation_time'] = pd.to_datetime(mergeddata['creation_time'])
mergeddata['last_change_time'] = pd.to_datetime(mergeddata['last_change_time'])

In [35]:
def columns_with_nan(df):
    # Obtener una lista de los nombres de las columnas
    columns = df.columns.to_list()

    # Filtrar las columnas que tienen valores NaN
    columns_with_nan = [col for col in columns if df[col].isna().any()]

    return columns_with_nan

In [36]:
def rows_with_nan(df, column_name):
    # Filtrar las filas donde la columna especificada tiene valores NaN
    rows_with_nan = df[df[column_name].isna()]
    return rows_with_nan
rows_with_nan(mergeddata,'Psummary_intermittent')
mergeddata.drop(index=9999, inplace=True)

# PyCaret

In [None]:
from pycaret.classification import *

dataset = mergeddata

data = dataset.drop(['priority'
,'resolution'
,'product'
,'creation_time'
,'classification'
,'type'
,'last_change_time'
,'component'
,'status'
,'assigned_to'
,'assigned_to_detail.nick'
,'assigned_to_detail.name'
,'assigned_to_detail.email'
                ],axis=1)

clf1 = setup(data, target='severity')

# Compare models
best_model = compare_models()

ModuleNotFoundError: No module named 'pycaret'

In [None]:
tuned_model = tune_model(best_model)

NameError: name 'tune_model' is not defined

In [None]:
evaluate_model(tuned_model)

# XGBoost

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, cohen_kappa_score, matthews_corrcoef

def scores(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Recall
    recall = recall_score(y_test, y_pred,average='weighted')
    print("Recall:", recall)

    # Precision
    precision = precision_score(y_test, y_pred,average='weighted')
    print("Precision:", precision)

    # F1 Score
    f1 = f1_score(y_test, y_pred,average='weighted')
    print("F1 Score:", f1)

    # Cohen's Kappa
    kappa = cohen_kappa_score(y_test, y_pred)
    print("Cohen's Kappa:", kappa)

    # Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(y_test, y_pred)
    print("MCC:", mcc)


In [None]:
def run_kfold_model (model, X,y,K=10,graph=True):
    regr = LinearRegression()
    kf = KFold(n_splits=5)
    s=1
    for train_index, test_index in kf.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        print('Slice '+str(s)+' score: '+str(regr.score(X_test, y_test)))
        y_pred = model.predict(X_test)
        scores(y_test, y_pred)
        if graph:
            plt.scatter(y_test, y_pred, color = 'red', marker = 'o', s = 35, alpha = 0.5, label = 'Test data')
            plt.show()
        s+=1
    return(regr)

In [None]:
from numpy import loadtxt
import xgboost as xgb
from xgboost import DMatrix, XGBClassifier

dataset = mergeddata

X = dataset.drop(['severity','priority'
,'resolution'
,'product'
,'creation_time'
,'classification'
,'type'
,'last_change_time'
,'component'
,'status'
,'assigned_to'
,'assigned_to_detail.nick'
,'assigned_to_detail.name'
,'assigned_to_detail.email'
                ],axis=1)

Y = dataset['severity']

# split data into train and test sets
seed = 7
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# dtrain = DMatrix(X_train, label=y_train, enable_categorical=True)

# params = {
#     # 'objective': 'binary:logistic',
#     'objective': 'multi:softmax',
#     'num_class': 4,
#     'tree_method': 'hist'  # 'hist' or 'approx' are recommended for categorical data
# }

# fit model on training data
model = XGBClassifier(objective= 'multi:softmax', num_class= 4)
model.fit(X_train, y_train)
# model = xgb.train(params, dtrain, num_boost_round=10)

# make predictions for test data
# dpredict = xgb.DMatrix(X_test, enable_categorical=True)

# y_pred = model.predict(dpredict)


# predictions = [round(value) for value in y_pred]

# scores(model, X_train, X_test, y_train, y_test)

# run_kfold_model(model, X_train, y_train)

y_pred = model.predict(X_test)
scores(y_test, y_pred)

Accuracy: 0.8572587185725872
Recall: 0.8572587185725872
Precision: 0.8462048189113937
F1 Score: 0.8503490062200572
Cohen's Kappa: 0.7705153801901797
MCC: 0.7711123744607169


# PyCaret top 7

In [37]:
from pycaret.classification import *

dataset = mergeddata

data = dataset.drop(['priority'
,'resolution'
,'product'
,'creation_time'
,'classification'
,'type'
,'last_change_time'
,'component'
,'status'
,'assigned_to'
,'assigned_to_detail.nick'
,'assigned_to_detail.name'
,'assigned_to_detail.email'
                ],axis=1)

clf1 = setup(data, target='severity')

model = [create_model(i) for i in [
    # 'rf',
    # 'gbc',
    # 'et',
    'xgboost',
    # 'catboost',
    # 'lightgbm',
    'dt']]

for i in (range(2)):
    print(i)
    model[i].get_params()


Unnamed: 0,Description,Value
0,Session id,1234
1,Target,severity
2,Target type,Multiclass
3,Original data shape,"(4109, 4184)"
4,Transformed data shape,"(4109, 4184)"
5,Transformed train set shape,"(2876, 4184)"
6,Transformed test set shape,"(1233, 4184)"
7,Numeric features,3132
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8333,0.0,0.8333,0.8247,0.8287,0.7311,0.7314
1,0.8576,0.0,0.8576,0.8394,0.8472,0.7694,0.7706
2,0.8785,0.0,0.8785,0.8772,0.8756,0.804,0.8044
3,0.8785,0.964,0.8785,0.8709,0.8734,0.8042,0.8047
4,0.8472,0.9642,0.8472,0.8393,0.8404,0.7561,0.7582
5,0.8299,0.9485,0.8299,0.8141,0.8213,0.7261,0.727
6,0.8432,0.0,0.8432,0.8281,0.8347,0.7469,0.7476
7,0.8397,0.0,0.8397,0.8459,0.8402,0.7469,0.7493
8,0.8502,0.0,0.8502,0.8463,0.8464,0.7601,0.761
9,0.8711,0.0,0.8711,0.8637,0.8632,0.7952,0.8


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8194,0.0,0.8194,0.8108,0.8145,0.7094,0.7099
1,0.816,0.0,0.816,0.8144,0.8137,0.7033,0.7044
2,0.875,0.0,0.875,0.8749,0.8746,0.7996,0.7998
3,0.8472,0.8826,0.8472,0.83,0.8385,0.751,0.7513
4,0.8611,0.9024,0.8611,0.8602,0.8573,0.7779,0.7802
5,0.8229,0.8648,0.8229,0.821,0.8213,0.7155,0.7161
6,0.8118,0.0,0.8118,0.8051,0.8078,0.6959,0.6961
7,0.8293,0.0,0.8293,0.8235,0.8249,0.7277,0.7289
8,0.8223,0.0,0.8223,0.8131,0.8174,0.7125,0.7127
9,0.7979,0.0,0.7979,0.7955,0.7962,0.6766,0.677


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

0
1


In [38]:
import pickle as pk
i=0
for m in [#'rf','gbc','et',
          'xgboost',
          #'catboost','lightgbm',
          'dt']:
  print(model[i].get_params())
  name=m+'.pkl'
  with open(name, "wb") as file: # file is a variable for storing the newly created file, it can be anything.
    pk.dump(model, file) # Dump function is used to write the object into the created file in byte format.
  i+=1

{'objective': 'multi:softprob', 'base_score': None, 'booster': 'gbtree', 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': 'cpu', 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': -1, 'num_parallel_tree': None, 'random_state': 1234, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': 'auto', 'validate_parameters': None, 'verbosity': 0}
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': No

# CUDA

In [None]:
# Remove existing CUDA installation (use with caution)
!apt-get --purge remove nvidia*

# Download and install CUDA 11
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
!mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
!apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
!apt-get update
!apt-get -y install cuda-11-0

# Verify CUDA 11 installation
!nvcc --version

# Install cuDF 23.08 (compatible with CUDA 11)
!pip install cudf-cu11==23.08 --extra-index-url=https://pypi.nvidia.com

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'nvidia-driver-550-server' for glob 'nvidia*'
Note, selecting 'nvidia-firmware-535-535.154.05' for glob 'nvidia*'
Note, selecting 'nvidia-docker2' for glob 'nvidia*'
Note, selecting 'nvidia-firmware-560-server-560.28.03' for glob 'nvidia*'
Note, selecting 'nvidia-cuda-toolkit-doc' for glob 'nvidia*'
Note, selecting 'nvidia-imex' for glob 'nvidia*'
Note, selecting 'nvidia-dkms-450-server' for glob 'nvidia*'
Note, selecting 'nvidia-firmware-535-server-535.154.05' for glob 'nvidia*'
Note, selecting 'nvidia-headless-390' for glob 'nvidia*'
Note, selecting 'nvidia-cuda-toolkit-gcc' for glob 'nvidia*'
Note, selecting 'nvidia-headless-418' for glob 'nvidia*'
Note, selecting 'nvidia-headless-430' for glob 'nvidia*'
Note, selecting 'nvidia-headless-435' for glob 'nvidia*'
Note, selecting 'nvidia-headless-440' for glob 'nvidia*'
Note, selecting 'nvidia-headless-450' for glob 'nvidia*'

In [None]:
!nvidia-smi
!nvcc --version

Wed Oct  2 02:44:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0              31W /  70W |    103MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!pip install --upgrade pip

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
[0mInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2


In [None]:
!ls -la /usr/local/lib/python3.10/dist-packages | grep -i ylibraft
!rm -rf /usr/local/lib/python3.10/dist-packages/-ylibraft-cu11*

drwxr-xr-x   9 root root     4096 Oct  2 02:20 pylibraft
drwxr-xr-x   2 root root     4096 Oct  2 02:20 pylibraft_cu11-23.8.0.dist-info
drwxr-xr-x   2 root root     4096 Oct  2 02:20 pylibraft_cu11.libs
drwxr-xr-x   3 root root     4096 Oct  2 02:06 pylibraft_cu12-24.8.1.dist-info
drwxr-xr-x   2 root root     4096 Oct  2 02:06 pylibraft_cu12.libs
drwxr-xr-x  10 root root     4096 Oct  2 02:06 ~ylibraft
drwxr-xr-x   3 root root     4096 Oct  2 01:12 ~ylibraft_cu11-24.8.1.dist-info
drwxr-xr-x   2 root root     4096 Oct  2 01:12 ~ylibraft_cu11.libs


In [None]:
!pip install cudf-cu11==23.08
!pip install cuml-cu11==23.08

[0m

In [None]:
!pip install cudf-cu11==23.08 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu11==23.08 --extra-index-url=https://pypi.nvidia.com

[0mLooking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
[0mLooking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cuml-cu11==23.08
  Using cached https://pypi.nvidia.com/cuml-cu11/cuml_cu11-23.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1081.6 MB)
Collecting dask-cuda==23.8.* (from cuml-cu11==23.08)
  Using cached dask_cuda-23.8.0-py3-none-any.whl.metadata (2.3 kB)
Collecting dask-cudf-cu11==23.8.* (from cuml-cu11==23.08)
  Using cached https://pypi.nvidia.com/dask-cudf-cu11/dask_cudf_cu11-23.8.0-py3-none-any.whl (81 kB)
Collecting distributed==2023.7.1 (from cuml-cu11==23.08)
  Using cached distributed-2023.7.1-py3-none-any.whl.metadata (3.3 kB)
Collecting raft-dask-cu11==23.8.* (from cuml-cu11==23.08)
  Using cached https://pypi.nvidia.com/raft-dask-cu11/raft_dask_cu11-23.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (214.7 MB)
Collecting pylibraft-cu11==23.8.* (from raft-dask-cu11==23.8.*->cuml-cu11==23.

In [None]:
import os
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64:' + os.environ.get('LD_LIBRARY_PATH', '')

In [None]:
import cudf
import cuml

mergeddata = cudf.from_pandas(mergeddata)

from cuml.model_selection import GridSearchCV

ImportError: libcublas.so.11: cannot open shared object file: No such file or directory

# Optimising

In [39]:
dataset = mergeddata

X = dataset.drop(['severity','priority'
,'resolution'
,'product'
,'creation_time'
,'classification'
,'type'
,'last_change_time'
,'component'
,'status'
,'assigned_to'
,'assigned_to_detail.nick'
,'assigned_to_detail.name'
,'assigned_to_detail.email'
                ],axis=1)

Y = dataset['severity']

# split data into train and test sets
seed = 7
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)


In [40]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import pickle

# model[3].save_model('xgb_model.model')

# modelxgb = cuml.xgboost.XGBClassifier()

# modelxgb.load_model('xgb_model.model')

# with open('xgboost.pkl', 'rb') as f:
#   modelxgb = pickle.load(f)

modelxgb = model[0]

param_grid = {
    # 'max_depth': [3, 4, 5, 10],
    'max_depth': [3],
    # 'subsample': [0.1, 0.6, 0.8, 1.0]
    'subsample': [1.0],
    # 'colsample_bylevel': [0.6, 0.8, 1.0]
    'colsample_bylevel': [0.6],
    # 'colsample_bytree': [0.6, 0.8, 1.0]
    'colsample_bytree': [0.8],
    # 'min_child_weight': [1, 5, 100]
    'min_child_weight': [1],
    # 'reg_alpha': [1, 5, 100],
    # 'reg_lambda': [1, 5, 100]
    'reg_alpha': [1],
    'reg_lambda': [5],
    'gamma': [0.1],
    # 'gamma': [0, 0.1, 0.2, 0.3]
    # 'n_estimators': [50, 100, 250, 500],
    # 'learning_rate': [0.01, 0.2, 0.5]
    'n_estimators': [250],
    'learning_rate': [0.2]
}

# Set up the GridSearchCV
grid_search = GridSearchCV(
    estimator=modelxgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and evaluate the model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Best Parameters: {best_params}')
print(f'Best Model Accuracy: {accuracy:.2f}')

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Parameters: {'colsample_bylevel': 0.6, 'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 250, 'reg_alpha': 1, 'reg_lambda': 5, 'subsample': 1.0}
Best Model Accuracy: 0.85
