# Análisis

In [1]:
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from numpy.linalg import svd
import requests
import json


sscaler = StandardScaler()
rscaler = RobustScaler()
mmscaler = MinMaxScaler()
mascaler = MaxAbsScaler()



In [2]:
def run_kfold_model (X,y,K=10,graph=True):
    regr = LinearRegression()
    kf = KFold(n_splits=5)
    s=1
    for train_index, test_index in kf.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regr.fit(X_train, y_train)
        print('Slice '+str(s)+' score: '+str(regr.score(X_test, y_test)))
        y_pred = regr.predict(X_test)
        if graph:
            plt.scatter(y_test, y_pred, color = 'red', marker = 'o', s = 35, alpha = 0.5, label = 'Test data')
            plt.show()
        s+=1
    return(regr)

In [3]:
def scores (model, X_train, X_test, y_train, y_pred, graph=True):
    MAE=metrics.mean_absolute_error(y_train, y_pred)
    MSE=metrics.mean_squared_error(y_train, y_pred)
    RMSE=np.sqrt(metrics.mean_squared_error(y_train, y_pred))
    CV_RMSE= (RMSE/np.average(y_train))*100
    R2=metrics.r2_score(y_train, y_pred)
    ACC=metrics.accuracy_score
    REC=metrics.recall_score
    PREC=metrics.precision_score

    print('MAE:'+str(MAE))
    print('MSE:'+str(MSE))
    print('RMSE:'+str(RMSE))
    print('CV(RMSE): '+str(CV_RMSE))
    print('R2:'+str(R2))
    print('Accuracy:'+str(ACC))
    print('Recall:'+str(REC))
    print('Precision:'+str(PREC))

    print('Training score: '+str(model.score(X_train, y_train))) #(Accuracy or R2)
    print('Test score: '+str(model.score(X_test, y_train)))
    if graph == True:
        plt.scatter(y_train, y_pred, color = 'red', marker = 'o', s = 35, alpha = 0.5, label = 'Test data')
        plt.show()

In [4]:
# data = pd.read_excel('mozilla-bugs-all.xlsx')
# data.describe

In [5]:
# def json_data(URL,params):
#     headers = {"Accept" : "application/json"}
#     resp = requests.get(URL, params= params, headers= headers)
#     df = pd.DataFrame()
#     if resp.status_code != 200:
#         print('error: ' + str(resp.status_code))
#     else:
#         print('Success')
#         bugs = resp.text
#         data = json.loads(bugs)
#         df = pd.json_normalize(data['bugs'])
#     return(df)
    
# params ={
#     "include_fields" : ["id"
#                         ,"summary"
#                         ,"status"
#                         ,"description"
#                         ,"type"
#                         ,"classification"
#                         ,"product"
#                         ,"component"
#                         ,"priority"
#                         ,"assigned_to"
#                         ,"resolution"
#                         ,"creation_time"
#                         ,"last_change_time"
#                         ,"severity"
#                         ,"version"
#                         ]
#     ,"product" : "Core"
#     # ,"status" : ["VERIFIED","RESOLVED","CLOSED","UNCONFIRMED","NEW"]
#     ,"limit" : 10000
#     ,"order": "opendate DESC"
# }
# URL = "https://bugzilla.mozilla.org/rest/bug"


In [6]:
# params["status"] = "RESOLVED"
# params.pop("status", None)

# test_df = json_data(URL,params)
# print(test_df.columns.to_list())
# test_df.tail()

In [7]:
# for i in test_df.columns.to_list():
#     print("Column "+i+" unique values:")
#     # print(test_df[i].unique())
#     print(test_df[i].value_counts())

#     print()


In [8]:
# # test_df.groupby(['resolution']).sum().plot(kind='pie', y='id', autopct='%1.0f%%', labeldistance=None)
# from datetime import datetime

# print(test_df['creation_time'].min())
# datetime.now() - pd.to_datetime(pd.to_datetime(test_df['creation_time'][test_df['resolution']=='FIXED']).values.astype(np.int64).mean())
# # plt.pie(x=test_df['resolution'].unique(), labels=test_df['resolution'].value_counts())
# # plt.show()

# # plt.title('Category Distribution')
# # plt.ylabel('')  # Hide the y-label
# # plt.show()

# # test_df.to_csv("dataset_20240907.csv")
# # test_df.date = pd.to_datetime(test_df.date).values.astype(np.int64)

# # test_df = pd.DataFrame(pd.to_datetime(test_df.groupby('column').mean().date))

In [9]:
data = pd.read_csv('dataset_20240827.csv')
data.describe

<bound method NDFrame.describe of       Unnamed: 0 resolution product         creation_time priority  \
0              0        NaN    Core  2024-08-27T07:10:15Z       P5   
1              1        NaN    Core  2024-08-27T07:09:56Z       --   
2              2        NaN    Core  2024-08-27T06:56:08Z       --   
3              3        NaN    Core  2024-08-27T06:13:19Z       P5   
4              4        NaN    Core  2024-08-27T05:29:11Z       --   
...          ...        ...     ...                   ...      ...   
9995        9995        NaN    Core  2024-03-23T18:04:58Z       --   
9996        9996        NaN    Core  2024-03-23T18:01:29Z       --   
9997        9997        NaN    Core  2024-03-23T17:57:21Z       P3   
9998        9998        NaN    Core  2024-03-23T17:56:41Z       P3   
9999        9999        NaN    Core  2024-03-23T17:55:13Z       P3   

     classification    type      last_change_time  \
0        Components  defect  2024-08-27T07:10:15Z   
1        Components

In [19]:
data.iloc[1]['summary']

'Bad name, functionality for network.trr.exclude-etc-hosts'

In [21]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/5f8afb46-9dbe-42c1-ae9d-
[nltk_data]     87d1e44235ab/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/5f8afb46-9dbe-42c1-ae9d-
[nltk_data]     87d1e44235ab/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /home/5f8afb46-9dbe-42c1-ae9d-
[nltk_data]     87d1e44235ab/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/5f8afb46-9dbe-42c1-ae9d-
[nltk_data]     87d1e44235ab/nltk_data...


True

In [23]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer
from nltk.stem import WordNetLemmatizer
import math
import re

def NLProcess (text):
        
    # print('Original text: ',data.iloc[1]['description'])
    #print("====================== starting ======================")
    ######################################################################### Tokenised
    mwe_tokenizer = MWETokenizer([('does', 'not'), ('in', 'spite', 'of'),('don', '’', 't')])
    tokenizer = word_tokenize
    tokenised = tokenizer(text)
    #print('Tokenised: ',tokenised)
    retokenised = mwe_tokenizer.tokenize(tokenised)
    # retokenised = [token.replace('_', '') for token in retokenised]
    #print('ReTokenised: ',retokenised)

    ######################################################################### Lower-case converted
    normalised = [word.lower() for word in retokenised] 
    #print('Normalised: ',normalised)

    ######################################################################### Punctuation marks removed
    unmarked = [re.sub(r'[^\w\s]', '', token) for token in normalised if re.sub(r'[^\w\s]', '', token)]
    #print('Punctuation marks removed: ',unmarked)

    ######################################################################### Stop-words removed
    stop_words = set(stopwords.words('english'))
    filtered = [word for word in unmarked if word not in stop_words]
    #print('Stop-words removed: ',filtered)

    ######################################################################### Lemmatised
    lemmatizer = WordNetLemmatizer()
    lemmatised = [lemmatizer.lemmatize(word) for word in filtered]
    #print('Lemmatised: ',lemmatised)
    
    return lemmatised


In [25]:
# for i in range(10):
#     NLProcess(data.iloc[i]['description'])

data['Psummary'] = data['summary'].map(NLProcess)

# type(data['summary'])

Tokenised:  ['Intermittent', 'widget/tests/browser/browser_test_ime_state_after_body_removed_and_reconnected_in_designMode.js', '|', 'single', 'tracking', 'bug']
ReTokenised:  ['Intermittent', 'widget/tests/browser/browser_test_ime_state_after_body_removed_and_reconnected_in_designMode.js', '|', 'single', 'tracking', 'bug']
Normalised:  ['intermittent', 'widget/tests/browser/browser_test_ime_state_after_body_removed_and_reconnected_in_designmode.js', '|', 'single', 'tracking', 'bug']
Punctuation marks removed:  ['intermittent', 'widgettestsbrowserbrowser_test_ime_state_after_body_removed_and_reconnected_in_designmodejs', 'single', 'tracking', 'bug']
Stop-words removed:  ['intermittent', 'widgettestsbrowserbrowser_test_ime_state_after_body_removed_and_reconnected_in_designmodejs', 'single', 'tracking', 'bug']
Lemmatised:  ['intermittent', 'widgettestsbrowserbrowser_test_ime_state_after_body_removed_and_reconnected_in_designmodejs', 'single', 'tracking', 'bug']
Tokenised:  ['Bad', 'name'

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Stop-words removed:  ['wptsync', 'sync', 'pr', '45649', 'nt', 'reset', 'animationstransitions', 'movebefore']
Lemmatised:  ['wptsync', 'sync', 'pr', '45649', 'nt', 'reset', 'animationstransitions', 'movebefore']
Tokenised:  ['[', 'wpt-sync', ']', 'Sync', 'PR', '45647', '-', 'DOM', ':', 'Implement', '`', 'inspect', '(', ')', '`', 'Observable', 'operator']
ReTokenised:  ['[', 'wpt-sync', ']', 'Sync', 'PR', '45647', '-', 'DOM', ':', 'Implement', '`', 'inspect', '(', ')', '`', 'Observable', 'operator']
Normalised:  ['[', 'wpt-sync', ']', 'sync', 'pr', '45647', '-', 'dom', ':', 'implement', '`', 'inspect', '(', ')', '`', 'observable', 'operator']
Punctuation marks removed:  ['wptsync', 'sync', 'pr', '45647', 'dom', 'implement', 'inspect', 'observable', 'operator']
Stop-words removed:  ['wptsync', 'sync', 'pr', '45647', 'dom', 'implement', 'inspect', 'observable', 'operator']
Lemmatised:  ['wptsync', 'sync', 'pr', '45647', 'dom', 'implement', 'inspect', 'observable', 'operator']
Tokenised:  

In [22]:
data['Pdescription'] = data['description'].fillna('').map(NLProcess)



In [70]:
data.loc[data['description'].str.contains('//crash-stats.mozilla.org/report/index/768c5c44-57c5-4746-890c-9af820240811', case=False, na=False)]

Unnamed: 0.1,Unnamed: 0,resolution,product,creation_time,priority,classification,type,last_change_time,component,summary,...,assigned_to,id,severity,version,assigned_to_detail.nick,assigned_to_detail.id,assigned_to_detail.name,assigned_to_detail.real_name,assigned_to_detail.email,Psummary
14,14,,Core,2024-08-27T00:41:00Z,--,Components,defect,2024-08-27T07:57:36Z,JavaScript: GC,Crash in [@ js::gc::Arena::getAllocKind],...,nobody@mozilla.org,1915055,--,unspecified,nobody,1,nobody@mozilla.org,Nobody; OK to take it and work on it,nobody@mozilla.org,"[crash, j, gc, arena, getallockind]"


In [73]:
data['description'].iloc[15]

nan