In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sentence_transformers import SentenceTransformer
from sklearn.utils import shuffle
import nltk

# Load NLTK resources.
# 'punkt_tab' is requested in addition to 'punkt': recent NLTK releases make
# word_tokenize load the tabular Punkt data rather than the pickled 'punkt'
# model, so a fresh environment would otherwise fail with LookupError when the
# preprocessing cell runs. Older releases keep using 'punkt'.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Read the training and test CSV files from the current working directory
train_data = pd.read_csv(filepath_or_buffer='bugs-train.csv')
test_data = pd.read_csv(filepath_or_buffer='bugs-test.csv')

  from tqdm.autonotebook import tqdm, trange
2024-06-03 20:23:03.945158: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-03 20:23:03.971074: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kanat.ozgen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/kanat.ozgen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/kanat.ozgen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Text preprocessing resources, built once and shared by every preprocess_text call
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalise one summary into a space-joined string of stemmed tokens.

    The text is lower-cased, tokenised with ``word_tokenize``, filtered against
    the English stop-word set, and every remaining token is lemmatised and then
    stemmed.

    Args:
        text: The raw summary. Missing values (``None`` / NaN, which is what
            pandas yields for an empty CSV cell) are treated as empty text;
            any other non-string value is converted with ``str``.

    Returns:
        str: The processed tokens joined by single spaces; ``''`` when no
        token survives.
    """
    if not isinstance(text, str):
        # Without this guard a NaN cell would raise AttributeError on .lower()
        text = '' if pd.isna(text) else str(text)
    tokens = word_tokenize(text.lower())
    filtered_tokens = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in tokens
        if word not in stop_words
    ]
    return ' '.join(filtered_tokens)

# Apply preprocessing: overwrite the 'summary' column of both frames with its
# normalised form (training frame first, then test frame)
for frame in (train_data, test_data):
    frame['summary'] = frame['summary'].apply(preprocess_text)

In [3]:
# Initialize Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Get Sentence-BERT embeddings for train and test data
X_train_bert = model.encode(train_data['summary'].tolist(), show_progress_bar=True, batch_size=64)
X_test_bert = model.encode(test_data['summary'].tolist(), show_progress_bar=True, batch_size=64)

# Map severity to numerical values
severity_mapping = {'trivial': 0, 'enhancement': 1, 'minor': 2, 'normal': 3, 'major': 4, 'blocker': 5, 'critical': 6}
# Only map while the column still holds text labels: mapping an already
# numeric column would turn every value into NaN when this cell is re-run.
if not pd.api.types.is_numeric_dtype(train_data['severity']):
    train_data['severity'] = train_data['severity'].map(severity_mapping)

# Labels that are missing or not in severity_mapping are NaN at this point.
# Calling .dropna() on the Series before assigning it back to the column has no
# effect (pandas realigns on the index and re-inserts the NaN rows), so the
# unlabelled rows are removed here with one mask applied to BOTH the labels and
# the embeddings, keeping them row-aligned.
has_label = train_data['severity'].notna().to_numpy()
n_dropped = int((~has_label).sum())
if n_dropped:
    print(f"Dropping {n_dropped} training rows with a missing or unknown severity label")
y_train = train_data['severity'].to_numpy()[has_label].astype('int64')

# Shuffle the data
X_train_combined, y_train = shuffle(X_train_bert[has_label], y_train, random_state=42)

Batches: 100%|██████████| 2500/2500 [00:14<00:00, 169.67it/s]
Batches: 100%|██████████| 1346/1346 [00:07<00:00, 170.11it/s]


In [4]:
import pandas as pd
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier

# Calculate sample weights ('balanced': inversely proportional to class frequency)
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Device selection.
# The recorded run of this notebook logged "Could not find cuda drivers", so
# training defaults to the CPU histogram algorithm. Set USE_GPU = True on a
# machine with a CUDA-enabled XGBoost build.
USE_GPU = False
xgb_major_version = int(xgb.__version__.split('.')[0])

version_params = {}
if not USE_GPU:
    version_params['tree_method'] = 'hist'
elif xgb_major_version >= 2:
    # XGBoost 2.0 replaced gpu_hist / gpu_id / predictor with `device`
    version_params.update(tree_method='hist', device='cuda')
else:
    version_params.update(tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
if xgb_major_version < 2:
    # Only meaningful on 1.x; newer releases report it as an unused parameter
    version_params['use_label_encoder'] = False

# XGBoost model
xgboost = XGBClassifier(
    eval_metric='mlogloss',
    learning_rate=0.3,
    max_depth=9,
    subsample=0.8,
    colsample_bytree=0.7,
    n_estimators=250,
    random_state=42,
    **version_params,
)

# Train the model with XGBoost only
xgboost.fit(X_train_combined, y_train, sample_weight=sample_weights)

# Make predictions on the test set
test_predictions = xgboost.predict(X_test_bert)

# Convert numerical predictions back to severity strings
severity_mapping_inverse = {v: k for k, v in severity_mapping.items()}
test_data['severity'] = [severity_mapping_inverse[int(pred)] for pred in test_predictions]

# Prepare the result file in the format of the sample solution
# NOTE(review): the logistic-regression cell writes to the same path and
# overwrites this file when both cells are run — confirm which model is
# meant to produce the final submission.
result = test_data[['bug_id', 'severity']].copy()
result_path = 'bugs-pred.csv'
result.to_csv(result_path, index=False)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[CV] END ................C=0.001, max_iter=300, solver=lbfgs; total time=  17.8s
[CV] END ................C=0.001, max_iter=300, solver=lbfgs; total time=  18.7s
[CV] END ................C=0.001, max_iter=300, solver=lbfgs; total time=  18.7s
[CV] END ................C=0.001, max_iter=100, solver=lbfgs; total time=  19.3s
[CV] END ................C=0.001, max_iter=200, solver=lbfgs; total time=  19.9s
[CV] END ................C=0.001, max_iter=200, solver=lbfgs; total time=  20.1s
[CV] END ................C=0.001, max_iter=300, solver=lbfgs; total time=  20.1s
[CV] END ................C=0.001, max_iter=100, solver=lbfgs; total time=  20.4s
[CV] END ................C=0.001, max_iter=500, solver=lbfgs; total time=  19.6s
[CV] END ................C=0.001, max_iter=500, solver=lbfgs; total time=  19.8s
[CV] END ................C=0.001, max_iter=200, solver=lbfgs; total time=  21.7s
[CV] END ................C=0.001, max_iter=100, solver=lbfgs; total time=  22.6s
[CV] END ................C=0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ..................C=0.1, max_iter=100, solver=lbfgs; total time= 1.7min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ..................C=0.1, max_iter=100, solver=lbfgs; total time= 1.7min
[CV] END .............C=0.01, max_iter=300, solver=liblinear; total time= 2.6min
[CV] END .............C=0.01, max_iter=200, solver=liblinear; total time= 2.9min
[CV] END .............C=0.01, max_iter=200, solver=liblinear; total time= 3.0min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ..................C=0.1, max_iter=100, solver=lbfgs; total time= 2.0min
[CV] END .............C=0.01, max_iter=300, solver=liblinear; total time= 2.8min
[CV] END .............C=0.01, max_iter=200, solver=liblinear; total time= 2.9min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ..................C=0.1, max_iter=100, solver=lbfgs; total time= 2.1min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ..................C=0.1, max_iter=100, solver=lbfgs; total time= 2.0min
[CV] END .............C=0.01, max_iter=300, solver=liblinear; total time= 2.9min
[CV] END .............C=0.01, max_iter=200, solver=liblinear; total time= 3.1min
[CV] END .............C=0.01, max_iter=300, solver=liblinear; total time= 3.1min
[CV] END .............C=0.01, max_iter=300, solver=liblinear; total time= 3.1min
[CV] END .............C=0.01, max_iter=500, solver=liblinear; total time= 2.8min
[CV] END .............C=0.01, max_iter=200, solver=liblinear; total time= 3.3min
[CV] END .............C=0.01, max_iter=500, solver=liblinear; total time= 2.7min
[CV] END .............C=0.01, max_iter=500, solver=liblinear; total time= 3.0min
[CV] END .............C=0.01, max_iter=500, solver=liblinear; total time= 3.1min
[CV] END .............C=0.01, max_iter=500, solver=liblinear; total time= 3.3min
[CV] END ..................C=0.1, max_iter=200, solver=lbfgs; total time= 2.5min
[CV] END ..................C

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=100, solver=lbfgs; total time= 1.9min
[CV] END ..............C=0.1, max_iter=100, solver=liblinear; total time= 5.2min
[CV] END ..................C=0.1, max_iter=500, solver=lbfgs; total time= 3.0min
[CV] END ..............C=0.1, max_iter=100, solver=liblinear; total time= 5.3min
[CV] END ..............C=0.1, max_iter=100, solver=liblinear; total time= 5.5min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=100, solver=lbfgs; total time= 1.9min
[CV] END ..............C=0.1, max_iter=100, solver=liblinear; total time= 5.4min
[CV] END ..............C=0.1, max_iter=100, solver=liblinear; total time= 5.6min
[CV] END ..............C=0.1, max_iter=200, solver=liblinear; total time= 5.0min
[CV] END ..............C=0.1, max_iter=200, solver=liblinear; total time= 5.3min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=100, solver=lbfgs; total time= 1.8min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=100, solver=lbfgs; total time= 1.9min
[CV] END ..............C=0.1, max_iter=200, solver=liblinear; total time= 4.8min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=100, solver=lbfgs; total time= 1.9min
[CV] END ..............C=0.1, max_iter=200, solver=liblinear; total time= 5.5min
[CV] END ..............C=0.1, max_iter=300, solver=liblinear; total time= 5.1min
[CV] END ..............C=0.1, max_iter=200, solver=liblinear; total time= 5.5min
[CV] END ..............C=0.1, max_iter=300, solver=liblinear; total time= 5.3min
[CV] END ..............C=0.1, max_iter=300, solver=liblinear; total time= 5.3min
[CV] END ..............C=0.1, max_iter=300, solver=liblinear; total time= 5.3min
[CV] END ..............C=0.1, max_iter=500, solver=liblinear; total time= 5.1min
[CV] END ..............C=0.1, max_iter=300, solver=liblinear; total time= 5.6min
[CV] END ..............C=0.1, max_iter=500, solver=liblinear; total time= 5.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=200, solver=lbfgs; total time= 3.6min
[CV] END ..............C=0.1, max_iter=500, solver=liblinear; total time= 5.7min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=200, solver=lbfgs; total time= 3.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=200, solver=lbfgs; total time= 3.5min
[CV] END ..............C=0.1, max_iter=500, solver=liblinear; total time= 5.5min
[CV] END ..............C=0.1, max_iter=500, solver=liblinear; total time= 5.6min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=200, solver=lbfgs; total time= 3.6min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=200, solver=lbfgs; total time= 3.6min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=100, solver=lbfgs; total time= 1.7min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=300, solver=lbfgs; total time= 4.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=100, solver=lbfgs; total time= 1.8min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=300, solver=lbfgs; total time= 4.7min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=300, solver=lbfgs; total time= 4.9min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=300, solver=lbfgs; total time= 5.1min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, max_iter=300, solver=lbfgs; total time= 5.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=100, solver=lbfgs; total time= 1.7min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=100, solver=lbfgs; total time= 1.7min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=100, solver=lbfgs; total time= 1.9min
[CV] END ....................C=1, max_iter=500, solver=lbfgs; total time= 6.0min
[CV] END ....................C=1, max_iter=500, solver=lbfgs; total time= 6.2min
[CV] END ................C=1, max_iter=100, solver=liblinear; total time= 9.8min
[CV] END ....................C=1, max_iter=500, solver=lbfgs; total time= 6.9min
[CV] END ................C=1, max_iter=100, solver=liblinear; total time= 9.7min
[CV] END ....................C=1, max_iter=500, solver=lbfgs; total time= 6.7min
[CV] END ................C=1, max_iter=100, solver=liblinear; total time=10.0min
[CV] END ................C=1, max_iter=100, solver=liblinear; total time=10.6min
[CV] END ................C=1, max_iter=100, solver=liblinear; total time=10.5min
[CV] END ................C=1, max_iter=200, solver=liblinear; total time= 9.5min
[CV] END ................C=1, max_iter=200, solver=liblinear; total time=10.1min
[CV] END ................C=1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=200, solver=lbfgs; total time= 3.7min
[CV] END ................C=1, max_iter=200, solver=liblinear; total time=10.2min
[CV] END ................C=1, max_iter=200, solver=liblinear; total time=10.5min
[CV] END ....................C=1, max_iter=500, solver=lbfgs; total time= 7.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=200, solver=lbfgs; total time= 3.7min
[CV] END ................C=1, max_iter=300, solver=liblinear; total time= 9.8min
[CV] END ................C=1, max_iter=300, solver=liblinear; total time= 9.6min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=200, solver=lbfgs; total time= 3.5min
[CV] END ................C=1, max_iter=300, solver=liblinear; total time=10.0min
[CV] END ................C=1, max_iter=300, solver=liblinear; total time=10.0min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=200, solver=lbfgs; total time= 3.7min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=200, solver=lbfgs; total time= 3.6min
[CV] END ................C=1, max_iter=300, solver=liblinear; total time=10.8min
[CV] END ................C=1, max_iter=500, solver=liblinear; total time= 9.9min
[CV] END ................C=1, max_iter=500, solver=liblinear; total time= 9.7min
[CV] END ................C=1, max_iter=500, solver=liblinear; total time=10.2min
[CV] END ................C=1, max_iter=500, solver=liblinear; total time=10.3min
[CV] END ................C=1, max_iter=500, solver=liblinear; total time=10.8min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=300, solver=lbfgs; total time= 4.6min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=300, solver=lbfgs; total time= 4.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=300, solver=lbfgs; total time= 4.2min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=300, solver=lbfgs; total time= 5.5min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, max_iter=300, solver=lbfgs; total time= 5.0min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ..................C=100, max_iter=100, solver=lbfgs; total time= 1.9min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ..................C=100, max_iter=100, solver=lbfgs; total time= 1.6min


KeyboardInterrupt: 

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_sample_weight
import pandas as pd

# Per-sample weights using the 'balanced' class-weight scheme
sample_weights = compute_sample_weight(y=y_train, class_weight='balanced')

# Logistic Regression with fixed hyper-parameters (no search is run in this cell)
logreg = LogisticRegression(solver='liblinear', C=1, max_iter=500, random_state=42)

# Train on the shuffled training embeddings
logreg.fit(X_train_combined, y_train, sample_weight=sample_weights)

# Predict severity codes for the test embeddings
test_predictions = logreg.predict(X_test_bert)

# Translate the numeric codes back into severity labels
severity_mapping_inverse = dict((code, label) for label, code in severity_mapping.items())
test_data['severity'] = list(map(severity_mapping_inverse.__getitem__, test_predictions))

# Write the (bug_id, severity) pairs in the sample-solution format
result_path = 'bugs-pred.csv'
result = test_data[['bug_id', 'severity']].copy()
result.to_csv(result_path, index=False)

# No grid search is performed here, so there are no best parameters or score to report
print("Model trained with C=1, max_iter=500, solver='liblinear'")


Model trained with C=1, max_iter=500, solver='liblinear'


In [6]:
# Distribution of the predicted severity labels over the test set
test_data.loc[:, "severity"].value_counts()

normal         18743
enhancement    15495
critical       12229
major          11237
trivial        10487
blocker         9453
minor           8450
Name: severity, dtype: int64

: 