Model with results: Public 0.72 - Private 0.55


In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sentence_transformers import SentenceTransformer
from sklearn.utils import shuffle
import nltk

# Load NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load data
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# Map severity to numerical values.
# This happens BEFORE the embeddings are computed so that rows whose label is
# missing from `severity_mapping` are removed from the frame itself, keeping
# features and labels the same length. Assigning `Series.map(...).dropna()`
# back to a column drops nothing: pandas re-aligns on the index and the
# removed rows come back as NaN.
severity_mapping = {'trivial': 0, 'enhancement': 1, 'minor': 2, 'normal': 3, 'major': 4, 'blocker': 5, 'critical': 6}
train_data['severity'] = train_data['severity'].map(severity_mapping)
train_data = train_data.dropna(subset=['severity']).reset_index(drop=True)
# With NaN gone the column can be cast back to integer class ids
train_data['severity'] = train_data['severity'].astype(int)
y_train = train_data['severity'].values

# Initialize Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Get Sentence-BERT embeddings for train and test data
X_train_bert = model.encode(train_data['summary'].tolist(), show_progress_bar=True, batch_size=64)
X_test_bert = model.encode(test_data['summary'].tolist(), show_progress_bar=True, batch_size=64)

# Shuffle features and labels together (fixed seed for reproducibility)
X_train_combined, y_train = shuffle(X_train_bert, y_train, random_state=42)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.utils.class_weight import compute_sample_weight
import pandas as pd

# Per-sample weights derived from the label distribution (class_weight='balanced')
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Candidate hyper-parameters for Logistic Regression
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'liblinear'],
    max_iter=[100, 200, 300, 500],
)

# Base estimator handed to the search
logreg = LogisticRegression(random_state=42)

# Exhaustive 5-fold search scored by accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search; the sample weights are forwarded to the estimator's fit
grid_search.fit(X_train_combined, y_train, sample_weight=sample_weights)

# Keep the estimator refit with the best parameter combination
best_logreg = grid_search.best_estimator_

# Predict numeric severity ids for the test embeddings
test_predictions = best_logreg.predict(X_test_bert)

# Translate numeric ids back to their severity names
severity_mapping_inverse = dict(zip(severity_mapping.values(), severity_mapping.keys()))
test_data['severity'] = list(map(severity_mapping_inverse.__getitem__, test_predictions))

# Write bug_id/severity pairs to the prediction file
result_path = 'bugs-pred.csv'
result = test_data.loc[:, ['bug_id', 'severity']].copy()
result.to_csv(result_path, index=False)

# Report what the search selected and how it scored in cross-validation
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_sample_weight
import pandas as pd

# Per-sample weights derived from the label distribution (class_weight='balanced')
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Logistic Regression with hand-picked hyper-parameters (no search in this cell)
logreg = LogisticRegression(
    C=1,
    solver='liblinear',
    max_iter=500,
    random_state=42,
)

# Train on the shuffled embeddings with the sample weights applied
logreg.fit(X_train_combined, y_train, sample_weight=sample_weights)

# Predict numeric severity ids for the test embeddings
test_predictions = logreg.predict(X_test_bert)

# Translate numeric ids back to their severity names
severity_mapping_inverse = dict(zip(severity_mapping.values(), severity_mapping.keys()))
test_data['severity'] = list(map(severity_mapping_inverse.__getitem__, test_predictions))

# Write bug_id/severity pairs to the prediction file
result_path = 'bugs-pred.csv'
result = test_data.loc[:, ['bug_id', 'severity']].copy()
result.to_csv(result_path, index=False)

# No search was run here, so only the fixed configuration is reported
print("Model trained with C=1, max_iter=500, solver='liblinear'")


In [None]:
# Distribution of the predicted severity labels written to test_data
test_data.loc[:, "severity"].value_counts()

Model with results: Public 0.70 - Private 0.60


In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sentence_transformers import SentenceTransformer
from sklearn.utils import shuffle
import nltk
from sklearn.metrics import classification_report, confusion_matrix

# Download the NLTK resources used by the preprocessing step
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Read the training and test CSV files
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('bugs-train.csv', 'bugs-test.csv')
)

# Shared objects for text preprocessing: English stop words, lemmatizer, stemmer
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalize a summary: lower-case, tokenize, drop stop words, lemmatize then stem.

    Parameters
    ----------
    text : str
        Raw summary text. Any non-string value (for example ``None`` or the
        float NaN pandas produces for an empty CSV cell) is treated as an
        empty summary instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The surviving tokens, lemmatized and stemmed, joined by single spaces.
        Empty string for non-string input.
    """
    # Guard against missing values so DataFrame.apply does not crash mid-column
    if not isinstance(text, str):
        return ''
    tokens = word_tokenize(text.lower())
    filtered_tokens = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in tokens
        if word not in stop_words
    ]
    return ' '.join(filtered_tokens)

# Map severity to numerical values.
# This happens BEFORE preprocessing/encoding so that rows whose label is
# missing from `severity_mapping` are removed from the frame itself, keeping
# features and labels the same length. Assigning `Series.map(...).dropna()`
# back to a column drops nothing: pandas re-aligns on the index and the
# removed rows come back as NaN.
severity_mapping = {'trivial': 0, 'enhancement': 1, 'minor': 2, 'normal': 3, 'major': 4, 'blocker': 5, 'critical': 6}
train_data['severity'] = train_data['severity'].map(severity_mapping)
train_data = train_data.dropna(subset=['severity']).reset_index(drop=True)
# With NaN gone the column can be cast back to integer class ids
train_data['severity'] = train_data['severity'].astype(int)
y_train = train_data['severity'].values

# Apply preprocessing
train_data['summary'] = train_data['summary'].apply(preprocess_text)
test_data['summary'] = test_data['summary'].apply(preprocess_text)

# Initialize Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Get Sentence-BERT embeddings for train and test data
X_train_bert = model.encode(train_data['summary'].tolist(), show_progress_bar=True, batch_size=64)
X_test_bert = model.encode(test_data['summary'].tolist(), show_progress_bar=True, batch_size=64)

# Shuffle features and labels together (fixed seed for reproducibility)
X_train_combined, y_train = shuffle(X_train_bert, y_train, random_state=42)

# Model training with XGBoost: randomized search over a small parameter space
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
param_dist_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}
random_search_xgb = RandomizedSearchCV(xgb_model, param_distributions=param_dist_xgb, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_xgb.fit(X_train_combined, y_train)

# Make predictions on the test set
test_predictions = random_search_xgb.predict(X_test_bert)

# Convert numerical predictions back to severity strings
severity_mapping_inverse = {v: k for k, v in severity_mapping.items()}
test_data['severity'] = [severity_mapping_inverse[pred] for pred in test_predictions]

# Evaluate model performance.
# `y_test` is never defined in this notebook, so a classification report on
# the test predictions cannot be computed (it raised NameError). Report the
# cross-validated result of the search instead.
print("Best parameters found: ", random_search_xgb.best_params_)
print("Best cross-validation score: ", random_search_xgb.best_score_)

# Prepare the result file in the format of the sample solution.
# The id column is 'bug_id', as used by every other cell of this notebook.
result = test_data[['bug_id', 'severity']].copy()
result_path = 'bugs-pred.csv'
result.to_csv(result_path, index=False)


Model with results: Public 0.67 - Private 0.67


In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sentence_transformers import SentenceTransformer
from sklearn.utils import shuffle
from sklearn.svm import SVC

# Seed NumPy's global random generator
np.random.seed(42)

# Download the NLTK resources used by the preprocessing step
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Read the training and test CSV files
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('bugs-train (1).csv', 'bugs-test (1).csv')
)

# Shared objects for text preprocessing: English stop words, lemmatizer, stemmer
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalize a summary: lower-case, tokenize, drop stop words, lemmatize then stem.

    Parameters
    ----------
    text : str
        Raw summary text. Any non-string value (for example ``None`` or the
        float NaN pandas produces for an empty CSV cell) is treated as an
        empty summary instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The surviving tokens, lemmatized and stemmed, joined by single spaces.
        Empty string for non-string input.
    """
    # Guard against missing values so DataFrame.apply does not crash mid-column
    if not isinstance(text, str):
        return ''
    tokens = word_tokenize(text.lower())
    filtered_tokens = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in tokens
        if word not in stop_words
    ]
    return ' '.join(filtered_tokens)

# Map severity names to integer class ids.
# This happens BEFORE preprocessing/encoding so that rows whose label is
# missing from `severity_mapping` are removed from the frame itself, keeping
# features and labels the same length. Assigning `Series.map(...).dropna()`
# back to a column drops nothing: pandas re-aligns on the index and the
# removed rows come back as NaN.
severity_mapping = {'trivial': 0, 'enhancement': 1, 'minor': 2, 'normal': 3, 'major': 4, 'blocker': 5, 'critical': 6}
train_data['severity'] = train_data['severity'].map(severity_mapping)
train_data = train_data.dropna(subset=['severity']).reset_index(drop=True)
train_data['severity'] = train_data['severity'].astype(int)
y_train = train_data['severity'].values

# Normalize the summaries
train_data['summary'] = train_data['summary'].apply(preprocess_text)
test_data['summary'] = test_data['summary'].apply(preprocess_text)

# Sentence-BERT embeddings for train and test summaries
model = SentenceTransformer('all-MiniLM-L6-v2')

X_train_bert = model.encode(train_data['summary'].tolist(), show_progress_bar=True, batch_size=64)
X_test_bert = model.encode(test_data['summary'].tolist(), show_progress_bar=True, batch_size=64)

# Shuffle features and labels together (fixed seed for reproducibility)
X_train_combined, y_train = shuffle(X_train_bert, y_train, random_state=42)

# Hold out the validation rows BEFORE fitting any preprocessing, so the scaler
# and PCA never see validation data (fitting them on all rows first leaks
# validation statistics into the features the model is evaluated on).
X_train_part, X_val_part, y_train, y_val = train_test_split(X_train_combined, y_train, test_size=0.2, random_state=42)

# Fit scaling and PCA on the training part only ...
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_part)
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)

# ... then apply the fitted transforms to the validation and test features
X_val_pca = pca.transform(scaler.transform(X_val_part))
X_test_scaled = scaler.transform(X_test_bert)
X_test_pca = pca.transform(X_test_scaled)

# Base learners combined by the voting ensemble
random_forest = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
xgboost = XGBClassifier(objective='multi:softmax', num_class=7, eval_metric='mlogloss', use_label_encoder=False, random_state=42)
svm = SVC(kernel='rbf', C=1, random_state=42)

# Hard voting: each base learner casts one vote per sample
ensemble_model = VotingClassifier(estimators=[
    ('RandomForest', random_forest),
    ('XGBoost', xgboost),
    ('SVM', svm)
], voting='hard', n_jobs=-1)

print("Ensemble modelini eğitme...")
ensemble_model.fit(X_train_pca, y_train)
y_pred_val = ensemble_model.predict(X_val_pca)

print("Ensemble modeli performansı:")
# Pass `labels` explicitly so the seven target names always line up with the
# seven class ids. Without it, classification_report raises when a class is
# absent from both y_val and y_pred_val (name count != label count).
# zero_division=0 keeps the default value for empty classes but silences the warning.
print(classification_report(
    y_val,
    y_pred_val,
    labels=list(severity_mapping.values()),
    target_names=list(severity_mapping.keys()),
    zero_division=0,
))

# Macro-averaged validation metrics collected for the summary table
precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred_val, average='macro')
results = {
    'Ensemble': {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
}

y_pred_test = ensemble_model.predict(X_test_pca)

# Reload the test file so the submission is built from the unmodified frame
test_data = pd.read_csv('bugs-test (1).csv')
test_data['Ensemble_severity'] = y_pred_test
# Translate numeric ids back to their severity names
reverse_severity_mapping = {v: k for k, v in severity_mapping.items()}
test_data['Ensemble_severity'] = test_data['Ensemble_severity'].map(reverse_severity_mapping)

# NOTE(review): the other cells export this column as 'severity'; confirm
# which header the submission format expects.
submission = test_data[['bug_id', 'Ensemble_severity']]
submission.to_csv('submission.csv', index=False)

results_df = pd.DataFrame(results)
print(results_df)

print(submission.head())