In [1]:
import os
import javalang
from javalang import tree as jtree
from collections import Counter

import numpy as np
import pandas as pd
import re

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import TensorDataset, DataLoader

### Preprocessing data, including source files
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.model_selection import RepeatedStratifiedKFold


In [2]:
root_path  = "C:/Users/ACER/Downloads/DATASET"
project = ['ant', 'camel','jedit', 'log4j', 'lucene','poi', 'synapse', 'velocity', 'xalan', 'xerces']
project_lists = { 'xalan': ['2.5', '2.6', '2.7'],
    'xerces': ['1.1', '1.2', '1.3', '1.4.4'],
    'ant': ['1.3', '1.4', '1.5', '1.6', '1.7'],
    'camel': ['1.0', '1.2', '1.4', '1.6'],
    'jedit': ['3.2', '4.0', '4.1', '4.2', '4.3'],
    'log4j': ['1.0', '1.1', '1.2'],
    'lucene': ['2.0', '2.2', '2.4'],
    'poi': ['1.5', '2.0', '2.5', '3.0'],
    'synapse': ['1.0', '1.1', '1.2'],
    'velocity': ['1.4', '1.5', '1.6']
}

source  = "C:/Users/ACER/Downloads/DATASET/promise"
bug_repo = "C:/Users/ACER/Downloads/DATASET/software-metrics"
print(root_path)


C:/Users/ACER/Downloads/DATASET


Xu li file java: Ở đoạn này các đoạn code được chuyển thành cây AST, sau đó ta xử lí và phân loại từng node 

In [3]:
def parse_java_file(file_path):
    try:
        with open(file_path, 'r', encoding = 'utf-8', errors = 'ignore') as f:
            content  = f.read()
        tree = javalang.parse.parse(content)
        tokens = []

        for path, node in tree:
            node_type = type(node).__name__

            if node_type  in ["MethodInvocation", "SuperMethodInvocation"]:
                tokens.append(node.member) #Lấy tên phương thức được gọi (println, add,....)
            elif node_type == "ClassCreator":
                tokens.append(f"new_{node.type.name}")#Trường hợp code tạo object( new classname....)
            elif node_type in ['FormalParameter', 'BasicType', 'PackageDeclaration', 'InterfaceDeclaration', 
                              'CatchClauseParameter', 'ClassDeclaration', 'MemberReference', 'SuperMemberReference', 
                              'ConstructorDeclaration', 'ReferenceType', 'MethodDeclaration', 'VariableDeclarator',
                              'IfStatement', 'WhileStatement', 'DoStatement', 'ForStatement', 'AssertStatement', 
                              'BreakStatement', 'ContinueStatement', 'ReturnStatement', 'ThrowStatement', 
                              'SynchronizedStatement', 'TryStatement', 'SwitchStatement', 'BlockStatement',
                              'StatementExpression', 'TryResource', 'CatchClause', 'SwitchStatementCase', 
                              'ForControl', 'EnhancedForControl']:
                tokens.append(f"<{node_type}>") #Các trường hợp của code quan trọng còn lại như khai báo
                #biến, vòng lặp, mệnh đề so sánh,.... thay vì đánh dấu chung thì gọi cụ thể, đánh dấu bằng node type
                #của nó luôn

        return tokens
    except Exception as e:
        return []

Xử lý token từ data: duyệt qua toàn bộ cấu trúc thư mục, chỉ lấy ra các file code java, sử dụng hàm parse_java_file ở trên để biến mỗi file java thành một token vector tương ứng

In [4]:
def extract_tokens_from_dataset(dataset_path):
    all_tokens = []
    file_paths = []
    for root, dirs, files in os.walk(dataset_path):#duyệt toàn bộ cấu trúc thư mục
        #dataset, bắt đầu từ thư mục hiện tại, sau đó duyệt qua các thư mục con dirs
        #và cuối cùng duyệt các file trong đó
        for file in files:
            if file.endswith('.java'):#chỉ duyệt những file code java
                file_path = os.path.join(root, file)
                #sau đó trích xuất cây AST từ file java vừa duyệt ra từ file
                tokens = parse_java_file(file_path) #tokens nay la list chua cac token cua 1 file java
                if tokens:
                    all_tokens.append(tokens)
                    file_paths.append(file_path)
    return all_tokens, file_paths


In [5]:
extract_tokens_from_dataset("C:/Users/ACER/Downloads/DATASET/promise/lucene/lucene-2.4/lucene-2.4")
#root_path/promise/f"{project}"/f"{project}-{version}"/f"{project}-{version}"

([['<PackageDeclaration>',
   '<ClassDeclaration>',
   '<ReferenceType>',
   '<VariableDeclarator>',
   '<ReferenceType>',
   '<VariableDeclarator>',
   'new_HashSet',
   '<ReferenceType>',
   '<ReferenceType>',
   '<VariableDeclarator>',
   'new_HashSet',
   '<ReferenceType>',
   '<ConstructorDeclaration>',
   '<StatementExpression>',
   '<MemberReference>',
   'makeStopSet',
   '<MemberReference>',
   '<ConstructorDeclaration>',
   '<FormalParameter>',
   '<ReferenceType>',
   '<StatementExpression>',
   '<MemberReference>',
   'makeStopSet',
   '<MemberReference>',
   '<ConstructorDeclaration>',
   '<FormalParameter>',
   '<ReferenceType>',
   '<StatementExpression>',
   '<MemberReference>',
   'new_HashSet',
   '<ReferenceType>',
   'keySet',
   '<ConstructorDeclaration>',
   '<FormalParameter>',
   '<ReferenceType>',
   '<StatementExpression>',
   '<MemberReference>',
   'getWordSet',
   '<MemberReference>',
   '<MethodDeclaration>',
   '<FormalParameter>',
   '<ReferenceType>',
 

Encoding token vectors thành numerical vectors: hiện tịa mỗi file java đang được biểu diễn là một vector token chữ, bước này sẽ chuyển chúng thành file vector số để xứ lí


In [6]:
# tạo vocab tần suất
def create_vocabulary(all_tokens, min_freq =3):
    #từ mà có tần suất xuất hiện dưới 3 thì loại
    #tạo vocab cho embedding
    token_counter = Counter()

    for tokens in all_tokens:
        token_counter.update(tokens) #đếm số lần xuất hiện của mỗi từ trong vocab

    vocab = ['<PAD>', '<UNK>']
    #voi tokens có mặt trong vocab thì 1, không thì 1
    for token, freq in token_counter.items():
        if freq >= min_freq:
            vocab.append(token)
        
    return vocab


In [7]:
#chuyển token vectors thành numerical vectors
def token_to_sequence(all_tokens, vocab, max_length):
    token_to_idx = {token: idx for idx, token in enumerate(vocab)}
    pad_idx = 0 #<PAD>
    unk_idx = 1 #<UNK>

    sequences  = []

    for tokens in all_tokens:
        sequence = [ token_to_idx. get(token, unk_idx) for token in tokens[:max_length]]

        if len(sequence) < max_length:
            sequence.extend([pad_idx]  * (max_length - len(sequence)))
        sequences.append(sequence)

    return np.array(sequences)

Kết hợp file bug repo , tạo labels cho từng numerical vectors

In [8]:
def create_labels_from_csv(csv_path, source_file_paths):

    df = pd.read_csv(csv_path)

    bug_dict = {}

    # tạo từ csv
    for _, row in df.iterrows():
        lmao = row['name'].split(".")
        filename = lmao[-2] + '.' + lmao[-1]
        bug_count = row['bug']
        bug_dict[filename] = 1 if bug_count > 0 else 0

    labels = []
    valid_indices = []

    for i, file_path in enumerate(source_file_paths):

        lmau = str(file_path).split('\\')
        filename = lmau[-2] + '.' + lmau[-1]
        filename_no_ext = filename.replace('.java', '')

        if filename_no_ext in bug_dict:
            labels.append(bug_dict[filename_no_ext])
            valid_indices.append(i)

    return np.array(labels), valid_indices

Extract feature

In [9]:
def extract_traditional_features(file_paths, indexs_valid):
    features = []
    
    for idx in indexs_valid:
        file_path =  file_paths[idx]
        try:
            with open(file_path, 'r', encoding  ='utf-8', errors = 'ignore') as f:
                content = f.read()
            lines = content.split('\n')

            #basic metrics
            loc = len(lines)
            blank_lines = len([line for line in lines if not line.strip()])


            #comments (single + multi _ line)
            single_comments = len([line for line in lines if line.strip().startswith('//')])
            multi_comments = len(re.findall(r'/\*.*?\*/', content, re.DOTALL))
            comment_lines = single_comments + multi_comments

            #More accurate complexity metrics
            if_count = len(re.findall(r'\bif\s*\(', content))
            for_count = len(re.findall(r'\bcount\s*\(', content))
            while_count =  len(re.findall(r'\bwhile\s*\(', content))
            switch_count = len(re.findall(r'\bswitch\s*\(', content))
            catch_count =  len(re.findall(r'\bcatch\s*\(', content))

            #logical operators
            and_count = len(re.findall(r'&&', content))
            or_count = len(re.findall(r'\|\|', content))

            cyclomatic_complexity =  (if_count + for_count + while_count + switch_count
                                      + catch_count + and_count + or_count  +1)
            
            #more accurate method count
            method_pattern = r'(public|private|protected|\s)\s+[\w<>\[\]]+\s+\w+\s*\([^)]*\)\s*\{'
            method_count = len(re.findall(method_pattern, content))

            #Additional metrics
            class_count  =  len(re.findall(r'\bclass\s+\w+', content))
            interface_count =  len(re.findall(r'\binterface\s+\w+', content))

            features.append([
                loc, comment_lines, blank_lines,
                cyclomatic_complexity, method_count, class_count,
                if_count, for_count, while_count, switch_count,
                and_count, or_count
            ])
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            features.append([0] * 22)

    return np.array(features)

Model Training  -  DP CNN

In [10]:
class DP_CNN(nn.Module) :
    def __init__(self, traditional_feature_dim, vocab_size, max_seq_len):
        #traditional_feature_dim: cacs feature co ban co san
        #vocab_size = kich thuoc tu dien, so luong tu ma co the hieu de embedding
        #max_seq_len: do dai cua token list dai nhat ma 1 sample file code co
        #định nghĩa các layer có trong mô hình 
        super(DP_CNN, self).__init__()

        # Lớp embedding chuyển các token đoạn code thành vector số
        self.embedding  =  nn.Embedding(num_embeddings =  vocab_size, embedding_dim= 30 )

        #CNN - gồm 1 lớp tích chập
        self.conv1 = nn.Conv1d(in_channels= 30, out_channels=10, kernel_size = 5, padding = 2)
        self.bn1 = nn.BatchNorm1d(10)
        self.relu = nn.ReLU()
        self.pooling =  nn.MaxPool1d(max_seq_len)

        self.linear1 =   nn.Linear(10, 100)
        self.bn2 = nn.BatchNorm1d(100)
        self.linear2  = nn.Linear(traditional_feature_dim + 100, 1)

    def forward(self, traditional_features, token_sequences):
        embedded =  self.embedding(token_sequences)
        embedded = embedded.permute(0,2,1)
        conv_out =  self.relu(self.bn1(self.conv1(embedded)))
        pooled  = self.pooling(conv_out)
        pooled  =pooled.squeeze(2) 
        cnn_features = self.relu(self.bn2(self.linear1(pooled)))
        combined =  torch.cat((traditional_features, cnn_features), dim = 1)
        output =  self.linear2(combined)

        return output

In [11]:
!pip install torchinfo


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In kiến trúc model

In [12]:
from torchinfo import summary
import torch

model =  DP_CNN(traditional_feature_dim= 50, vocab_size=5000, max_seq_len=100)

#tao tensor gai co batch_size  =32
traditional_features = torch.randn(32, 50)
token_sequences = torch.randint(0, 5000, (32, 100))

summary(
    model,
    input_data =  (traditional_features, token_sequences),
    col_names = ("input_size", "output_size", "num_params", "kernel_size", "trainable")
)


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Kernel Shape              Trainable
DP_CNN                                   [32, 50]                  [32, 1]                   --                        --                        True
├─Embedding: 1-1                         [32, 100]                 [32, 100, 30]             150,000                   --                        True
├─Conv1d: 1-2                            [32, 30, 100]             [32, 10, 100]             1,510                     [5]                       True
├─BatchNorm1d: 1-3                       [32, 10, 100]             [32, 10, 100]             20                        --                        True
├─ReLU: 1-4                              [32, 10, 100]             [32, 10, 100]             --                        --                        --
├─MaxPool1d: 1-5                         [32, 10, 100]             [32, 10, 1]               --  

In [13]:


def train_process(X_train, y_train, traditional_train, X_test, y_test, traditional_test, vocab_size, MAX_LENGTH):
    # Debug: In ra kích thước các mảng đầu vào
    print("Kích thước X_train:", np.shape(X_train))
    print("Kích thước X_test:", np.shape(X_test))
    print("Kích thước traditional_train:", np.shape(traditional_train))
    print("Kích thước traditional_test:", np.shape(traditional_test))
    print("Kích thước y_train:", np.shape(y_train))
    print("Kích thước y_test:", np.shape(y_test))
    

    # Đảm bảo dữ liệu là 2D array
    traditional_train = np.array(traditional_train, dtype=np.float32)
    traditional_test = np.array(traditional_test, dtype=np.float32)
    y_train = np.array(y_train, dtype=np.float32)
    y_test = np.array(y_test, dtype=np.float32)
    

    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    model = DP_CNN(
        traditional_feature_dim=12,  # Số lượng traditional features (12)
        vocab_size=vocab_size,
        max_seq_len=MAX_LENGTH
    ).to(device)
    
    # Chuẩn hóa traditional features
    scaler = MinMaxScaler()
    traditional_train_norm = scaler.fit_transform(traditional_train)
    traditional_test_norm = scaler.transform(traditional_test)
    
    # Chuyển thành tensor
    X_train_tensor = torch.LongTensor(X_train).to(device)
    X_test_tensor = torch.LongTensor(X_test).to(device)
    trad_train_tensor = torch.FloatTensor(traditional_train_norm).to(device)
    trad_test_tensor = torch.FloatTensor(traditional_test_norm).to(device)
    y_train_tensor = torch.FloatTensor(y_train).to(device)
    y_test_tensor = torch.FloatTensor(y_test).to(device)
    
    # DataLoader
    train_dataset = TensorDataset(X_train_tensor, trad_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_dataset = TensorDataset(X_test_tensor, trad_test_tensor, y_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    
    num_epochs = 25
    train_losses = []
    
    for epoch in range(num_epochs):
        model.train()
        epoch_losses = []
        all_preds = []
        all_labels = []
        
        for seq_batch, trad_batch, labels_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(trad_batch, seq_batch)
            loss = criterion(outputs.squeeze(), labels_batch)
            loss.backward()
            optimizer.step()
            
            epoch_losses.append(loss.item())
            preds = (torch.sigmoid(outputs) > 0.5).float().squeeze()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels_batch.cpu().numpy())
        
        avg_loss = np.mean(epoch_losses)
        accuracy = accuracy_score(all_labels, all_preds)
        
        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1:2d}/{num_epochs}: Loss={avg_loss:.4f}, Acc={accuracy:.4f}")
    
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for seq_batch, trad_batch, labels_batch in test_loader:
            outputs = model(trad_batch, seq_batch)
            preds = (torch.sigmoid(outputs) > 0.5).float().squeeze()
            
            preds_np = preds.cpu().numpy()
            labels_np = labels_batch.cpu().numpy()
            
            if preds_np.ndim == 0:
                preds_np = np.array([preds_np])
            if labels_np.ndim == 0:
                labels_np = np.array([labels_np])
            
            all_preds.extend(preds_np)
            all_labels.extend(labels_np)
        
        accuracy = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds)
        
        print(f"Test Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")
        return f1, accuracy

In [14]:
def Coral(X_train, X_test, reg   = 1e-6):
    #covariance source va target
    cov_train  = np.cov(X_train, rowvar = False) + reg * np.eye(X_train.shape[1]) 
    cov_test = np.cov(X_test, rowvar= False) + reg*np.eye(X_test.shape[1])
    # Whitening: source -> identity
    U_train, S_train, _ = np.linalg.svd(cov_train)
    cov_train_inv_sqrt = (U_train @ np.diag(1.0 / np.sqrt(S_train)) @ U_train.T)

    # Coloring: identity -> target
    U_test, S_test, _ = np.linalg.svd(cov_test)
    cov_test_sqrt = (U_test @ np.diag(np.sqrt(S_test)) @ U_test.T)

    # Transform source
    X_train_new = (X_train @ cov_train_inv_sqrt) @ cov_test_sqrt
    return X_train_new

In [15]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler



def train_in_project(project_name, versions, versions_next):
    all_acc = []
    all_f1 = []
    
    print(f'Train on {project_name}-{versions}:')
    TRAIN_PATH = f"{source}/{project_name}/{project_name}-{versions}/{project_name}-{versions}"
    TEST_PATH = f"{source}/{project_name}/{project_name}-{versions_next}/{project_name}-{versions_next}"
    TRAIN_LABELS = f"{bug_repo}/{project_name}/{project_name}-{versions}.csv"
    TEST_LABELS = f"{bug_repo}/{project_name}/{project_name}-{versions_next}.csv"
    print(f'Train source path: {TRAIN_PATH}')
    print(f'Test source path: {TEST_PATH}')
    print(f'Train labels: {TRAIN_LABELS}')
    print(f'Test labels: {TEST_LABELS}')

    train_tokens, train_files = extract_tokens_from_dataset(TRAIN_PATH)
    test_tokens, test_files = extract_tokens_from_dataset(TEST_PATH)
    

    
    vocab = create_vocabulary(train_tokens, min_freq=3)
    vocab_size = len(vocab)
    MAX_LENGTH = 2000
    X_train_seq = token_to_sequence(train_tokens, vocab, MAX_LENGTH)
    X_test_seq = token_to_sequence(test_tokens, vocab, MAX_LENGTH)
    y_train, train_valid_indices = create_labels_from_csv(TRAIN_LABELS, train_files)
    y_test, test_valid_indices = create_labels_from_csv(TEST_LABELS, test_files)
    X_train_final = X_train_seq[train_valid_indices]
    X_test_final = X_test_seq[test_valid_indices]
    

    traditional_train = extract_traditional_features(train_files, train_valid_indices)
    traditional_test = extract_traditional_features(test_files, test_valid_indices)
    
    # Chuẩn hóa traditional features trước SMOTE
    scaler = MinMaxScaler()
    traditional_train = np.array(traditional_train, dtype=np.float32)
    traditional_test = np.array(traditional_test, dtype=np.float32)
    y_train = np.array(y_train, dtype=np.float32)
    traditional_train_norm = scaler.fit_transform(traditional_train)
    
    
    smote = SMOTE()
    try:
        traditional_train_smote, y_train_smote = smote.fit_resample(traditional_train_norm, y_train)
        
        # Đồng bộ token sequences
        n_samples_smote = len(y_train_smote)
        random_indices = np.random.choice(len(X_train_final), size=n_samples_smote, replace=True)
        X_train_smote = X_train_final[random_indices]
        
        X_train_final_cc = X_train_smote
        y_train_cc = y_train_smote
        traditional_train_cc = traditional_train_smote
        print(f"Original class distribution: {np.bincount(y_train.astype(int))}")
        print(f"After SMOTE: {np.bincount(y_train_smote.astype(int))}")
    except ValueError as e:
        print(f"SMOTE failed for {project_name}-{versions}: {e}. Using original data.")
        X_train_final_cc = X_train_final
        y_train_cc = y_train
        traditional_train_cc = traditional_train_norm
    
    print(f"Train shapes: X_train={X_train_final_cc.shape}, y_train={y_train_cc.shape}, traditional_train={traditional_train_cc.shape}")
    print(f"Test shapes: X_test={X_test_final.shape}, y_test={y_test.shape}, traditional_test={traditional_test.shape}")
    
    result = train_process(
        X_train_final_cc, y_train_cc, traditional_train_cc,
        X_test_final, y_test, traditional_test,
        vocab_size, MAX_LENGTH
    )
    all_f1.append(result[0])
    all_acc.append(result[1])
    return all_f1, all_acc

def train_cross(project_name, project_next):
    all_acc = []
    all_f1 = []
    
    TRAIN_PATH = f"{source}/{project_name}/{project_name}-{project_lists[project_name][-1]}/{project_name}-{project_lists[project_name][-1]}"
    TEST_PATH = f"{source}/{project_next}/{project_next}-{project_lists[project_next][0]}/{project_next}-{project_lists[project_next][0]}"
    TRAIN_LABELS = f"{bug_repo}/{project_name}/{project_name}-{project_lists[project_name][-1]}.csv"
    TEST_LABELS = f"{bug_repo}/{project_next}/{project_next}-{project_lists[project_next][0]}.csv"

    print(f'Train source path: {TRAIN_PATH}')
    print(f'Test source path: {TEST_PATH}')
    print(f'Train labels: {TRAIN_LABELS}')
    print(f'Test labels: {TEST_LABELS}')
    
    train_tokens, train_files = extract_tokens_from_dataset(TRAIN_PATH)
    test_tokens, test_files = extract_tokens_from_dataset(TEST_PATH)
    

    vocab = create_vocabulary(train_tokens, min_freq=3)
    vocab_size = len(vocab)
    MAX_LENGTH = 2000
    X_train_seq = token_to_sequence(train_tokens, vocab, MAX_LENGTH)
    X_test_seq = token_to_sequence(test_tokens, vocab, MAX_LENGTH)
    y_train, train_valid_indices = create_labels_from_csv(TRAIN_LABELS, train_files)
    y_test, test_valid_indices = create_labels_from_csv(TEST_LABELS, test_files)
    X_train_final = X_train_seq[train_valid_indices]
    X_test_final = X_test_seq[test_valid_indices]
    

    
    traditional_train = extract_traditional_features(train_files, train_valid_indices)
    traditional_test = extract_traditional_features(test_files, test_valid_indices)
    
    # Chuẩn hóa traditional features trước SMOTE và CORAL
    scaler = MinMaxScaler()
    traditional_train = np.array(traditional_train, dtype=np.float32)
    traditional_test = np.array(traditional_test, dtype=np.float32)
    y_train = np.array(y_train, dtype=np.float32)
    traditional_train_norm = scaler.fit_transform(traditional_train)
    traditional_test_norm = scaler.transform(traditional_test)
    
    # Áp dụng SMOTE trên traditional features đã chuẩn hóa
    smote = SMOTE()
    try:
        traditional_train_smote, y_train_smote = smote.fit_resample(traditional_train_norm, y_train)
        
        # Đồng bộ token sequences
        n_samples_smote = len(y_train_smote)
        random_indices = np.random.choice(len(X_train_final), size=n_samples_smote, replace=True)
        X_train_smote = X_train_final[random_indices]
        
        X_train_final_cc = X_train_smote
        y_train_cc = y_train_smote
        traditional_train_cc = traditional_train_smote
        print(f"Original class distribution: {np.bincount(y_train.astype(int))}")
        print(f"After SMOTE: {np.bincount(y_train_smote.astype(int))}")
    except ValueError as e:
        print(f"SMOTE failed for {project_name}-{project_lists[project_name][-1]}: {e}. Using original data.")
        X_train_final_cc = X_train_final
        y_train_cc = y_train
        traditional_train_cc = traditional_train_norm
    
    # Áp dụng CORAL để điều chỉnh phân phối
    try:
        traditional_train_cc = Coral(traditional_train_cc, traditional_test_norm, reg=1e-6)
        print(f"Applied CORAL to align distributions between {project_name} and {project_next}")
    except np.linalg.LinAlgError as e:
        print(f"CORAL failed for {project_name}-{project_lists[project_name][-1]}: {e}. Using SMOTE data without CORAL.")
    
    print(f"Train shapes: X_train={X_train_final_cc.shape}, y_train={y_train_cc.shape}, traditional_train={traditional_train_cc.shape}")
    print(f"Test shapes: X_test={X_test_final.shape}, y_test={y_test.shape}, traditional_test={traditional_test.shape}")
    
    result = train_process(
        X_train_final_cc, y_train_cc, traditional_train_cc,
        X_test_final, y_test, traditional_test,
        vocab_size, MAX_LENGTH
    )
    all_f1.append(result[0])
    all_acc.append(result[1])
    return all_f1, all_acc

In [16]:
project_results_f1 = {}
project_results_acc = {}

for i in range(len(project)):
    for j in range(len(project_lists[project[i]])):
        if project_lists[project[i]][j] == project_lists[project[i]][-1]:
            try:
                if project_lists[project[i]][j] != '1.4.4':
                    result = train_cross(project[i],project[i+1])
                    project_results_f1[f'{project[i]}-{project_lists[project[i]][j]}']  =result[0]
                    project_results_acc[f'{project[i]}-{project_lists[project[i]][j]}']  =result[1]
                else:
                    break
            except Exception as e:
                print(f"Loi loi loz roi {project_lists[project[i]][j]} : {e}")
                continue
        else:
            try:
                result = train_in_project(project[i], project_lists[project[i]][j], project_lists[project[i]][j+1])
                project_results_f1[f'{project[i]}-{project_lists[project[i]][j]}'] = result[0]
                project_results_acc[f'{project[i]}-{project_lists[project[i]][j]}'] = result[1]
            except Exception as e:
                print(f"Loi loi loz roi {project_lists[project[i]][j]} : {e}")
                continue
                

f1_sum = 0
count = 0
valid_keys = []
for key in project_results_f1:
    f1_value = project_results_f1[key][0]  
    if f1_value > 0:  
        f1_sum += f1_value
        count += 1
        valid_keys.append(key)
    else:
        print(f"Skipping {key}: F1-score = 0 (empty or failed dataset)")

if count > 0:
    avg_f1 = f1_sum / count
    print(f"Average F1-score across {count} valid runs: {avg_f1:.4f}")
    print(f"Valid runs: {valid_keys}")
else:
    print("No valid F1-scores to average (all datasets empty or failed).")

        


Train on ant-1.3:
Train source path: C:/Users/ACER/Downloads/DATASET/promise/ant/ant-1.3/ant-1.3
Test source path: C:/Users/ACER/Downloads/DATASET/promise/ant/ant-1.4/ant-1.4
Train labels: C:/Users/ACER/Downloads/DATASET/software-metrics/ant/ant-1.3.csv
Test labels: C:/Users/ACER/Downloads/DATASET/software-metrics/ant/ant-1.4.csv
Original class distribution: [99 17]
After SMOTE: [99 99]
Train shapes: X_train=(198, 2000), y_train=(198,), traditional_train=(198, 12)
Test shapes: X_test=(166, 2000), y_test=(166,), traditional_test=(166, 12)
Kích thước X_train: (198, 2000)
Kích thước X_test: (166, 2000)
Kích thước traditional_train: (198, 12)
Kích thước traditional_test: (166, 12)
Kích thước y_train: (198,)
Kích thước y_test: (166,)
Using device: cuda
Epoch  5/25: Loss=0.6418, Acc=0.6970
Epoch 10/25: Loss=0.5917, Acc=0.6919
Epoch 15/25: Loss=0.5535, Acc=0.7323
Epoch 20/25: Loss=0.5965, Acc=0.7273
Epoch 25/25: Loss=0.5323, Acc=0.7273
Test Accuracy: 0.2169, F1-score: 0.3564
Train on ant-1.4:

In [17]:
pd.read_csv("C:/Users/ACER/Downloads/DATASET/software-metrics/ant/ant-1.3.csv").head()

Unnamed: 0,name,wmc,dit,noc,cbo,rfc,lcom,ca,ce,npm,...,dam,moa,mfa,cam,ic,cbm,amc,max_cc,avg_cc,bug
0,org.apache.tools.ant.taskdefs.ExecuteOn,11,4,2,14,42,29,2,12,5,...,1.0,1,0.885057,0.232323,3,4,34.545455,3,1.2727,0
1,org.apache.tools.ant.DefaultLogger,14,1,1,8,32,49,4,4,12,...,1.0,0,0.0,0.307692,0,0,16.857143,6,1.6429,2
2,org.apache.tools.ant.taskdefs.TaskOutputStream,3,2,0,1,9,0,0,1,1,...,1.0,1,0.714286,0.666667,1,1,17.333333,1,0.6667,0
3,org.apache.tools.ant.taskdefs.Cvs,12,3,0,12,37,32,0,12,12,...,1.0,1,0.770833,0.458333,0,0,24.083333,3,1.4167,0
4,org.apache.tools.ant.taskdefs.Copyfile,6,3,0,4,21,1,0,4,6,...,1.0,0,0.880952,0.416667,2,2,21.0,1,0.8333,0
