In [None]:
import pandas as pd
import numpy as np

def format_features(df):
    """
    Normalise the 'Features' column of ``df`` into comma-separated fixed-point text.

    Each cell is expected to hold the printed form of a numeric vector, for
    example ``"[1.2e-03 4.5 6.0]"`` (possibly wrapped over several lines).
    Brackets are removed, the rest is split on whitespace and/or commas, and
    every number is re-rendered with five decimal places (so no scientific
    notation) and joined with commas.

    Missing cells become the empty string. "Missing" covers ``None``, NaN/NA
    (what ``pd.read_csv`` produces for blank cells) and empty or
    whitespace-only strings.

    Commas are accepted as separators so that the function is idempotent: it
    can be applied again to a column it has already formatted, which matters
    when a CSV file is rewritten in place.

    Args:
        df: DataFrame with a 'Features' column. The column is replaced in place.

    Returns:
        The same DataFrame object, with the 'Features' column rewritten.

    Raises:
        KeyError: if ``df`` has no 'Features' column.
        ValueError: if a cell contains a token that is not a number.
    """
    formatted_features = []

    for feature_str in df['Features']:
        # Blank CSV cells arrive as NaN (a float), not None, so test for both.
        if feature_str is None or (not isinstance(feature_str, str) and pd.isna(feature_str)):
            formatted_features.append('')
            continue

        # Drop the brackets and treat commas like whitespace; str.split() with
        # no argument then handles runs of spaces and newlines on its own.
        cleaned_feature_str = str(feature_str).replace('[', '').replace(']', '').replace(',', ' ')
        features = cleaned_feature_str.split()

        # Fixed-point rendering avoids scientific notation in the output.
        formatted_features.append(','.join(f"{float(feature):.5f}" for feature in features))

    df['Features'] = formatted_features
    return df


# Usage example: read the table, normalise its 'Features' column, and write
# the result back to the same path (the input file is overwritten).
df = format_features(pd.read_csv('data/GCN_NFAs.csv'))
df.to_csv('data/GCN_NFAs.csv', index=False)


In [6]:
import pandas as pd

def format_vector_string(vector_string, decimals=3):
    """Re-render a comma-separated numeric string with a fixed number of decimals.

    Args:
        vector_string: Text such as ``"0.12345,1e-3,7"``. ``None``, NaN/NA and
            empty strings are treated as "no values".
        decimals: Number of digits kept after the decimal point. Defaults to 3,
            which is what this function has always produced.

    Returns:
        The numbers formatted as fixed-point text and joined with commas, or
        ``''`` when there are no values. Empty tokens (for example from a
        trailing comma) are skipped.

    Raises:
        ValueError: if a non-empty token is not a number.
    """
    # Blank CSV cells are NaN floats; they have no .split(), so handle them first.
    if vector_string is None or (not isinstance(vector_string, str) and pd.isna(vector_string)):
        return ''

    tokens = (token.strip() for token in str(vector_string).split(','))
    return ','.join(f"{float(token):.{decimals}f}" for token in tokens if token)

# Load the raw feature table.
df = pd.read_csv('data/NFAs_paper_prering2alt.csv')

# Reformat every entry of the 'Features' column with format_vector_string.
df = df.assign(Features=df['Features'].apply(format_vector_string))

# Write the result to a different file, leaving the input CSV untouched.
df.to_csv('data/NFAs_prering2alt.csv', index=False)


In [3]:
import pandas as pd

# Load the dataset with the default encoding first; if decoding fails, read the
# same file again as latin1.
try:
    df = pd.read_csv('data/Data_GCN_prering2alt100.csv')
except UnicodeDecodeError:
    df = pd.read_csv('data/Data_GCN_prering2alt100.csv', encoding='latin1')  # Try the latin1 encoding


# Define a function to strip quotes from the ends and concatenate the strings
# def concatenate_features(f1, f2):
#     # Strip quotes from the ends of the strings
#     f1_cleaned = f1.strip('"')
#     f2_cleaned = f2.strip('"')
#     # Concatenate with a comma
#     return f"{f1_cleaned},{f2_cleaned}"
def concatenate_features(f1, f2):
    """Join two feature strings into one comma-separated string.

    Each argument is converted to ``str`` and stripped of leading/trailing
    double quotes. A side that is missing (``None``/NaN/NA) or empty after
    stripping is left out entirely, so the result never contains a dangling
    leading or trailing comma that would later parse as an empty token.

    Args:
        f1: First feature string, or a missing value.
        f2: Second feature string, or a missing value.

    Returns:
        ``"<f1>,<f2>"`` when both sides are present, the single present side
        when only one is, and ``''`` when neither is.
    """
    parts = []
    for value in (f1, f2):
        # Missing cells come back from pandas as NaN (a float), not as strings.
        if pd.isna(value):
            continue
        cleaned = str(value).strip('"')
        if cleaned:
            parts.append(cleaned)

    return ','.join(parts)

# Apply the function to each row: one combined string is built per row from
# that row's 'Features1' and 'Features2' values.
df['Combined_Features'] = df.apply(lambda row: concatenate_features(row['Features1'], row['Features2']), axis=1)

# Save the modified DataFrame.
# NOTE(review): this path is the same file that was read into `df`, so the
# input CSV is overwritten rather than a new file being created.
df.to_csv('data/Data_GCN_prering2alt100.csv', index=False)


In [10]:
# SelfAttention 1
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Load the table whose 'Features1' and 'Features2' columns are processed below.
df = pd.read_csv('data/GCN_NFAs_paper_prering2alt.csv')

# Clean and convert string features to a list of floating-point numbers.
def preprocess_features(x):
    """Parse a comma-separated feature string into a list of floats.

    Square brackets are ignored, so both ``"1,2,3"`` and ``"[1, 2, 3]"`` work.
    Empty tokens (from ``"[]"``, an empty string or a trailing comma) are
    skipped instead of raising, and a missing cell (``None``/NaN/NA) yields
    an empty list.

    Args:
        x: The cell value, normally a string.

    Returns:
        list[float]: the parsed numbers, possibly empty.

    Raises:
        ValueError: if a non-empty token is not a number.
    """
    if not isinstance(x, str):
        # Blank CSV cells are NaN floats and have no .replace().
        if x is None or pd.isna(x):
            return []
        x = str(x)

    tokens = x.replace('[', '').replace(']', '').split(',')
    return [float(token) for token in tokens if token.strip()]

# Parse both feature columns from text into lists of floats, element by element.
df['Features1'] = df['Features1'].map(preprocess_features)
df['Features2'] = df['Features2'].map(preprocess_features)

# Ensure that all feature lists have the same length.
def check_and_pad_feature_list(feature_list, target_length=110):
    """Return ``feature_list`` forced to exactly ``target_length`` entries.

    Shorter lists are extended on the right with ``0.0``; longer lists are cut
    to their first ``target_length`` items; a list that already has the right
    length is returned as-is (the same object, not a copy).
    """
    shortfall = target_length - len(feature_list)

    if shortfall == 0:
        return feature_list
    if shortfall > 0:
        return feature_list + [0.0] * shortfall
    return feature_list[:target_length]

# Dimensionality of each individual feature vector, and of the two vectors
# once they are concatenated.
feature_dim = 110
combined_feature_dim = 2 * feature_dim

# Force every feature list in both columns to exactly `feature_dim` entries.
df['Features1'] = df['Features1'].apply(lambda feats: check_and_pad_feature_list(feats, feature_dim))
df['Features2'] = df['Features2'].apply(lambda feats: check_and_pad_feature_list(feats, feature_dim))

# Define a self-attention module.
class SelfAttention(nn.Module):
    """Multi-head dot-product attention with per-head linear projections.

    The last dimension of each input (``embed_size``) is split into ``heads``
    chunks of ``head_dim`` values. Each chunk is passed through the learned
    value/key/query projection, attention weights are computed per head, and
    the heads are merged again by ``fc_out``.

    Args:
        embed_size: Size of the last dimension of the inputs and of the output.
        heads: Number of attention heads; must divide ``embed_size`` evenly.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert self.head_dim * heads == embed_size, "Embed size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, queries, mask=None):
        """Compute attention.

        Args:
            values, keys, queries: Tensors whose first dimension is the batch
                size and whose remaining elements are reshaped to
                ``(batch, length, heads, head_dim)``. A 2-D input of shape
                ``(batch, embed_size)`` therefore becomes a sequence of length 1.
            mask: Optional tensor broadcastable to
                ``(batch, heads, query_len, key_len)``; positions where it is 0
                are excluded from the softmax.

        Returns:
            Tensor of shape ``(batch, query_len, embed_size)``.
        """
        N = queries.shape[0]

        # Split the embedding into heads: (N, length, heads, head_dim).
        values = values.reshape(N, -1, self.heads, self.head_dim)
        keys = keys.reshape(N, -1, self.heads, self.head_dim)
        queries = queries.reshape(N, -1, self.heads, self.head_dim)

        # Apply the learned projections. Previously these layers were created
        # but never used, so they contributed parameters and nothing else.
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # energy[n, h, q, k] = <queries[n, q, h, :], keys[n, k, h, :]>
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-inf"))
        # Scaled by sqrt(embed_size), then normalised over the key axis.
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)

        # Weighted sum of values, then merge the heads back together.
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(N, -1, self.heads * self.head_dim)
        out = self.fc_out(out)
        return out

# Define a feature fusion model.
class FeatureFusionModel(nn.Module):
    """Self-attention over a feature tensor followed by a linear layer.

    Args:
        feature_dim: Size of the last dimension of the input and of the output.
        num_heads: Number of heads handed to ``SelfAttention``.
    """

    def __init__(self, feature_dim, num_heads):
        super().__init__()
        self.self_attention = SelfAttention(feature_dim, num_heads)
        self.fc = nn.Linear(feature_dim, feature_dim)

    def forward(self, features):
        """Attend ``features`` to itself, average over dim 1, then project."""
        # The same tensor serves as values, keys and queries.
        attended = self.self_attention(features, features, features)
        pooled = attended.mean(dim=1)
        return self.fc(pooled)

# The model is used with its freshly initialised weights, so seed the RNG to
# make the saved fused features reproducible from run to run.
torch.manual_seed(0)

# Create an instance of the model, sized for the concatenated feature vector.
model = FeatureFusionModel(feature_dim=combined_feature_dim, num_heads=4)
model.eval()

# Convert to PyTorch tensors of shape (rows, feature_dim).
feature1_tensor = torch.tensor(df['Features1'].tolist(), dtype=torch.float32)
feature2_tensor = torch.tensor(df['Features2'].tolist(), dtype=torch.float32)

# Ensure the correct dimensionality of feature vectors.
assert feature1_tensor.shape[1] == feature_dim and feature2_tensor.shape[1] == feature_dim, "Feature dimensions do not match."

# Concatenate features side by side: (rows, combined_feature_dim).
combined_features_tensor = torch.cat((feature1_tensor, feature2_tensor), dim=1)

# Use the model to fuse features. No gradients are needed for this forward
# pass, so skip building the autograd graph.
with torch.no_grad():
    fused_features = model(combined_features_tensor)

fused_features_detached = fused_features.detach().numpy()  # Convert to NumPy array.
fused_features_list = fused_features_detached.tolist()  # Convert to nested Python lists.

# Convert each row to a comma-separated string for saving to CSV.
df['Fused_Features'] = [','.join(map(str, f)) for f in fused_features_list]

# Save the updated DataFrame.
df.to_csv('data/GCN_NFAs_prering2alt_f4.csv', index=False)

In [None]:
# SelfAttention 2
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def expand_features(feature, target_dim=2048):
    """Parse a comma-separated feature string into exactly ``target_dim`` floats.

    Square brackets are ignored and empty tokens are skipped. Inputs shorter
    than ``target_dim`` are right-padded with ``0.0``; longer inputs are
    truncated, so every row ends up with the same length and the column can
    be turned into a rectangular tensor. A missing cell (``None``/NaN/NA) or
    an empty string yields a vector of zeros.

    Args:
        feature: The cell value, normally a string such as ``"[0.1, 0.2]"``.
        target_dim: Required length of the returned list.

    Returns:
        list[float] of length ``target_dim``.

    Raises:
        ValueError: if a non-empty token is not a number.
    """
    if feature is None or (not isinstance(feature, str) and pd.isna(feature)):
        values = []
    else:
        tokens = str(feature).replace('[', '').replace(']', '').split(',')
        values = [float(token) for token in tokens if token.strip()]

    # Size the padding from the parsed values, not from the raw string.
    values = values[:target_dim]
    return values + [0.0] * (target_dim - len(values))

class SelfAttention(nn.Module):
    """Multi-head dot-product attention that up-weights trailing query positions.

    The last dimension of each input (``embed_size``) is split into ``heads``
    chunks of ``head_dim`` values, each chunk is passed through the learned
    value/key/query projection, and attention is computed per head. Before
    the scores are computed, the queries at the last ``boosted_positions``
    sequence positions are multiplied by ``boost``.

    Args:
        embed_size: Size of the last dimension of the inputs and of the output.
        heads: Number of attention heads; must divide ``embed_size`` evenly.
        boosted_positions: How many trailing sequence positions get boosted
            queries (Python slice semantics: ``[-boosted_positions:]``). When
            ``None`` (the default) the module-level ``feature_dim`` variable
            is read at call time, which is what this class has always done.
        boost: Multiplier applied to the boosted queries. Defaults to 2.
    """

    def __init__(self, embed_size, heads, boosted_positions=None, boost=2.0):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert self.head_dim * heads == embed_size, "Embed size needs to be divisible by heads"

        self.boosted_positions = boosted_positions
        self.boost = boost

        # Device placement is left to the owner (e.g. `module.to(device)`);
        # forward() follows wherever the parameters end up.
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, queries, mask=None):
        """Compute attention.

        Args:
            values, keys, queries: Tensors whose first dimension is the batch
                size and whose remaining elements are reshaped to
                ``(batch, length, heads, head_dim)``. They are not modified.
            mask: Optional tensor broadcastable to
                ``(batch, heads, query_len, key_len)``; positions where it is 0
                are excluded from the softmax.

        Returns:
            Tensor of shape ``(batch, query_len, embed_size)`` on the device
            that holds this module's parameters.
        """
        N = queries.shape[0]
        # Follow the parameters' device rather than a global one, so the module
        # also works when it has been replicated onto another device.
        target = self.fc_out.weight.device

        values = values.reshape(N, -1, self.heads, self.head_dim).to(target)
        keys = keys.reshape(N, -1, self.heads, self.head_dim).to(target)
        queries = queries.reshape(N, -1, self.heads, self.head_dim).to(target)

        # Apply the learned projections (previously created but never used).
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Boost the trailing query positions out of place. The old in-place
        # `*=` wrote through the reshaped view into the caller's tensor, and
        # because the same tensor was passed as keys and values, those were
        # scaled as well.
        tail = self.boosted_positions
        if tail is None:
            tail = feature_dim
        scale = queries.new_ones(queries.shape[1])
        scale[-tail:] = self.boost
        queries = queries * scale.view(1, -1, 1, 1)

        # energy[n, h, q, k] = <queries[n, q, h, :], keys[n, k, h, :]>
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        if mask is not None:
            mask = mask.to(target)
            energy = energy.masked_fill(mask == 0, float("-inf"))
        # Scaled by sqrt(embed_size), then normalised over the key axis.
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)

        # Weighted sum of values, then merge the heads back together.
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(N, -1, self.heads * self.head_dim)
        out = self.fc_out(out)
        return out

class FeatureFusionModel(nn.Module):
    """Fuse three feature tensors with self-attention and a linear layer.

    Args:
        feature_dim: Size of the last dimension of each of the three inputs;
            the layers operate on ``feature_dim * 3`` values.
        num_heads: Number of heads handed to ``SelfAttention``.
    """

    def __init__(self, feature_dim, num_heads):
        super().__init__()
        fused_dim = feature_dim * 3
        self.self_attention = SelfAttention(fused_dim, num_heads).to(device)
        self.fc = nn.Linear(fused_dim, fused_dim).to(device)

    def forward(self, feature1, feature2, feature3):
        """Concatenate the inputs along dim 1, attend, average over dim 1, project."""
        stacked = torch.cat((feature1, feature2, feature3), dim=1).to(device)
        # The concatenated tensor serves as values, keys and queries.
        attended = self.self_attention(stacked, stacked, stacked)
        return self.fc(attended.mean(dim=1))

# The file is read as ISO-8859-1 rather than with the default encoding.
df = pd.read_csv('data/dataset_last_final.csv', encoding='ISO-8859-1')

# Length every feature vector is padded/expanded to. Defined before the model
# runs because SelfAttention.forward reads the module-level `feature_dim`.
feature_dim = 2048

df['Features1'] = df['Features1'].apply(lambda x: expand_features(x, feature_dim))
df['Features2'] = df['Features2'].apply(lambda x: expand_features(x, feature_dim))
df['Features3'] = df['Features3'].apply(lambda x: expand_features(x, feature_dim))

# The model is used with its freshly initialised weights, so seed the RNG to
# make the saved features reproducible from run to run.
torch.manual_seed(0)

model = FeatureFusionModel(feature_dim=feature_dim, num_heads=8)

# Wrap in DataParallel only when there really is more than one GPU. The
# previous `< 1` test was inverted: it wrapped the model exactly when no GPU
# was present and never on a multi-GPU machine.
if torch.cuda.device_count() > 1:
    print("use", torch.cuda.device_count(), "个 GPUs!")
    model = nn.DataParallel(model)

model.to(device)
model.eval()

feature1_tensor = torch.tensor(df['Features1'].tolist(), dtype=torch.float32).to(device)
feature2_tensor = torch.tensor(df['Features2'].tolist(), dtype=torch.float32).to(device)
feature3_tensor = torch.tensor(df['Features3'].tolist(), dtype=torch.float32).to(device)

# No gradients are needed for this forward pass, so skip the autograd graph.
with torch.no_grad():
    fused_features = model(feature1_tensor, feature2_tensor, feature3_tensor)

fused_features_detached = fused_features.detach().cpu().numpy()
fused_features_list = fused_features_detached.tolist()

# Store each fused vector as a comma-separated string.
df['Combined_Features'] = [','.join(map(str, f)) for f in fused_features_list]
print(df.head())

# NOTE: this writes back to the same path that was read above, replacing the
# input file (now with list-valued feature columns and the new column).
df.to_csv('data/dataset_last_final.csv', index=False)

In [None]:
import pandas as pd

# load data
df = pd.read_csv('data/GCN_NFAs_paper_prering2alt.csv')

# Convert string features to list of floating-point numbers.
df['Features1'] = df['Features1'].apply(lambda x: [float(i) for i in x.split(',')])
df['Features2'] = df['Features2'].apply(lambda x: [float(i) for i in x.split(',')])

# Calculate the maximum length of each feature column.
max_length_features1 = max(len(f) for f in df['Features1'])
max_length_features2 = max(len(f) for f in df['Features2'])
print(max_length_features1)
print(max_length_features2)

# 'Features3' is only measured when the column exists, and it is parsed first:
# measuring the raw strings would count characters rather than features, and
# a missing column would raise KeyError.
if 'Features3' in df.columns:
    df['Features3'] = df['Features3'].apply(lambda x: [float(i) for i in x.split(',')])
    max_length_features3 = max(len(f) for f in df['Features3'])
    print(max_length_features3)
else:
    max_length_features3 = None

# Pad each of the two feature lists separately.
# df['Features1'] = df['Features1'].apply(lambda x: x + [0] * (120 - len(x)))
# df['Features2'] = df['Features2'].apply(lambda x: x + [0] * (120 - len(x)))

# Save the processed DataFrame to a CSV file.
# df.to_csv('data/Data_GCN_prering2alt_padded.csv', index=False)
