In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [8]:
def train_test_knn(X_train, X_test, y_train, y_test, metric='manhattan', n_neighbors=3, digits=4):
    """Fit a KNN classifier on the training split and report on the test split.

    Args:
        X_train, y_train: Features and labels used to fit the classifier.
        X_test, y_test: Held-out features and labels used for evaluation.
        metric: Distance metric forwarded to ``KNeighborsClassifier``
            (default keeps the original Manhattan distance).
        n_neighbors: Number of neighbours that vote (default 3, as before).
        digits: Decimal places requested from ``classification_report``.

    Returns:
        Whatever ``classification_report(y_test, y_pred, digits=digits)`` returns.
    """
    knn = KNeighborsClassifier(metric=metric, n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    return classification_report(y_test, y_pred, digits=digits)

# Benchmark datasets, each stored as the (X, y) pair returned by its loader.
datasets = dict(
    Iris=load_iris(return_X_y=True),
    Wine=load_wine(return_X_y=True),
)

# Evaluate the KNN classifier on every dataset in turn.
for dataset_name in datasets:
    X, y = datasets[dataset_name]

    # Coerce every feature to numeric (unparseable cells become NaN) and
    # keep 4 decimal places.
    X = pd.DataFrame(X).apply(pd.to_numeric, errors='coerce').round(4)

    # Only object-dtype Series labels need to be mapped to integer codes.
    labels_are_strings = isinstance(y, pd.Series) and y.dtype == 'object'
    if labels_are_strings:
        y = pd.factorize(y)[0]

    # 80/20 split with a fixed seed so the run is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit, predict and print the per-class report.
    report = train_test_knn(X_train, X_test, y_train, y_test)
    print(f"Dataset: {dataset_name}")
    print(f"Classification Report:\n{report}\n")

def convert_to_cpn_format(X):
    """Serialise a feature matrix as a CPN Tools token (``1`` + backtick + nested list).

    Numbers are written exactly as Python's list repr would write them, except
    that every minus sign (number sign or exponent sign) becomes ``~``, the
    negation symbol of CPN ML, and a ``+`` exponent sign is dropped. Output for
    non-negative data is unchanged.

    Args:
        X: A ``pandas.DataFrame`` or an object with ``tolist()`` (NumPy array).

    Returns:
        The token as a string, e.g. ``1`[[4.6, 3.6], [~1.5, 2.0]]``.
    """
    if isinstance(X, pd.DataFrame):
        X_list = X.values.tolist()
    else:
        X_list = X.tolist()

    def _render(item):
        # Recurse through nested lists so 1-D and 2-D inputs both work.
        if isinstance(item, list):
            return "[" + ", ".join(_render(child) for child in item) + "]"
        text = repr(item)
        # Only touch real numbers; bools and other objects keep their repr.
        if isinstance(item, (int, float)) and not isinstance(item, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + _render(X_list)

def convert_labels_to_cpn_format(y):
    """Serialise a 1-D label container as a CPN Tools token ``1`[a,b,c]``.

    Each label is written with ``str``; for numbers every ``-`` (sign or
    exponent) is replaced by ``~``, the CPN ML negation symbol, and a ``+``
    exponent sign is dropped. Non-negative labels are emitted as before.

    Args:
        y: Object exposing ``tolist()`` (NumPy array or pandas Series).

    Returns:
        The token as a string, items separated by commas without spaces.
    """
    def _render(value):
        text = str(value)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + "[" + ",".join(_render(value) for value in y.tolist()) + "]"

# Second pass over the datasets: export every split in CPN Tools notation.
# NOTE(review): this pass splits the loaded arrays directly, without the
# numeric coercion / rounding applied in the evaluation loop.
for dataset_name in datasets:
    X, y = datasets[dataset_name]

    # Same split parameters as the evaluation pass (20% test, seed 42).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Feature matrices become nested CPN lists, labels flat CPN lists.
    X_train_cpn, X_test_cpn = convert_to_cpn_format(X_train), convert_to_cpn_format(X_test)
    y_train_cpn, y_test_cpn = convert_labels_to_cpn_format(y_train), convert_labels_to_cpn_format(y_test)

    # Echo the converted data (optional).
    print(f"CPN Tools Format for {dataset_name} - X_train: {X_train_cpn}")
    print(f"CPN Tools Format for {dataset_name} - X_test: {X_test_cpn}")
    print("\nTraining set labels in CPN Tools list format:", y_train_cpn, sep="\n")
    print("\nTesting set labels in CPN Tools list format:", y_test_cpn, sep="\n")
    print("---------------------------------------------------------------")

    # One text file per split, named after the dataset.
    for path, cpn_text in (
        (f"X_train_{dataset_name}_cpn.txt", X_train_cpn),
        (f"X_test_{dataset_name}_cpn.txt", X_test_cpn),
        (f"y_train_{dataset_name}_cpn.txt", y_train_cpn),
        (f"y_test_{dataset_name}_cpn.txt", y_test_cpn),
    ):
        with open(path, "w") as file:
            file.write(cpn_text)

Dataset: Iris
Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        10
           1     1.0000    1.0000    1.0000         9
           2     1.0000    1.0000    1.0000        11

    accuracy                         1.0000        30
   macro avg     1.0000    1.0000    1.0000        30
weighted avg     1.0000    1.0000    1.0000        30


Dataset: Wine
Classification Report:
              precision    recall  f1-score   support

           0     0.9231    0.8571    0.8889        14
           1     0.9231    0.8571    0.8889        14
           2     0.6000    0.7500    0.6667         8

    accuracy                         0.8333        36
   macro avg     0.8154    0.8214    0.8148        36
weighted avg     0.8513    0.8333    0.8395        36


CPN Tools Format for Iris - X_train: 1`[[4.6, 3.6, 1.0, 0.2], [5.7, 4.4, 1.5, 0.4], [6.7, 3.1, 4.4, 1.4], [4.8, 3.4, 1.6, 0.2], [4.4, 3.2, 1.3, 0.2], [6.3, 2.5, 5

In [9]:
def train_test_knn(X_train, X_test, y_train, y_test, metric='chebyshev', n_neighbors=5, digits=5):
    """Fit a KNN classifier on the training split and report on the test split.

    Args:
        X_train, y_train: Features and labels used to fit the classifier.
        X_test, y_test: Held-out features and labels used for evaluation.
        metric: Distance metric forwarded to ``KNeighborsClassifier``
            (default keeps the original Chebyshev distance).
        n_neighbors: Number of neighbours that vote (default 5, as before).
        digits: Decimal places requested from ``classification_report``.

    Returns:
        Whatever ``classification_report(y_test, y_pred, digits=digits)`` returns.
    """
    knn = KNeighborsClassifier(metric=metric, n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    return classification_report(y_test, y_pred, digits=digits)

# Load the Breast Cancer Wisconsin (BCW) dataset
X, y = load_breast_cancer(return_X_y=True)

# Ensuring X is numeric
X = pd.DataFrame(X).apply(pd.to_numeric, errors='coerce')

# Split BEFORE selecting features: scoring features on all rows would let the
# held-out labels influence which columns are kept (data leakage). The split
# itself (20% test, seed 42) picks the same rows as before.
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Selecting the 10 most important features, scored on the training rows only
selector = SelectKBest(score_func=f_classif, k=10)
X_train = selector.fit_transform(X_train_full, y_train)
X_test = selector.transform(X_test_full)

# Reduced view of the whole dataset, kept under its original name.
X_new = selector.transform(X)

# Training and testing the KNN model and calculating the classification report
report = train_test_knn(X_train, X_test, y_train, y_test)
print("Dataset: Breast Cancer")
print(f"Classification Report:\n{report}\n")

def convert_to_cpn_format(X):
    """Serialise a feature matrix as a CPN Tools token (``1`` + backtick + nested list).

    Numbers keep the text Python's list repr gives them, except that each minus
    sign (number or exponent) is written as ``~`` as CPN ML requires, and a
    ``+`` exponent sign is dropped. Non-negative data is emitted unchanged.

    Args:
        X: A ``pandas.DataFrame`` or an object with ``tolist()`` (NumPy array).

    Returns:
        The token as a string.
    """
    X_list = X.values.tolist() if isinstance(X, pd.DataFrame) else X.tolist()

    def _render(item):
        # Nested lists are rendered recursively, matching list repr spacing.
        if isinstance(item, list):
            return "[" + ", ".join(_render(child) for child in item) + "]"
        text = repr(item)
        if isinstance(item, (int, float)) and not isinstance(item, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + _render(X_list)

def convert_labels_to_cpn_format(y):
    """Serialise 1-D labels as a CPN Tools token ``1`[a,b,c]``.

    Numeric labels have every ``-`` replaced by ``~`` (CPN ML negation) and a
    ``+`` exponent sign removed; non-negative labels are written as before.

    Args:
        y: Object exposing ``tolist()`` (NumPy array or pandas Series).

    Returns:
        The token as a string, items joined by commas without spaces.
    """
    def _render(value):
        text = str(value)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + "[" + ",".join(_render(value) for value in y.tolist()) + "]"

# Serialise the feature splits and their labels as CPN Tools tokens.
X_train_cpn, X_test_cpn = convert_to_cpn_format(X_train), convert_to_cpn_format(X_test)
y_train_cpn, y_test_cpn = convert_labels_to_cpn_format(y_train), convert_labels_to_cpn_format(y_test)

# Write one text file per split.
cpn_files = {
    "X_train_breast_cancer_cpn.txt": X_train_cpn,
    "X_test_breast_cancer_cpn.txt": X_test_cpn,
    "y_train_breast_cancer_cpn.txt": y_train_cpn,
    "y_test_breast_cancer_cpn.txt": y_test_cpn,
}
for path, cpn_text in cpn_files.items():
    with open(path, "w") as file:
        file.write(cpn_text)

Dataset: Breast Cancer
Classification Report:
              precision    recall  f1-score   support

           0    0.97500   0.90698   0.93976        43
           1    0.94595   0.98592   0.96552        71

    accuracy                        0.95614       114
   macro avg    0.96047   0.94645   0.95264       114
weighted avg    0.95690   0.95614   0.95580       114




In [10]:
# Read the headerless adult.data file and keep a reproducible 2000-row sample.
data = pd.read_csv("adult.data", header=None).sample(n=2000, random_state=42)

# Encode on a copy so `data` keeps the values as read from disk.
data_encoded = data.copy()

# Every object-dtype column is replaced by the integer codes from factorize.
categorical_columns = data_encoded.select_dtypes(include=['object']).columns
for column in categorical_columns:
    codes = pd.factorize(data_encoded[column])[0]
    data_encoded[column] = codes

# The last column is the target; all preceding columns are features.
X_train, X_test, y_train, y_test = train_test_split(
    data_encoded.iloc[:, 0:-1],
    data_encoded.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

# 12-neighbour KNN with Chebyshev distance.
knn = KNeighborsClassifier(metric='chebyshev', n_neighbors=12)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Per-class precision / recall / F1 with three decimals.
report = classification_report(y_test, y_pred, digits=3)
print("Classification Report:", report, sep="\n")

def convert_to_cpn_format(X):
    """Serialise a feature matrix as a CPN Tools token (``1`` + backtick + nested list).

    Numbers are written as in Python's list repr, except that any minus sign
    (number or exponent) is replaced by ``~``, which CPN ML uses for negation,
    and a ``+`` exponent sign is dropped. Non-negative data is unchanged.

    Args:
        X: A ``pandas.DataFrame`` or an object with ``tolist()`` (NumPy array).

    Returns:
        The token as a string.
    """
    if isinstance(X, pd.DataFrame):
        X_list = X.values.tolist()
    else:
        X_list = X.tolist()

    def _render(item):
        if isinstance(item, list):
            return "[" + ", ".join(_render(child) for child in item) + "]"
        text = repr(item)
        if isinstance(item, (int, float)) and not isinstance(item, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + _render(X_list)

def convert_labels_to_cpn_format(y):
    """Serialise 1-D labels as a CPN Tools token ``1`[a,b,c]``.

    Numeric labels have each ``-`` replaced by ``~`` (CPN ML negation) and a
    ``+`` exponent sign removed. This matters for the ``-1`` code that
    ``pd.factorize`` assigns to missing values. Other labels are unchanged.

    Args:
        y: Object exposing ``tolist()`` (NumPy array or pandas Series).

    Returns:
        The token as a string, items joined by commas without spaces.
    """
    def _render(value):
        text = str(value)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + "[" + ",".join(_render(value) for value in y.tolist()) + "]"

# Serialise both splits; `.values` hands the helpers plain NumPy arrays.
X_train_cpn, X_test_cpn = (
    convert_to_cpn_format(X_train.values),
    convert_to_cpn_format(X_test.values),
)
y_train_cpn, y_test_cpn = (
    convert_labels_to_cpn_format(y_train.values),
    convert_labels_to_cpn_format(y_test.values),
)

# Echo each serialised split under its heading.
print("\nTraining Set in CPN Tools format:", X_train_cpn, sep="\n")
print("\nTesting Set in CPN Tools format:", X_test_cpn, sep="\n")
print("\nTraining Set Labels in CPN Tools list format:", y_train_cpn, sep="\n")
print("\nTesting Set Labels in CPN Tools list format:", y_test_cpn, sep="\n")

# Persist each split to its own text file.
for path, cpn_text in (
    ("X_train_cpn_adult.txt", X_train_cpn),
    ("X_test_cpn_adult.txt", X_test_cpn),
    ("y_train_cpn_adult.txt", y_train_cpn),
    ("y_test_cpn_adult.txt", y_test_cpn),
):
    with open(path, "w") as file:
        file.write(cpn_text)

Classification Report:
              precision    recall  f1-score   support

           0      0.780     1.000     0.876       305
           1      1.000     0.095     0.173        95

    accuracy                          0.785       400
   macro avg      0.890     0.547     0.525       400
weighted avg      0.832     0.785     0.709       400


Training Set in CPN Tools format:
1`[[36, 0, 180686, 1, 9, 1, 3, 2, 0, 1, 0, 0, 45, 0], [33, 0, 62155, 0, 10, 2, 5, 0, 1, 1, 0, 0, 35, 0], [47, 1, 216414, 12, 16, 1, 4, 2, 0, 1, 0, 0, 40, 0], [37, 0, 179731, 1, 9, 2, 7, 3, 0, 0, 0, 0, 35, 0], [56, 6, 192325, 0, 10, 0, 11, 0, 0, 0, 0, 0, 20, 0], [39, 0, 188069, 2, 13, 1, 2, 2, 0, 1, 0, 0, 40, 4], [47, 2, 243631, 2, 13, 1, 4, 2, 3, 1, 0, 0, 40, 17], [21, 0, 143062, 1, 9, 2, 2, 4, 0, 1, 0, 0, 40, 0], [30, 0, 30226, 6, 7, 0, 5, 4, 0, 1, 0, 0, 40, 0], [19, 0, 97189, 0, 10, 2, 5, 4, 0, 0, 0, 0, 22, 0], [30, 5, 125159, 2, 13, 2, 4, 0, 1, 1, 14084, 0, 45, 4], [27, 1, 553473, 2, 13, 1, 8, 1, 1, 0, 0,

In [11]:
def calculate_metrics(y_true, y_pred):
    """Return ``(mse, rmse, mae, r2)`` for true versus predicted targets.

    RMSE is the square root of the MSE; the other values come straight from
    the scikit-learn metric functions.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mse,
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Read the headerless abalone file and attach the attribute names.
data = pd.read_csv("abalone.data", header=None)
data.columns = [
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "Whole_weight",
    "Shucked_weight",
    "Viscera_weight",
    "Shell_weight",
    "Rings",
]

# 'Sex' is the categorical column; replace it with integer codes.
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])

# Reproducible 2000-row subset.
data_sampled = data.sample(n=2000, random_state=42)

# Last column is the regression target, everything before it a feature.
X, y = data_sampled.iloc[:, 0:-1], data_sampled.iloc[:, -1]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 11-neighbour Euclidean KNN regressor.
model = KNeighborsRegressor(n_neighbors=11, metric='euclidean')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions and print one metric per line, then a rule.
mse, rmse, mae, r2 = calculate_metrics(y_test, y_pred)
print(f"MSE: {mse}", f"RMSE: {rmse}", f"MAE: {mae}", f"R²: {r2}", "-" * 30, sep="\n")

def convert_to_cpn_format(X):
    """Serialise a feature matrix as a CPN Tools token (``1`` + backtick + nested list).

    Numbers are written as in Python's list repr, except that any minus sign
    (number or exponent, e.g. ``1e-05``) becomes ``~`` as CPN ML requires, and
    a ``+`` exponent sign is dropped. Non-negative data is unchanged.

    Args:
        X: A ``pandas.DataFrame`` or an object with ``tolist()`` (NumPy array).

    Returns:
        The token as a string.
    """
    if isinstance(X, pd.DataFrame):
        X_list = X.values.tolist()
    else:
        X_list = X.tolist()

    def _render(item):
        if isinstance(item, list):
            return "[" + ", ".join(_render(child) for child in item) + "]"
        text = repr(item)
        if isinstance(item, (int, float)) and not isinstance(item, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + _render(X_list)

def convert_labels_to_cpn_format(y):
    """Serialise 1-D target values as a CPN Tools token ``1`[a,b,c]``.

    Numeric values have each ``-`` replaced by ``~`` (CPN ML negation) and a
    ``+`` exponent sign removed; non-negative values are written as before.

    Args:
        y: Object exposing ``tolist()`` (NumPy array or pandas Series).

    Returns:
        The token as a string, items joined by commas without spaces.
    """
    def _render(value):
        text = str(value)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + "[" + ",".join(_render(value) for value in y.tolist()) + "]"

# Serialise feature splits and targets from their underlying NumPy arrays.
X_train_cpn, X_test_cpn = (
    convert_to_cpn_format(X_train.values),
    convert_to_cpn_format(X_test.values),
)
y_train_cpn, y_test_cpn = (
    convert_labels_to_cpn_format(y_train.values),
    convert_labels_to_cpn_format(y_test.values),
)

# Echo each serialised split under its heading.
print("\nTraining Set in CPN Tools format:", X_train_cpn, sep="\n")
print("\nTesting Set in CPN Tools format:", X_test_cpn, sep="\n")
print("\nTraining Set Labels in CPN Tools list format:", y_train_cpn, sep="\n")
print("\nTesting Set Labels in CPN Tools list format:", y_test_cpn, sep="\n")

# Persist each split to its own text file.
cpn_files = [
    ("X_train_cpn_abalone.txt", X_train_cpn),
    ("X_test_cpn_abalone.txt", X_test_cpn),
    ("y_train_cpn_abalone.txt", y_train_cpn),
    ("y_test_cpn_abalone.txt", y_test_cpn),
]
for path, cpn_text in cpn_files:
    with open(path, "w") as file:
        file.write(cpn_text)

MSE: 5.436962809917356
RMSE: 2.331729574782924
MAE: 1.6552272727272725
R²: 0.44039844377650217
------------------------------

Training Set in CPN Tools format:
1`[[0.0, 0.69, 0.55, 0.2, 1.569, 0.687, 0.3675, 0.46], [1.0, 0.595, 0.43, 0.165, 0.9845, 0.4525, 0.207, 0.2725], [1.0, 0.455, 0.335, 0.105, 0.4055, 0.175, 0.092, 0.1185], [1.0, 0.575, 0.445, 0.17, 0.8015, 0.3475, 0.1465, 0.25], [0.0, 0.44, 0.345, 0.105, 0.4285, 0.165, 0.083, 0.132], [2.0, 0.555, 0.44, 0.15, 0.838, 0.4155, 0.146, 0.23], [1.0, 0.24, 0.18, 0.055, 0.0555, 0.0235, 0.013, 0.018], [1.0, 0.45, 0.35, 0.11, 0.514, 0.253, 0.1045, 0.14], [0.0, 0.595, 0.465, 0.15, 1.1005, 0.5415, 0.166, 0.265], [2.0, 0.66, 0.485, 0.155, 1.2275, 0.61, 0.274, 0.3], [1.0, 0.445, 0.345, 0.13, 0.4075, 0.1365, 0.0645, 0.18], [2.0, 0.66, 0.515, 0.155, 1.4415, 0.7055, 0.3555, 0.335], [0.0, 0.47, 0.365, 0.12, 0.543, 0.2295, 0.1495, 0.15], [0.0, 0.595, 0.495, 0.235, 1.366, 0.5065, 0.219, 0.52], [0.0, 0.505, 0.41, 0.15, 0.644, 0.285, 0.145, 0.21], [0.

In [12]:
def calculate_metrics(y_true, y_pred):
    """Compute regression metrics and return them as ``(mse, rmse, mae, r2)``."""
    squared_error = mean_squared_error(y_true, y_pred)
    root_error = np.sqrt(squared_error)
    absolute_error = mean_absolute_error(y_true, y_pred)
    determination = r2_score(y_true, y_pred)
    return squared_error, root_error, absolute_error, determination

# Read the headerless machine.data file and attach the attribute names.
data = pd.read_csv("machine.data", header=None)
data.columns = [
    "Vendor", "Model",
    "MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX",
    "PRP", "ERP",
]

# Features are positional columns 2..7; the target is the PRP column.
X, y = data.iloc[:, 2:8], data["PRP"]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2-neighbour Manhattan KNN regressor.
model = KNeighborsRegressor(n_neighbors=2, metric='manhattan')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions and print one metric per line, then a rule.
mse, rmse, mae, r2 = calculate_metrics(y_test, y_pred)
print(f"MSE: {mse}", f"RMSE: {rmse}", f"MAE: {mae}", f"R²: {r2}", "-" * 30, sep="\n")

def convert_to_cpn_format(X):
    """Serialise a feature matrix as a CPN Tools token (``1`` + backtick + nested list).

    Numbers are written as in Python's list repr, except that any minus sign
    (number or exponent) becomes ``~``, the CPN ML negation symbol, and a
    ``+`` exponent sign is dropped. Non-negative data is unchanged.

    Args:
        X: A ``pandas.DataFrame`` or an object with ``tolist()`` (NumPy array).

    Returns:
        The token as a string.
    """
    if isinstance(X, pd.DataFrame):
        X_list = X.values.tolist()
    else:
        X_list = X.tolist()

    def _render(item):
        if isinstance(item, list):
            return "[" + ", ".join(_render(child) for child in item) + "]"
        text = repr(item)
        if isinstance(item, (int, float)) and not isinstance(item, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + _render(X_list)

def convert_labels_to_cpn_format(y):
    """Serialise 1-D target values as a CPN Tools token ``1`[a,b,c]``.

    Numeric values have each ``-`` replaced by ``~`` (CPN ML negation) and a
    ``+`` exponent sign removed; non-negative values are written as before.

    Args:
        y: Object exposing ``tolist()`` (NumPy array or pandas Series).

    Returns:
        The token as a string, items joined by commas without spaces.
    """
    def _render(value):
        text = str(value)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + "[" + ",".join(_render(value) for value in y.tolist()) + "]"

# Serialise feature splits and targets from their underlying NumPy arrays.
X_train_cpn, X_test_cpn = (
    convert_to_cpn_format(X_train.values),
    convert_to_cpn_format(X_test.values),
)
y_train_cpn, y_test_cpn = (
    convert_labels_to_cpn_format(y_train.values),
    convert_labels_to_cpn_format(y_test.values),
)

# Echo each serialised split under its heading.
print("\nTraining Set in CPN Tools format:", X_train_cpn, sep="\n")
print("\nTesting Set in CPN Tools format:", X_test_cpn, sep="\n")
print("\nTraining Set Labels in CPN Tools list format:", y_train_cpn, sep="\n")
print("\nTesting Set Labels in CPN Tools list format:", y_test_cpn, sep="\n")

# Persist each split to its own text file.
for path, cpn_text in (
    ("X_train_cpn_machine.txt", X_train_cpn),
    ("X_test_cpn_machine.txt", X_test_cpn),
    ("y_train_cpn_machine.txt", y_train_cpn),
    ("y_test_cpn_machine.txt", y_test_cpn),
):
    with open(path, "w") as file:
        file.write(cpn_text)

MSE: 3225.589285714286
RMSE: 56.79427159242634
MAE: 32.63095238095238
R²: 0.9366374253256045
------------------------------

Training Set in CPN Tools format:
1`[[40, 8000, 16000, 32, 8, 16], [26, 16000, 32000, 64, 8, 24], [30, 8000, 64000, 128, 12, 176], [105, 1000, 4000, 0, 3, 24], [30, 16000, 32000, 256, 16, 24], [140, 2000, 4000, 0, 4, 8], [203, 1000, 2000, 0, 1, 5], [50, 2000, 8000, 8, 1, 5], [320, 128, 6000, 0, 1, 12], [240, 512, 2000, 8, 1, 5], [225, 1000, 4000, 2, 3, 6], [320, 256, 5000, 4, 1, 6], [110, 5000, 5000, 142, 8, 64], [50, 2000, 16000, 24, 6, 16], [115, 2000, 4000, 2, 1, 5], [90, 256, 1000, 0, 3, 10], [75, 2000, 16000, 128, 1, 38], [26, 8000, 32000, 64, 8, 32], [60, 4000, 16000, 64, 5, 8], [220, 1000, 8000, 16, 1, 2], [26, 8000, 32000, 0, 8, 24], [50, 2000, 16000, 24, 1, 6], [180, 768, 12000, 6, 1, 31], [124, 1000, 8000, 0, 1, 8], [160, 1000, 4000, 8, 1, 14], [25, 1310, 2620, 131, 12, 24], [60, 2000, 8000, 65, 1, 8], [64, 5240, 20970, 30, 12, 24], [320, 512, 5000, 4, 

In [13]:
def calculate_metrics(y_true, y_pred):
    """Return the tuple ``(mse, rmse, mae, r2)`` for the given predictions.

    The RMSE is derived from the MSE with ``np.sqrt``; MSE, MAE and R² are
    obtained from the corresponding scikit-learn functions.
    """
    mse = mean_squared_error(y_true, y_pred)
    metrics = (
        mse,
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )
    return metrics

# Load the data
data = pd.read_csv("AirQualityUCI.csv", sep=";", decimal=",")  # File is read with ';' as field separator and ',' as decimal mark

# Select the twelve feature columns; the regression target is 'PT08.S2(NMHC)'
X = data[['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'NOx(GT)',
           'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
           'T', 'RH', 'AH']]
y = data['PT08.S2(NMHC)']

# Remove rows with missing values (NaN) in any feature or in the target
# NOTE(review): the exported training set contains many -200 values (printed
# as ~200.0). In this dataset -200 appears to be the missing-value tag, which
# dropna() does not remove. Confirm whether such rows should be filtered.
data_cleaned = pd.concat([X, y], axis=1).dropna()
X_cleaned = data_cleaned[X.columns]
y_cleaned = data_cleaned['PT08.S2(NMHC)']

# Sample 2000 random samples (train_test_split is used only as a seeded
# sampler; the remaining rows are discarded into `_`)
if len(X_cleaned) > 2000:
    X_cleaned, _, y_cleaned, _ = train_test_split(X_cleaned, y_cleaned, train_size=2000, random_state=42)

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

# Train the KNN Regressor model (3 neighbours, Chebyshev distance)
model = KNeighborsRegressor(n_neighbors=3, metric='chebyshev')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate the metrics
mse, rmse, mae, r2 = calculate_metrics(y_test, y_pred)

# Display the metrics
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")
print("-" * 30)

def convert_negative_to_tilde(value):
    """Render a number as text, writing ``~`` wherever Python prints ``-``.

    The replacement is done on the text rather than by testing ``value < 0``
    so that two cases the comparison misses are also converted:

    * negative zero (``-0.0`` is not ``< 0`` but still prints a minus sign);
    * negative exponents of small magnitudes (``1e-05`` becomes ``1e~05``).

    A ``+`` exponent sign (``1e+16``) is dropped as well. Ordinary values give
    the same text as before, e.g. ``-200.0`` gives ``~200.0``.

    Args:
        value: A Python or NumPy numeric scalar.

    Returns:
        The value's ``str`` form with the substitutions applied.
    """
    return str(value).replace("-", "~").replace("e+", "e")

def convert_to_cpn_format(X):
    """Serialise a feature matrix as a CPN Tools token.

    NumPy arrays are converted with ``tolist()``; anything else goes through
    ``.values.tolist()``. Every value is rendered by
    ``convert_negative_to_tilde``. Values inside a sample are joined with
    ``,`` and samples with ``, ``; the result is prefixed with ``1`` and a
    backtick.
    """
    if isinstance(X, np.ndarray):
        samples = X.tolist()
    else:
        samples = X.values.tolist()

    rendered_samples = []
    for sample in samples:
        cells = [convert_negative_to_tilde(cell) for cell in sample]
        rendered_samples.append("[" + ",".join(cells) + "]")

    return "1`[" + ", ".join(rendered_samples) + "]"

def convert_labels_to_cpn_format(y):
    """Serialise an iterable of target values as a CPN Tools token ``1`[a,b,c]``.

    Each value is rendered by ``convert_negative_to_tilde`` and the pieces are
    joined by commas without spaces.
    """
    pieces = []
    for val in y:
        pieces.append(convert_negative_to_tilde(val))
    return "1`[" + ",".join(pieces) + "]"

# Serialise feature splits and targets from their underlying NumPy arrays.
X_train_cpn, X_test_cpn = (
    convert_to_cpn_format(X_train.values),
    convert_to_cpn_format(X_test.values),
)
y_train_cpn, y_test_cpn = (
    convert_labels_to_cpn_format(y_train.values),
    convert_labels_to_cpn_format(y_test.values),
)

# Echo each serialised split under its heading.
print("\nTraining Set in CPN Tools format:", X_train_cpn, sep="\n")
print("\nTesting Set in CPN Tools format:", X_test_cpn, sep="\n")
print("\nTraining Set Labels in CPN Tools list format:", y_train_cpn, sep="\n")
print("\nTesting Set Labels in CPN Tools list format:", y_test_cpn, sep="\n")

# Persist each split to its own text file.
cpn_files = {
    "X_train_cpn_air_quality.txt": X_train_cpn,
    "X_test_cpn_air_quality.txt": X_test_cpn,
    "y_train_cpn_air_quality.txt": y_train_cpn,
    "y_test_cpn_air_quality.txt": y_test_cpn,
}
for path, cpn_text in cpn_files.items():
    with open(path, "w") as file:
        file.write(cpn_text)

MSE: 5012.014999999999
RMSE: 70.79558602059876
MAE: 54.045
R²: 0.9523614284530159
------------------------------

Training Set in CPN Tools format:
1`[[1.9,1186.0,~200.0,8.6,290.0,641.0,181.0,1272.0,1025.0,7.0,73.7,0.7425], [4.3,1537.0,~200.0,22.9,564.0,384.0,158.0,1941.0,1660.0,16.0,67.8,1.2276], [~200.0,1049.0,~200.0,6.9,~200.0,699.0,~200.0,1605.0,882.0,23.5,59.8,1.7141], [2.2,1402.0,88.0,9.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502], [~200.0,886.0,~200.0,2.4,~200.0,1075.0,~200.0,1484.0,522.0,27.1,54.8,1.9309], [2.6,~200.0,~200.0,~200.0,397.0,~200.0,148.0,~200.0,~200.0,~200.0,~200.0,~200.0], [4.0,1418.0,542.0,19.5,231.0,681.0,129.0,1890.0,1418.0,19.2,35.7,0.7826], [3.4,1324.0,~200.0,14.4,492.0,540.0,177.0,1441.0,1319.0,18.9,34.4,0.7436], [3.6,1400.0,~200.0,21.2,366.0,520.0,100.0,1916.0,1465.0,27.7,39.2,1.4356], [0.6,844.0,~200.0,2.5,100.0,2095.0,63.0,1064.0,757.0,8.3,80.4,0.8861], [2.2,1100.0,~200.0,6.0,327.0,768.0,172.0,1070.0,1158.0,6.3,59.8,0.5761], [2.6,1059.0,~200.0,4.7

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def calculate_metrics(y_true, y_pred):
    """Return ``(mse, rmse, mae, r2)`` comparing predictions with true targets."""
    mse = mean_squared_error(y_true, y_pred)
    rmse, mae = np.sqrt(mse), mean_absolute_error(y_true, y_pred)
    return mse, rmse, mae, r2_score(y_true, y_pred)

# Read the concrete dataset (first row is taken as the header).
data = pd.read_csv('Concrete_Data.csv')

# Every column but the last is a feature; the last column is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

def clean_numeric(value):
    """Normalise a decimal-comma string; pass any non-string through untouched.

    For strings, every ``,`` is replaced by ``.`` and surrounding whitespace is
    stripped, so the result can be parsed as a float. The cleaned value is
    still a string; no numeric conversion happens here.
    """
    if not isinstance(value, str):
        return value
    with_dots = value.replace(',', '.')
    return with_dots.strip()

# Apply cleaning and conversion to the dataset.
# Column-wise Series.map replaces DataFrame.applymap, which recent pandas
# versions deprecate (it emitted a FutureWarning in this cell's output); the
# element-wise result is the same.
X = X.apply(lambda column: column.map(clean_numeric))
y = y.map(clean_numeric)

# Convert all data to float
X = X.astype(float)
y = y.astype(float)

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the KNN Regressor model (7 neighbours, Euclidean distance)
model = KNeighborsRegressor(n_neighbors=7, metric='euclidean')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate the metrics
mse, rmse, mae, r2 = calculate_metrics(y_test, y_pred)

# Display the metrics
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")
print("-" * 30)

def convert_to_cpn_format(X):
    """Serialise a feature matrix as a CPN Tools token (``1`` + backtick + nested list).

    Accepts a ``pandas.DataFrame`` as before and now also a NumPy array or
    nested list, since the input is passed through ``np.asarray``. Numbers are
    written as in Python's list repr, except that any minus sign (number or
    exponent) becomes ``~``, the CPN ML negation symbol, and a ``+`` exponent
    sign is dropped. Non-negative DataFrame input gives the same text as before.

    Args:
        X: DataFrame, NumPy array or nested list of numbers.

    Returns:
        The token as a string.
    """
    X_list = np.asarray(X).tolist()

    def _render(item):
        if isinstance(item, list):
            return "[" + ", ".join(_render(child) for child in item) + "]"
        text = repr(item)
        if isinstance(item, (int, float)) and not isinstance(item, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + _render(X_list)

def convert_labels_to_cpn_format(y):
    """Serialise 1-D target values as a CPN Tools token ``1`[a,b,c]``.

    Numeric values have each ``-`` replaced by ``~`` (CPN ML negation) and a
    ``+`` exponent sign removed; non-negative values are written as before.

    Args:
        y: Object exposing ``tolist()`` (pandas Series or NumPy array).

    Returns:
        The token as a string, items joined by commas without spaces.
    """
    def _render(value):
        text = str(value)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            text = text.replace("-", "~").replace("e+", "e")
        return text

    return "1`" + "[" + ",".join(_render(value) for value in y.tolist()) + "]"

# Serialise the feature splits (DataFrames) and the target splits (Series).
X_train_cpn, X_test_cpn = convert_to_cpn_format(X_train), convert_to_cpn_format(X_test)
y_train_cpn, y_test_cpn = convert_labels_to_cpn_format(y_train), convert_labels_to_cpn_format(y_test)

# Echo each serialised split under its heading.
print("\nTraining Set in CPN Tools format:", X_train_cpn, sep="\n")
print("\nTesting Set in CPN Tools format:", X_test_cpn, sep="\n")
print("\nTraining Set Labels in CPN Tools format:", y_train_cpn, sep="\n")
print("\nTesting Set Labels in CPN Tools format:", y_test_cpn, sep="\n")

# Persist each split to its own text file.
for path, cpn_text in (
    ("X_train_cpn_Concrete_Data.txt", X_train_cpn),
    ("X_test_cpn_Concrete_Data.txt", X_test_cpn),
    ("y_train_cpn_Concrete_Data.txt", y_train_cpn),
    ("y_test_cpn_Concrete_Data.txt", y_test_cpn),
):
    with open(path, "w") as file:
        file.write(cpn_text)

MSE: 77.27214723598176
RMSE: 8.790457737568719
MAE: 6.995922330097088
R²: 0.7001199994011011
------------------------------

Training Set in CPN Tools format:
1`[[158.6, 148.9, 116.0, 175.1, 15.0, 953.3, 719.7, 28.0], [424.0, 22.0, 132.0, 178.0, 8.5, 822.0, 750.0, 28.0], [275.1, 0.0, 121.4, 159.5, 9.9, 1053.6, 777.5, 3.0], [252.0, 97.0, 76.0, 194.0, 8.0, 835.0, 821.0, 28.0], [168.9, 42.2, 124.3, 158.3, 10.8, 1080.8, 796.2, 3.0], [181.4, 0.0, 167.0, 169.6, 7.6, 1055.6, 777.8, 28.0], [154.8, 183.4, 0.0, 193.3, 9.1, 1047.4, 696.7, 28.0], [178.0, 129.8, 118.6, 179.9, 3.6, 1007.3, 746.8, 56.0], [212.0, 141.3, 0.0, 203.5, 0.0, 973.4, 750.0, 90.0], [102.0, 153.0, 0.0, 192.0, 0.0, 887.0, 942.0, 3.0], [134.7, 0.0, 165.7, 180.2, 10.0, 961.0, 804.9, 28.0], [251.4, 0.0, 118.3, 188.5, 5.8, 1028.4, 757.7, 56.0], [350.0, 0.0, 0.0, 203.0, 0.0, 974.0, 775.0, 14.0], [362.6, 189.0, 0.0, 164.9, 11.6, 944.7, 755.8, 28.0], [491.0, 26.0, 123.0, 210.0, 3.9, 882.0, 699.0, 3.0], [275.0, 180.0, 120.0, 162.0, 10.

  X = X.applymap(clean_numeric)


In [15]:
# Features: read with ',' as decimal mark and drop the first column
# (assumed to be an index or unused identifier).
X = pd.read_csv('imputed_features_filtrado_3_classes.csv', decimal=',').iloc[:, 1:]

# Labels: drop the first column the same way, then take the first remaining
# column (assumed to be the only label column) as integer category codes.
y = pd.read_csv('rotulos_robot.csv').iloc[:, 1:]
y = y.iloc[:, 0].astype('category').cat.codes

# Force every feature to numeric; unparseable cells become NaN.
X = X.apply(pd.to_numeric, errors='coerce')

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Keep 4 decimal places in both feature splits.
X_train, X_test = X_train.round(4), X_test.round(4)

# 1-nearest-neighbour classifier with Euclidean distance.
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Per-class report with 4 decimal places.
report = classification_report(y_test, y_pred, digits=4)
print("Classification report:", report, sep="\n")

def convert_negative_to_tilde(value):
    """Render a number as text, writing ``~`` wherever Python prints ``-``.

    The replacement is done on the text rather than by testing ``value < 0``.
    That comparison misses negative zero: ``-0.0`` (which rounding a tiny
    negative number produces) is not ``< 0`` yet still prints a minus sign.
    Negative exponents (``1e-05``) are converted too, and a ``+`` exponent
    sign is dropped. Ordinary values give the same text as before.

    Args:
        value: A Python or NumPy numeric scalar.

    Returns:
        The value's ``str`` form with the substitutions applied.
    """
    return str(value).replace("-", "~").replace("e+", "e")

def convert_to_cpn_format(X):
    """Serialise a feature matrix as a CPN Tools token.

    NumPy arrays are converted with ``tolist()``, other inputs through
    ``.values.tolist()``. Each value is rendered by
    ``convert_negative_to_tilde``. Values inside a sample are joined by ``,``
    and samples by ``, ``; the token is prefixed with ``1`` and a backtick.
    """
    samples = X.values.tolist() if not isinstance(X, np.ndarray) else X.tolist()
    as_text = [list(map(convert_negative_to_tilde, sample)) for sample in samples]
    bracketed = ["[" + ",".join(cells) + "]" for cells in as_text]
    return "1`[" + ", ".join(bracketed) + "]"

def convert_labels_to_cpn_format(y):
    """Serialise an iterable of labels as a CPN Tools token ``1`[a,b,c]``.

    Each label is rendered by ``convert_negative_to_tilde``; the pieces are
    joined by commas without spaces.
    """
    rendered = list(map(convert_negative_to_tilde, y))
    return "1`[" + ",".join(rendered) + "]"

# Serialise the rounded feature splits from their underlying NumPy arrays.
X_train_cpn = convert_to_cpn_format(X_train.values)
X_test_cpn = convert_to_cpn_format(X_test.values)

# Converting labels to CPN Tools list format
y_train_cpn = convert_labels_to_cpn_format(y_train.values)
y_test_cpn = convert_labels_to_cpn_format(y_test.values)

print("\nTraining set in CPN Tools format:")
print(X_train_cpn)

print("\nTest set in CPN Tools format:")
print(X_test_cpn)

print("\nLabels of the training set in CPN Tools list format:")
print(y_train_cpn)

print("\nLabels of the test set in CPN Tools list format:")
print(y_test_cpn)

# Saving training and testing sets to text files
with open("X_train_cpn_port.txt", "w") as file:
    file.write(X_train_cpn)

with open("X_test_cpn_port.txt", "w") as file:
    file.write(X_test_cpn)

with open("y_train_cpn_port.txt", "w") as file:
    file.write(y_train_cpn)

# Fixed filename typo ("pot" -> "port") so all four files share one suffix.
with open("y_test_cpn_port.txt", "w") as file:
    file.write(y_test_cpn)

Classification report:
              precision    recall  f1-score   support

           0     0.9375    0.9574    0.9474        47
           1     0.9259    1.0000    0.9615        25
           2     1.0000    0.8125    0.8966        16

    accuracy                         0.9432        88
   macro avg     0.9545    0.9233    0.9352        88
weighted avg     0.9456    0.9432    0.9422        88


Training set in CPN Tools format:
1`[[12.271,71.8,~7.6,1.0,17.5,10.8571,~26.0675,12.2533,47.4342,15.1429,~2.4775,0.0,1.25], [76.509,~7.0,4.2,0.5,2158.2222,7.7857,~12.0246,~143.2918,40.3361,10.5714,2.2034,0.0,0.25], [23.8653,54.0,3.0,~0.25,5.4722,4.0,~20.1859,~29.9058,23.5372,9.7143,~7.9757,0.0,0.2222], [36.77,6.0,7.2,~2.3333,2.2222,5.2857,~18.6774,~4.0451,24.0416,40.1429,~48.6563,0.2857,2.16], [4.7418,13.0,~5.8,0.0,2.7344,4.5,6.0884,9.2533,22.561,6.1429,8.552,0.0,0.0], [48.0986,~2.0,6.8,~0.5,790.4,5.5,4.3592,24.3156,32.4191,14.4286,~4.8722,0.0,0.0], [10.8454,76.0,~19.6,1.0,0.8889,3.6429,0