In [None]:
import pandas as pd # type: ignore
import json
import random

# Set the dataset type and other configurations
DATASET_TYPE = "general"
USE_DYNAMIC_PROMPT = True
N_CLASSES = 3  # 2 = binary, 3 = ternary, 4 = four-class (the values the encoder supports)

# CSV file backing each supported dataset type
CSV_FILES = {
    "general": "KNN_final_imputed_data.csv",
    "numerical": "Binary_GIST_SE_8_DT.csv",
}

# Fail early with a clear message instead of a NameError on `csv_file` further down
if DATASET_TYPE not in CSV_FILES:
    raise ValueError(
        f"Unknown DATASET_TYPE {DATASET_TYPE!r}; expected one of {sorted(CSV_FILES)}"
    )
csv_file = CSV_FILES[DATASET_TYPE]

data = pd.read_csv(csv_file, encoding="utf-8")

# NOTE(review): "\u00b0C" and "°C" are the same Python string, so this replace is
# currently a no-op. Presumably it was meant to repair a mis-encoded degree sign in
# the headers -- confirm the intended source text before relying on it.
data.columns = [col.replace("\u00b0C", "°C") for col in data.columns]

# Label column; every other column of the frame is treated as an input attribute
target_column = "No. of Graphene Layers"
attribute_columns = [name for name in data.columns if name != target_column]

# Unit suffix appended after an attribute's value in the prompt (general dataset only).
# Three columns have their own unit; H2, CH4, C2H4, Ar and C2H2 all share "sccm".
units = {
    "Pressure (mbar)": "mbar",
    "Temperature (°C)": "°C",
    "Growth Time (min)": "min",
    **{name: "sccm" for name in ("H2", "CH4", "C2H4", "Ar", "C2H2")},
}

if DATASET_TYPE == "general":
    # Function to encode graphene layers
    def encode_graphene_layers(data, n_classes=2, column=None):
        """Encode the layer-count column as integer class labels.

        The entries 'ML' and 'Unknown' count as 10 layers; any other value that
        cannot be parsed as a number (including missing values) counts as 0.

        Class boundaries:
            n_classes=2: 0 if layers <= 1, otherwise 1
            n_classes=3: 0 if layers < 1.5, 1 if layers < 2.5, otherwise 2
            n_classes=4: as for 3, but 2 if layers < 3.5, otherwise 3

        Args:
            data: DataFrame containing the layer-count column. It is not modified.
            n_classes: Number of classes to produce (2, 3 or 4).
            column: Name of the column to encode. Defaults to the module-level
                `target_column`.

        Returns:
            A copy of `data` whose layer-count column holds the class labels
            with 'category' dtype.

        Raises:
            ValueError: If `n_classes` is not 2, 3 or 4.
        """
        if n_classes not in (2, 3, 4):
            # Previously an unsupported value fell through every branch and the raw
            # layer counts were silently returned as the "classes".
            raise ValueError(f"n_classes must be 2, 3 or 4, got {n_classes!r}")
        if column is None:
            column = target_column

        raw = data[column]
        layers = pd.to_numeric(raw, errors='coerce')
        # 'ML' / 'Unknown' are coerced to NaN above; give them their 10-layer value
        # before the remaining NaNs are turned into 0.
        layers = layers.mask(raw.isin(['ML', 'Unknown']), 10.0).fillna(0)

        if n_classes == 2:
            labels = (layers > 1).astype(int)
        else:
            # The class index equals the number of lower bounds the value has reached.
            lower_bounds = (1.5, 2.5, 3.5)[:n_classes - 1]
            labels = sum((layers >= bound).astype(int) for bound in lower_bounds)

        # Work on a copy so the caller's frame keeps its raw layer counts
        encoded = data.copy()
        encoded[column] = labels.astype('category')
        return encoded

    # General dataset only: rebind `data` to the frame returned by the encoder so the
    # target column holds N_CLASSES class labels from here on.
    data = encode_graphene_layers(data, n_classes=N_CLASSES)

print(f"Final dataset size after processing: {len(data)}")

# The system prompt is the same for every row, so build it once instead of per row.
dynamic_system_prompt = (
    "You are an expert in Chemical Vapor Deposition (CVD) of Graphene with a strong "
    "understanding of process variables. Your task is to predict the number of graphene layers "
    "based on the following attributes: "
    + ", ".join(attribute_columns) +
    ". The target variable to predict is: " + target_column + "."
)
constant_system_prompt = (
    "You are an expert in Chemical Vapor Deposition (CVD) of Graphene. "
    "Your task is to predict the number of graphene layers based on process variables."
)
# Only the general dataset honours USE_DYNAMIC_PROMPT; any other dataset type always
# gets the dynamic prompt.
if DATASET_TYPE == "general" and not USE_DYNAMIC_PROMPT:
    system_prompt = constant_system_prompt
else:
    system_prompt = dynamic_system_prompt

# Alternative system prompt for the numerical dataset (disabled):
#   "You are an expert in data-driven prediction. The dataset consists of pre-processed, "
#   "feature-engineered, and normalized attributes extracted using advanced techniques for graphene CVD. "
#   "The values are unitless, transformed features. "
#   "Your task is to predict the number of graphene layers based on these featurized attributes."

# Units are appended to attribute values for the general dataset only
append_units = DATASET_TYPE == "general"

# One chat-format record (system / user / assistant) per dataset row
jsonl_data = []

for _, row in data.iterrows():
    # User prompt: "<column> is <value>[ <unit>]" for every attribute, comma-separated
    attributes_description = [
        f"{col} is {row[col]} {units[col]}" if append_units and col in units
        else f"{col} is {row[col]}"
        for col in attribute_columns
    ]
    user_prompt = ", ".join(attributes_description)

    # Assistant response: the target value followed by a period
    assistant_response = f"{row[target_column]}."

    jsonl_data.append({
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": assistant_response}
        ]
    })

# Shuffle with a fixed seed so the train/validation split is the same on every run.
# A dedicated Random instance leaves the global `random` state untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(jsonl_data)

# Split the data into train (80%) and validation (20%)
split_index = int(0.8 * len(jsonl_data))
train_data, validation_data = jsonl_data[:split_index], jsonl_data[split_index:]

# Output paths for the two splits
train_gpt = "train_dataset.jsonl"
validation_gpt = "validation_dataset.jsonl"

# Write each split as JSON Lines: one record per line, UTF-8, non-ASCII kept verbatim
for out_path, records in ((train_gpt, train_data), (validation_gpt, validation_data)):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )

print(f"Train JSONL file has been created: {train_gpt}")
print(f"Validation JSONL file has been created: {validation_gpt}")

# The constant prompt is only ever used for the general dataset with
# USE_DYNAMIC_PROMPT disabled; every other combination uses the dynamic prompt.
prompt_kind = "Dynamic" if (USE_DYNAMIC_PROMPT or DATASET_TYPE != "general") else "Constant"
print(f"System prompt used: {prompt_kind}")

# Name each supported class count explicitly so 4 classes is not reported as "Binary"
class_names = {2: "Binary", 3: "Ternary", 4: "Quaternary"}
print(f"Classification type used: {class_names.get(N_CLASSES, f'{N_CLASSES}-class')}")
print(f"Dataset type used: {DATASET_TYPE.capitalize()}")


Final dataset size after processing: 164
Train JSONL file has been created: train_dataset.jsonl
Validation JSONL file has been created: validation_dataset.jsonl
System prompt used: Dynamic
Classification type used: Ternary
Dataset type used: General


In [None]:
%pip install openai



In [None]:
import os
from getpass import getpass

# Never store the key in the notebook. Reuse a key already present in the environment
# (the old code overwrote it with a blank placeholder); otherwise ask for it without
# echoing the input.
if not os.environ.get("OPENAI_API_KEY", "").strip():
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ").strip()

In [None]:
import json
import openai

# Initialize the OpenAI client
client = openai.OpenAI()

# Path to your JSONL file
jsonl_file_path = "validation_dataset.jsonl"

# Fine-tuned model that answers the validation prompts
FINE_TUNED_MODEL = "ft:gpt-4o-mini-2024-07-18:cmrlresearchlab:knn-grternarylayers:B0foOlYg"

# One entry per record, in file order. A failed request is stored as None so that
# responses[i] always belongs to the i-th record; retry or handle those entries
# before scoring.
responses = []

# Read the JSONL file and query the model once per record
with open(jsonl_file_path, "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file

        # `record` rather than `data`, so the DataFrame from the first cell survives
        record = json.loads(line)

        # Send only the system and user messages; the assistant message is the answer
        prompt_messages = [
            msg for msg in record["messages"] if msg["role"] in ("system", "user")
        ]

        try:
            completion = client.chat.completions.create(
                model=FINE_TUNED_MODEL,
                messages=prompt_messages
            )
            responses.append(completion.choices[0].message.content)
        except openai.OpenAIError as e:
            print(f"Error processing entry on line {line_number}: {e}")
            responses.append(None)

# Print all collected responses
print(responses)

['0.', '2.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.', '0.']


In [None]:
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


def _parse_label(text):
    """Convert a label such as "2.", "2" or "2.0" to the integer 2.

    Only the trailing sentence period is removed. Deleting every "." (as the old
    code did) turns "2.0" into 20.

    Raises:
        ValueError: If the text is not a whole number.
        AttributeError: If `text` is not a string (e.g. None).
    """
    value = float(text.strip().rstrip("."))
    if not value.is_integer():
        raise ValueError(f"Label is not a whole number: {text!r}")
    return int(value)


# User-defined choice: "binary" or "ternary"
classification_type = input("Enter classification type (binary/ternary): ").strip().lower()

# Validate input
if classification_type not in ["binary", "ternary"]:
    raise ValueError("Invalid input! Please enter 'binary' or 'ternary'.")

# True labels, in file order
true_labels = []

# Read the JSONL file and extract the true label (assistant message) of each record
with open(jsonl_file_path, "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines

        # `record` rather than `data`, so the DataFrame from the first cell survives
        record = json.loads(line)
        true_label = next((msg["content"] for msg in record["messages"] if msg["role"] == "assistant"), None)

        if true_label is not None:
            true_labels.append(_parse_label(true_label))

# Metrics are only meaningful when prediction i belongs to record i
if len(responses) != len(true_labels):
    raise ValueError(
        f"Got {len(responses)} predictions for {len(true_labels)} true labels; "
        "they no longer line up, so re-run the inference step before scoring."
    )

# Ensure predicted labels are also integers
predicted_labels = []
for index, pred in enumerate(responses):
    try:
        predicted_labels.append(_parse_label(pred))
    except (AttributeError, TypeError, ValueError) as exc:
        raise ValueError(
            f"Prediction {index} is missing or is not a class label: {pred!r}"
        ) from exc

# Binary problems use sklearn's 'binary' averaging; otherwise average over classes
average_method = "binary" if classification_type == "binary" else "macro"

# zero_division=0 gives the same scores as sklearn's default but without a warning
# when a class is never predicted or never occurs.
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average=average_method, zero_division=0)
recall = recall_score(true_labels, predicted_labels, average=average_method, zero_division=0)
f1 = f1_score(true_labels, predicted_labels, average=average_method, zero_division=0)
report = classification_report(true_labels, predicted_labels, zero_division=0)

# Print results
print(f"\nClassification Type: {classification_type.capitalize()}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:\n", report)