# Aluminum Alloy Inverse Design by GPT2 with LoRA

In [1]:
import pandas as pd
import torch
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel, 
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import warnings
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Suppress warnings
warnings.filterwarnings("ignore")
# fixed element order
ELEMENTS = ["Ag","Al","B","Be","Bi","Cd","Co","Cr","Cu","Er","Eu","Fe","Ga","Li","Mg",
            "Mn","Ni","Pb","Sc","Si","Sn","Ti","V","Zn","Zr"]


2025-10-04 10:54:49.892126: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759575290.092243      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759575290.151422      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# !python -m pip install --upgrade pip
# !pip install bitsandbytes

# Define Data Format

In [3]:
import pandas as pd
import torch
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel, 
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset

# Load and preprocess data for inverse design
def load_data(csv_path):
    df = pd.read_csv(csv_path,index_col=0).reset_index(drop=True)

    # Handle missing values for mechanical properties
    for col in ['Elongation (%)', 'Tensile Strength (MPa)', 'Yield Strength (MPa)']:
        if col in df.columns:
            df[col] = df[col].fillna(-1)

    # Identify ingredient columns (all numeric except the known property columns)
    exclude_cols = ['Elongation (%)', 'Tensile Strength (MPa)', 'Yield Strength (MPa)', 'Processing']
    ingredient_cols = ELEMENTS

    # Create training texts for inverse design
    texts = ["Given target properties and processing, suggest alloy compositions."]
    for _, row in df.iterrows():
        # Expected properties (input)
        prop_text = (
            f"Target -> Tensile: {row['Tensile Strength (MPa)']}MPa, "
            f"Yield: {row['Yield Strength (MPa)']}MPa, "
            f"Elongation: {row['Elongation (%)']}%"
        )

        # Alloy composition (output)
        ingredients = ", ".join([f"{col}: {row[col]}" for col in ingredient_cols])

        # Processing info
        processing = row['Processing'] if 'Processing' in df.columns else "N/A"

        text = f"{prop_text} under Processing: {processing} | Suggested Composition: {ingredients}"
        texts.append(text)

    return texts


# Define Train and Predict functions

In [4]:
# training 
def train_on_gpu(csv_path):
    print("🚀 Starting GPU training...")
    
    # Clear GPU cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"GPU: {torch.cuda.get_device_name()}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    
    # Load tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    
    # Load model and move to GPU with float16
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    model = model.to('cuda')
    model = model.half()  # Use float16 for GPU efficiency
    
    # Load and preprocess data
    texts = load_data(csv_path)
    print(f"📊 Loaded {len(texts)} training examples")
    
    # Tokenize data
    tokens = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="pt"
    )
    
    # Create dataset
    dataset = Dataset.from_dict(tokens)
    
    # Configure LoRA
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["c_attn", "c_proj"],
        lora_dropout=0.1
    )
    
    # Apply LoRA to model
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    
    # Training arguments optimized for GPU
    training_args = TrainingArguments(
        output_dir="./aluminum-gpu-model",
        num_train_epochs=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        fp16=True,
        logging_steps=50,
        save_steps=200,
        save_total_limit=2,
        report_to="none",
        remove_unused_columns=False
    )
    
    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        processing_class=tokenizer,
    )
    
    # Start training
    print("🎯 Training started...")
    trainer.train()
    
    # Save model
    trainer.save_model()
    print("✅ Model saved to ./aluminum-gpu-model")
    
    return model, tokenizer

# Inference 
def predict(model, tokenizer, processing,tensile_strength, yield_strength, elongation):
    model.eval()
    prop_text = (
        f"Target -> Tensile: {tensile_strength}MPa, "
        f"Yield: {yield_strength}MPa, "
        f"Elongation: {elongation}%"
    )
    input_text = f"{prop_text} under Processing: {processing} | Suggested Composition:"

    inputs = tokenizer(input_text, return_tensors="pt").to('cuda')
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=160,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Evaluation 

In [5]:
import re

def parse_prediction(pred_text):
    """
    Convert model output (string or list) into a numeric list ordered by ELEMENTS.
    """
    values = {el: 0.0 for el in ELEMENTS}

    if isinstance(pred_text, (list, np.ndarray)):
        # assume already numeric list in same order
        for el, val in zip(ELEMENTS, pred_text):
            values[el] = float(val)
    elif isinstance(pred_text, str):
        # parse text like "Al: 85, Mg: 5, Zn: 10"
        matches = re.findall(r"([A-Za-z]+)\s*[:=]\s*([0-9.]+)", pred_text)
        for el, val in matches:
            if el in values:
                values[el] = float(val)
    else:
        raise ValueError("Unknown prediction format")

    return [values[el] for el in ELEMENTS]
def evaluate(model, tokenizer, test_path):
    df_test = pd.read_csv(test_path,index_col=0).reset_index(drop=True)

    exclude_cols = ['Tensile Strength (MPa)', 'Yield Strength (MPa)', 'Elongation (%)', 'Processing']
    ingredient_cols = [c for c in df_test.columns if c not in exclude_cols and c in ELEMENTS]

    y_true, y_pred = [], []
    for _, row in df_test.iterrows():
        # expected properties (inputs)
        tensile = row['Tensile Strength (MPa)']
        yield_strength = row['Yield Strength (MPa)']
        elong = row['Elongation (%)']
        processing = row['Processing']

        # ground-truth composition in fixed order
        true_comp = [row.get(el, 0.0) for el in ELEMENTS]

        # model prediction
        pred_raw = predict(model, tokenizer, processing,tensile, yield_strength, elong)
        pred_comp = parse_prediction(pred_raw)

        y_true.append(true_comp)
        y_pred.append(pred_comp)

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    print("📊 Evaluation results per ingredient:")
    results = {}
    for i, el in enumerate(ELEMENTS):
        mae = mean_absolute_error(y_true[:, i], y_pred[:, i])
        rmse = mean_squared_error(y_true[:, i], y_pred[:, i], squared=False)
        results[el] = {"MAE": mae, "RMSE": rmse}
        print(f"  {el:<3} | MAE: {mae:.4f} | RMSE: {rmse:.4f}")

    overall_mae = np.mean([v["MAE"] for v in results.values()])
    overall_rmse = np.mean([v["RMSE"] for v in results.values()])
    print("\n🔎 Overall performance:")
    print(f"  Avg MAE:  {overall_mae:.4f}")
    print(f"  Avg RMSE: {overall_rmse:.4f}")

    return results

# Run

In [6]:
# Run training
print("🤖 Aluminum Alloy Inverse Design - GPU Training")
model, tokenizer = train_on_gpu("/kaggle/input/alloy-dataset/train.csv")

🤖 Aluminum Alloy Inverse Design - GPU Training
🚀 Starting GPU training...
GPU: Tesla P100-PCIE-16GB
GPU Memory: 17.1 GB


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

📊 Loaded 924 training examples


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475
🎯 Training started...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,2.2397
100,1.5606
150,1.0259
200,0.8032
250,0.6867
300,0.6437
350,0.5911
400,0.56
450,0.5572
500,0.5284


✅ Model saved to ./aluminum-gpu-model


In [7]:
# Test prediction
print("\n🧪 Testing prediction...")
prediction = predict(model, tokenizer, "Solutionised  + Artificially peak aged",300, 250, 12)
print(f"Prediction: {prediction}")


🧪 Testing prediction...
Prediction: Target -> Tensile: 300MPa, Yield: 250MPa, Elongation: 12% under Processing: Solutionised  + Artificially peak aged | Suggested Composition: Ag: 0.0, Al: 0.8834, B: 0.0, Be: 0.0, Bi: 0.0, Cd: 0.0, Co: 0.0, Cr: 0.0025, Cu: 0.0006, Er: 0.0, Eu: 0.0, Fe: 0.0025, Ga: 0.0, Li: 0.0, Mg: 0.01, Mn: 0.0005, Ni: 0.0, Pb: 0.0, Sc: 0.0, Si: 0.0012, Sn: 0.0, Ti: 0.0014, V: 0.0, Zn: 0.0, Zr: 0


In [8]:
evalaute_results = evaluate(model, tokenizer, "/kaggle/input/alloy-dataset/test.csv")
evalaute_results

📊 Evaluation results per ingredient:
  Ag  | MAE: 0.0080 | RMSE: 0.0850
  Al  | MAE: 0.0719 | RMSE: 0.1536
  B   | MAE: 0.0000 | RMSE: 0.0000
  Be  | MAE: 0.0000 | RMSE: 0.0000
  Bi  | MAE: 0.0000 | RMSE: 0.0005
  Cd  | MAE: 0.0000 | RMSE: 0.0002
  Co  | MAE: 0.0001 | RMSE: 0.0008
  Cr  | MAE: 0.0020 | RMSE: 0.0049
  Cu  | MAE: 0.0150 | RMSE: 0.0242
  Er  | MAE: 0.0001 | RMSE: 0.0009
  Eu  | MAE: 0.0000 | RMSE: 0.0001
  Fe  | MAE: 0.0025 | RMSE: 0.0048
  Ga  | MAE: 0.0001 | RMSE: 0.0012
  Li  | MAE: 0.0022 | RMSE: 0.0073
  Mg  | MAE: 0.0139 | RMSE: 0.0192
  Mn  | MAE: 0.0034 | RMSE: 0.0067
  Ni  | MAE: 0.0002 | RMSE: 0.0015
  Pb  | MAE: 0.0001 | RMSE: 0.0005
  Sc  | MAE: 0.0005 | RMSE: 0.0021
  Si  | MAE: 0.0089 | RMSE: 0.0296
  Sn  | MAE: 0.0009 | RMSE: 0.0132
  Ti  | MAE: 0.0013 | RMSE: 0.0034
  V   | MAE: 0.0000 | RMSE: 0.0003
  Zn  | MAE: 0.0119 | RMSE: 0.0267
  Zr  | MAE: 0.0002 | RMSE: 0.0005

🔎 Overall performance:
  Avg MAE:  0.0057
  Avg RMSE: 0.0155


{'Ag': {'MAE': 0.007963203463203464, 'RMSE': 0.08496291653508127},
 'Al': {'MAE': 0.07193608157532465, 'RMSE': 0.15362016966966963},
 'B': {'MAE': 1.7316017316017317e-06, 'RMSE': 1.315903389919538e-05},
 'Be': {'MAE': 1.9913419913419913e-07, 'RMSE': 2.961513594526078e-06},
 'Bi': {'MAE': 4.540939393939394e-05, 'RMSE': 0.0004885501954161164},
 'Cd': {'MAE': 1.9480519480519483e-05, 'RMSE': 0.00018898223650461364},
 'Co': {'MAE': 7.792207792207793e-05, 'RMSE': 0.0007501803534954529},
 'Cr': {'MAE': 0.002041128051948052, 'RMSE': 0.004875043472988771},
 'Cu': {'MAE': 0.015042426482683943, 'RMSE': 0.024167077620625366},
 'Er': {'MAE': 0.0001432900432900433, 'RMSE': 0.0008692684888040666},
 'Eu': {'MAE': 1.2987012987012554e-05, 'RMSE': 0.00011800286151053804},
 'Fe': {'MAE': 0.0024782441545406863, 'RMSE': 0.004814907857889838},
 'Ga': {'MAE': 7.878787878787879e-05, 'RMSE': 0.001151772584575656},
 'Li': {'MAE': 0.002247992554112554, 'RMSE': 0.007328014764451078},
 'Mg': {'MAE': 0.0138887115432

# Save and Upload to Hugging Face Hub

In [9]:
from huggingface_hub import HfApi, HfFolder, login

# Upload to Hugging Face Hub
def upload_to_huggingface(model, tokenizer, model_name, organization=None):
    """
    Upload model and tokenizer to Hugging Face Hub
    
    Args:
        model: Your trained model
        tokenizer: Your tokenizer
        model_name: Name for your model (e.g., "aluminum-alloy-inverse-design")
        organization: Your HF organization name (optional)
    """
    
    # Login to Hugging Face (you'll need an access token)
    # You can get your token from https://huggingface.co/settings/tokens
    login(token="-")  # Replace with your token
    
    # Create full model name
    if organization:
        repo_name = f"{organization}/{model_name}"
    else:
        repo_name = model_name
    
    print(f"🚀 Uploading model to Hugging Face Hub: {repo_name}")
    
    # Save model and tokenizer locally first
    model.save_pretrained("./saved_model")
    tokenizer.save_pretrained("./saved_model")
    
    # Upload using push_to_hub method
    model.push_to_hub(
        repo_id=repo_name,
        commit_message="Add aluminum alloy inverse design model"
    )
    
    tokenizer.push_to_hub(
        repo_id=repo_name,
        commit_message="Add tokenizer for aluminum alloy model"
    )
    
    print(f"✅ Successfully uploaded to: https://huggingface.co/{repo_name}")
# Define your model details
MODEL_NAME = "aluminum-alloy-inverse-design"
# ORGANIZATION = "your-organization"  # Optional

try:
    # Method 1: Using push_to_hub (recommended)
    upload_to_huggingface(model, tokenizer, MODEL_NAME)
    
    # Method 2: Using HfApi (alternative)
    # upload_using_api(MODEL_NAME)
    
except Exception as e:
    print(f"❌ Error uploading model: {e}")

❌ Error uploading model: Invalid user token.
