# Try it Yourself!

- Using the MPRester API find 2 element oxides that are stable. Sample at least 5 different properties including band gap. Don't forget to clean the data!
- If the band gap is between 0.5-3 then change the value to 1 to signifiy a semiconductor. If it's outside that range change the value to 0 to signify it's not a semiconductor (a metal or insulator).
- Set up a NN to classify if something is a semiconductor or not. Make sure to create a train test split for validation!
- Perform hyperparameter tuning on the model and compare the performance from pre-tuning to post-tuning

In [1]:
import os
import torch
import random
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from mp_api.client import MPRester
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from torch.utils.data import DataLoader, TensorDataset
from mp_api.client import MPRester

In [2]:
# api key
filename = r'api.txt'

def get_file_contents(filename):
    try:
        with open(filename, 'r') as f:
            return f.read().strip()
    except FileNotFoundError:
        print("'%s' file not found" % filename)
        
api_key=get_file_contents(filename)

In [9]:
mpr = MPRester(api_key)
entries = mpr.materials.summary.search(
        chemsys="O-Li,O-Na,O-K,O-Be,O-Mg,O-Ca,O-Si,O-Al,O-Hf,O-Ti,O-Zr,O-Ga,O-In,O-Zn,O-Ta,O-Ge",
        band_gap=(0.000001,50),
        fields=["formula_pretty", "band_gap", "density", "formation_energy_per_atom", "volume","energy_above_hull"]
    )

Retrieving SummaryDoc documents:   0%|          | 0/676 [00:00<?, ?it/s]

In [19]:
df = pd.DataFrame(columns=('formula_pretty', 'band_gap', 'density', 'formation_energy_per_atom', 'volume','energy_above_hull'))
for i, entry in enumerate(entries):
    df.loc[i] = [
        entry.formula_pretty,
        entry.band_gap,
        entry.density,
        entry.formation_energy_per_atom,
        entry.volume,
        entry.energy_above_hull
    ]
stable_df = df.loc[df["energy_above_hull"] < 0.1]

In [25]:
stable_df["semiconductor"] = stable_df['band_gap'].apply(lambda x: 1 if x < 3 and x > 0.5 else 0)
target = "semiconductor"
predictors = ['density', 'formation_energy_per_atom', 'volume', 'energy_above_hull']
X = stable_df[predictors].values
y = stable_df[target].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stable_df["semiconductor"] = stable_df['band_gap'].apply(lambda x: 1 if x < 3 and x > 0.5 else 0)


In [26]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
unique_labels = pd.unique(y)
label_mapping = {old_label: new_label for new_label, old_label in enumerate(unique_labels)}
y = pd.Series(y).map(label_mapping).values

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.long)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
batch_size = 128
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
num_classes = len(torch.unique(y))
print("Number of classes:", num_classes) # should be 2 (0 and 1)
input_size = X_train.shape[1]
hidden_size = 128

Number of classes: 2


In [30]:
model = nn.Sequential(
    nn.Linear(input_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, num_classes)
)
criterion = nn.CrossEntropyLoss() # for 0 1 classification
optimizer = optim.Adam(model.parameters(), lr=0.1)
def train_model(num_epochs=50, explicit=True):
    for epoch in range(num_epochs):
        for features, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        if explicit:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
train_model(50)

Epoch 1/50, Loss: 1.3449344635009766
Epoch 2/50, Loss: 0.24967649579048157
Epoch 3/50, Loss: 0.20486865937709808
Epoch 4/50, Loss: 0.31549492478370667
Epoch 5/50, Loss: 0.21626673638820648
Epoch 6/50, Loss: 0.22689318656921387
Epoch 7/50, Loss: 0.16771072149276733
Epoch 8/50, Loss: 0.2092466801404953
Epoch 9/50, Loss: 0.16498033702373505
Epoch 10/50, Loss: 0.2461671382188797
Epoch 11/50, Loss: 0.228485107421875
Epoch 12/50, Loss: 0.19885089993476868
Epoch 13/50, Loss: 0.0990452915430069
Epoch 14/50, Loss: 0.1871366649866104
Epoch 15/50, Loss: 0.2081928551197052
Epoch 16/50, Loss: 0.20718082785606384
Epoch 17/50, Loss: 0.06058437004685402
Epoch 18/50, Loss: 0.15397939085960388
Epoch 19/50, Loss: 0.17496377229690552
Epoch 20/50, Loss: 0.1800684928894043
Epoch 21/50, Loss: 0.1761961579322815
Epoch 22/50, Loss: 0.133975088596344
Epoch 23/50, Loss: 0.11070585250854492
Epoch 24/50, Loss: 0.12472222000360489
Epoch 25/50, Loss: 0.1532943844795227
Epoch 26/50, Loss: 0.15572160482406616
Epoch 27

In [31]:
correct = 0
total = 0
with torch.no_grad():
    for features, labels in test_loader:
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {100 * correct / total}%")

Accuracy: 87.83783783783784%


# Hyperparameter tuning - grid search

In [32]:
hidden_sizes = [64, 128, 256]
learning_rates = [0.001, 0.01, 0.1]
num_epochs_list = [20, 50, 100]
batch_sizes = [64, 128]

best_accuracy = 0
best_params = {}

for hidden_size in hidden_sizes:
    for learning_rate in learning_rates:
        for num_epochs in num_epochs_list:
            for batch_size in batch_sizes:
                train_dataset = TensorDataset(X_train, y_train)
                test_dataset = TensorDataset(X_test, y_test)
                train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
                
                input_size = X_train.shape[1]
                model = nn.Sequential(
                    nn.Linear(input_size, hidden_size),
                    nn.ReLU(),
                    nn.Linear(hidden_size, num_classes)
                )

                criterion = nn.CrossEntropyLoss()
                optimizer = optim.Adam(model.parameters(), lr=learning_rate)

                train_model(num_epochs, explicit=False)

                correct = 0
                total = 0
                with torch.no_grad():
                    for features, labels in test_loader:
                        outputs = model(features)
                        _, predicted = torch.max(outputs.data, 1)
                        total += labels.size(0)
                        correct += (predicted == labels).sum().item()

                accuracy = 100 * correct / total

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params = {
                        'hidden_size': hidden_size,
                        'learning_rate': learning_rate,
                        'num_epochs': num_epochs,
                        'batch_size': batch_size
                    }

print("Best accuracy:", best_accuracy)
print("Best parameters:", best_params)

Best accuracy: 93.24324324324324
Best parameters: {'hidden_size': 64, 'learning_rate': 0.1, 'num_epochs': 50, 'batch_size': 64}


Better performance, let's try random search.
# Hyperparameter tuning - random search

In [33]:
hidden_sizes = [64, 128, 256]
learning_rates = [0.001, 0.01, 0.1]
num_epochs_list = [20, 50, 100]
batch_sizes = [64, 128]

num_samples = 15

best_accuracy = 0
best_params = {}

for _ in range(num_samples):
    hidden_size = random.choice(hidden_sizes)
    learning_rate = random.choice(learning_rates)
    num_epochs = random.choice(num_epochs_list)
    batch_size = random.choice(batch_sizes)

    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    input_size = X_train.shape[1]

    model = nn.Sequential(
        nn.Linear(input_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, num_classes)
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_model(num_epochs, explicit=False)
    
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in test_loader:
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {
            'hidden_size': hidden_size,
            'learning_rate': learning_rate,
            'num_epochs': num_epochs,
            'batch_size': batch_size
        }

print("Best accuracy:", best_accuracy)
print("Best parameters:", best_params)

Best accuracy: 94.5945945945946
Best parameters: {'hidden_size': 256, 'learning_rate': 0.1, 'num_epochs': 50, 'batch_size': 64}
