In [None]:
import numpy as np
import matplotlib.pyplot as plt

# ----- make nice figures -----
# Global matplotlib styling: higher-resolution figures and a custom colour cycle.
import matplotlib as mpl
from cycler import cycler

COLORS = ['#F00D2C', '#242482', '#0071BE', '#4E8F00', '#553C67', '#DA5319']
default_cycler = cycler(color=COLORS)
mpl.rcParams.update({
    'figure.dpi': 200,
    'axes.prop_cycle': default_cycler,
})
# -----------------------------

In [None]:
# Load the data set; column 0 is used as the class label and the remaining
# columns as features (see the slicing below).
data = np.loadtxt('data/perovskite_data.txt')

# Shuffle the rows with a fixed seed so that the membership of the
# train/validation/test splits is the same on every Restart & Run All.
SEED = 0
rng = np.random.default_rng(SEED)
I_perm = rng.permutation(len(data))
data = data[I_perm, :]

# Class data - whether we form Perovskite or not
c = data[:, 0]

# Extract all but the 0-th column
X = data[:, 1:]

# Normalize data: zero mean and unit standard deviation per feature column.
# NOTE(review): the mean/std are computed over ALL rows, including the rows
# that later become the validation and test splits - confirm this is intended.
X = (X - np.mean(X, axis=0))/np.std(X, axis = 0)

In [None]:
# Split sizes: 80% of 80% for training, 20% of 80% for validation,
# and whatever rows remain for testing.
num_points = len(c)
num_train = int(0.8*0.8*num_points)
num_valid = int(0.8*0.2*num_points)
num_test = num_points - num_train - num_valid

# Row index where the validation block ends and the test block begins
valid_end = num_train + num_valid

# Consecutive, non-overlapping row blocks of the (already shuffled) data
X_train, c_train = X[:num_train, :], c[:num_train]
X_valid, c_valid = X[num_train:valid_end, :], c[num_train:valid_end]
X_test, c_test = X[valid_end:, :], c[valid_end:]

print(f"         Total data size: {num_points}")
print(f"  Training data set size: {num_train}")
print(f"Validation data set size: {num_valid}")
print(f"   Testing data set size: {num_test}")

# Bagging

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html

In [None]:
from sklearn.ensemble import BaggingClassifier

# First attempt: a small bagging ensemble with hand-picked hyperparameters
bagging_kwargs = dict(n_estimators=3, max_samples=0.1)
model = BaggingClassifier(**bagging_kwargs)
model.fit(X=X_train, y=c_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

# Predict on the validation split
c_valid_model = model.predict(X_valid)

# Validation metrics: precision, recall, accuracy and F1
precision = precision_score(c_valid, c_valid_model)
recall = recall_score(c_valid, c_valid_model)
acc = accuracy_score(c_valid, c_valid_model)
f1 = f1_score(c_valid, c_valid_model)

print("Bagging:")
print("Precision = " + str(precision))
print("   Recall = " + str(recall))
print(" Accuracy = " + str(acc))
# Report the F1 score itself (not the accuracy) under the F1 label
print("       F1 = " + str(f1))

In [None]:
# Hyperparameter grid: ensemble size x value passed as max_samples
n_estimators = range(1, 10)
max_samples = np.linspace(0.05, 1.0, 10)

# accuracies[row, col] is the validation accuracy for
# (n_estimators[row], max_samples[col])
accuracies = np.zeros((len(n_estimators), len(max_samples)))

# Visit every grid cell in row-major order
for row, col in np.ndindex(*accuracies.shape):
    model = BaggingClassifier(n_estimators=n_estimators[row],
                              max_samples=max_samples[col])
    model.fit(X_train, c_train)
    c_valid_model = model.predict(X_valid)
    accuracies[row, col] = accuracy_score(c_valid, c_valid_model)

In [None]:
# Position of the highest validation accuracy in the flattened grid
ij_max = accuracies.argmax()
print(ij_max)

In [None]:
# (i, j) grid position of the best validation accuracy.
# Derived directly from `accuracies` rather than from the previous value of
# `ij_max`, so re-running this cell always yields the same (i, j) pair.
ij_max = np.unravel_index(np.argmax(accuracies), accuracies.shape)
print(ij_max)

In [None]:
# Hyperparameters at the best grid position found above
best_n = n_estimators[ij_max[0]]
best_s = max_samples[ij_max[1]]

# Refit a bagging ensemble with the selected hyperparameters
model = BaggingClassifier(n_estimators = best_n, max_samples = best_s)
model.fit(X_train, c_train)
c_valid_model = model.predict(X_valid)

# Score the refitted model on the validation split
precision = precision_score(c_valid, c_valid_model)
recall = recall_score(c_valid, c_valid_model)
acc = accuracy_score(c_valid, c_valid_model)
f1 = f1_score(c_valid, c_valid_model)

print("Bagging:")
print("Precision = " + str(precision))
print("   Recall = " + str(recall))
print(" Accuracy = " + str(acc))
# Report the F1 score itself (not the accuracy) under the F1 label
print("       F1 = " + str(f1))

# Random Forests

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (3 estimators) fitted on the training split
forest_kwargs = dict(n_estimators=3)
model = RandomForestClassifier(**forest_kwargs)
model.fit(X=X_train, y=c_train)

In [None]:
# Predict on the validation split
c_valid_model = model.predict(X_valid)

# Validation metrics: precision, recall, accuracy and F1
precision = precision_score(c_valid, c_valid_model)
recall = recall_score(c_valid, c_valid_model)
acc = accuracy_score(c_valid, c_valid_model)
f1 = f1_score(c_valid, c_valid_model)

print("Random Forest:")
print("Precision = " + str(precision))
print("   Recall = " + str(recall))
print(" Accuracy = " + str(acc))
# Report the F1 score itself (not the accuracy) under the F1 label
print("       F1 = " + str(f1))

In [None]:
# Per-feature importances reported by the fitted model
importances = model.feature_importances_

# Labels for the horizontal bars, one per feature column
feature_names = [
    "A-EN - O-EN", "rA/rO", "A-ionicity", "dA-O",
    "B-EN - O-EN", "rB/rO", "B-ionicity", "dB-O",
    "dA-O/dB-O", "A-EN - B-EN", "rA/rB", "t_BV", "t_IR", "GII",
]

# Bars sit at every second integer position (0, 2, 4, ...)
x_plot = list(range(0, 2 * len(importances), 2))

plt.barh(x_plot, importances)
plt.yticks(x_plot, feature_names)
plt.grid(axis='x')
plt.xlabel('Importance')

# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with the library's default settings, fitted on the training split
model = AdaBoostClassifier()
model.fit(X=X_train, y=c_train)

In [None]:
# Predict on the validation split
c_valid_model = model.predict(X_valid)

# Validation metrics: precision, recall, accuracy and F1
precision = precision_score(c_valid, c_valid_model)
recall = recall_score(c_valid, c_valid_model)
acc = accuracy_score(c_valid, c_valid_model)
f1 = f1_score(c_valid, c_valid_model)

print("Adaboost:")
print("Precision = " + str(precision))
print("   Recall = " + str(recall))
print(" Accuracy = " + str(acc))
# Report the F1 score itself (not the accuracy) under the F1 label
print("       F1 = " + str(f1))

In [None]:
# Base learners that make up the fitted ensemble
individual_models = model.estimators_

# Validation accuracy of each base learner evaluated on its own
individual_acc = np.zeros(len(individual_models))
for k, weak_learner in enumerate(individual_models):
    # predict on the validation split
    raw_pred = weak_learner.predict(X_valid)

    # relabel any prediction equal to 0 as -1 before scoring
    # NOTE(review): presumably the class labels are -1/+1 - confirm; if the
    # labels were 0/1 this relabelling would corrupt the predictions
    c_valid_model = np.where(raw_pred == 0, -1, raw_pred)

    individual_acc[k] = accuracy_score(c_valid, c_valid_model)

In [None]:
# Histogram of the per-estimator validation accuracies
plt.hist(individual_acc, rwidth=0.9)
plt.ylabel('Frequency')
plt.xlabel('Accuracy')
plt.title('Adaboost: individual model accuracies')

In [None]:
# Average validation accuracy over the individual estimators
print(individual_acc.mean())

In [None]:
# Comparison model: a 50-estimator bagging ensemble that reuses the
# max_samples value selected by the grid search
model = BaggingClassifier(max_samples=best_s, n_estimators=50)
model.fit(X=X_train, y=c_train)

In [None]:
# get the individual classifiers
individual_models = model.estimators_

# Validation accuracy of every base estimator of the bagging ensemble
individual_acc = np.zeros(len(individual_models))
for i, model_i in enumerate(individual_models):
    # predict on the validation split
    encoded_pred = model_i.predict(X_valid)

    # BaggingClassifier fits its base estimators on label *indices*
    # (0, 1, ...), so translate the predictions back to the original labels
    # through the ensemble's classes_ instead of hard-coding 0 -> -1; for
    # -1/+1 labels this yields the same values as before.
    c_valid_model = model.classes_[encoded_pred.astype(int)]

    # accuracy
    individual_acc[i] = accuracy_score(c_valid, c_valid_model)

In [None]:
# Average validation accuracy over the individual estimators
print(individual_acc.mean())

In [None]:
# Histogram of the per-estimator validation accuracies
plt.hist(individual_acc, rwidth=0.9)
plt.ylabel('Frequency')
plt.xlabel('Accuracy')
plt.title('Bagging: individual model accuracies')