In [1]:
!git clone https://github.com/icomse/5th_workshop_MachineLearning.git
import os
os.chdir('5th_workshop_MachineLearning/data')
data_dir = os.getcwd()

# Make sure you can see the files we'll need

In [None]:
os.listdir(data_dir)

# Import the python modules we'll use and configure matplotlib



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

plt.style.use(os.path.join(data_dir, 'configs', 'plot_style.mplstyle'))

# Load the data from the ["New Tolerance Factor" paper](https://www.science.org/doi/10.1126/sciadv.aav0693)

In [None]:
fcsv = os.path.join(data_dir, 'perovskites', 'perovskite_data.csv')
df = pd.read_csv(fcsv)

# Fail here, with a clear message, rather than in a later cell if the CSV does
# not carry the columns this notebook reads.
required_columns = ['exp_label', 'is_train', 't',
                    'nA', 'nB', 'nX',
                    'rA (Ang)', 'rB (Ang)', 'rX (Ang)']
missing_columns = [col for col in required_columns if col not in df.columns]
assert not missing_columns, 'perovskite_data.csv is missing columns: %s' % missing_columns

df.head()

# Consider a new feature called the "octahedral factor" (mu = rB/rX)

In [None]:
# octahedral factor: B-site radius divided by X-site radius, added as column 'mu'
df = df.assign(mu=df['rB (Ang)'].div(df['rX (Ang)']))
df.head()

# Visualize how perovskites and nonperovskites segment on a 2D map made of Goldschmidt's tolerance factor, t, and the octahedral factor, mu

In [None]:
# Use the explicit Axes interface so `ax` stays the Axes for the whole cell
# (it used to be overwritten with the return value of every pyplot call).
fig, ax = plt.subplots(figsize=(3, 2.5))

# just look at original training set
df_train = df.loc[df.is_train == 1]

# plot perovskite (+1) class
perov_rows = df_train.exp_label == 1
t_perov = df_train.loc[perov_rows, 't'].values
mu_perov = df_train.loc[perov_rows, 'mu'].values
ax.scatter(t_perov, mu_perov, color='blue', alpha=0.2, marker='o', lw=0, label='perovskite')

# plot nonperovskite (-1) class
nonperov_rows = df_train.exp_label == -1
t_nonperov = df_train.loc[nonperov_rows, 't'].values
mu_nonperov = df_train.loc[nonperov_rows, 'mu'].values
ax.scatter(t_nonperov, mu_nonperov, color='red', alpha=0.2, marker='^', lw=0, label='nonperovskite')

# label
ax.set_ylabel(r'$\mu=\frac{r_B}{r_X}$')
ax.set_xlabel(r'$t=\frac{r_A+r_X}{\sqrt{2}(r_B+r_X)}$')

# trailing semicolon hides the Legend repr in the cell output
ax.legend();

# **Hands-on**: Which feature (t or mu) is more important for this classification problem?

## Guidelines
- determine (quantitatively) whether t or mu is more valuable in separating perovskites from nonperovskites
- consider a decision tree that can only make two decisions (i.e., two linear decision boundaries are drawn)
- plot the decision boundaries

## Hints
- information gain will be useful
- you might restrict the `max_leaf_nodes`
- decision tree models can be visualized with `sklearn.tree.plot_tree`

### First, we'll prepare our feature matrix (X_train) and target array (y_train)

In [None]:
# feature matrix: one row per training compound, columns ordered (t, mu)
X_train = df_train[['t', 'mu']].to_numpy()
# target array: the exp_label column of the same rows
y_train = df_train['exp_label'].to_numpy()

print(y_train.shape)
print(X_train.shape)

### Now, fit a decision tree

In [None]:
# Fix the seed: scikit-learn permutes the candidate features at every split, so
# ties between equally good splits can otherwise be broken differently from run
# to run. max_leaf_nodes=3 limits the tree to two decisions.
dt = DecisionTreeClassifier(criterion='log_loss', max_leaf_nodes=3, random_state=44)
dt.fit(X_train, y_train)

### Let's see what our learned model looks like

In [None]:
# Name the features so nodes read "mu <= ..." instead of "x[1] <= ..."; the order
# matches the X_train columns (t, mu). class_names are given in ascending label
# order (-1, then +1). The trailing semicolon hides the list of annotation objects.
plot_tree(dt, feature_names=['t', 'mu'], class_names=['nonperovskite', 'perovskite']);

### Now, let's quantify the importance of each feature (t and mu)

In [None]:
def impurity(split):
  """
  Calculate the entropy impurity of a node in a two-class problem

  Args:
    split (2-item tuple):
      split[0] = number of samples in the first class
      split[1] = number of samples in the second class

  Returns:
    impurity = -(p0*ln(p0) + p1*ln(p1)), where p0 and p1 are the class
    fractions; 0 for a pure node and 0.0 for an empty node (0, 0)
  """
  n_class0, n_class1 = split
  n_samples = n_class0 + n_class1

  # an empty node has nothing to be impure about; also avoids dividing by zero
  if n_samples == 0:
    return 0.0

  entropy = 0.0
  for n_class in (n_class0, n_class1):
    # p*ln(p) -> 0 as p -> 0, so an absent class contributes nothing; testing the
    # count (not a rounded 1 - p0) keeps this check exact
    if n_class > 0:
      frac_class = n_class / n_samples
      entropy -= frac_class * np.log(frac_class)

  return entropy

### Recall our previous splits (A and B)

In [None]:
# parent node: 40 samples of each class
before_split = (40, 40)

# split A: both children remain mixed
A_left, A_right = (30, 10), (10, 30)

# split B: the right child contains a single class
B_left, B_right = (20, 40), (20, 0)

In [None]:
impurity(before_split)

In [None]:
impurity(A_left)

In [None]:
impurity(A_right)

In [None]:
impurity(B_left)

In [None]:
impurity(B_right)

In [None]:
def information_gain(before_split, after_split_left, after_split_right):
  """
  Information gain of one binary split, scaled by the parent's sample count

  Args:
    before_split (2-item tuple):
      class counts (n_class0, n_class1) in the node being split

    after_split_left (2-item tuple):
      class counts in the child that satisfies the <= criterion

    after_split_right (2-item tuple):
      class counts in the child that satisfies the > criterion

  Returns:
    n_total * (parent impurity - sample-weighted impurity of the two children)
  """
  # impurities first, in parent / left / right order
  parent_impurity, left_impurity, right_impurity = (
      impurity(counts) for counts in (before_split, after_split_left, after_split_right))

  # each child is weighted by its share of the parent's samples
  n_total = np.sum(before_split)
  left_weight = np.sum(after_split_left) / n_total
  right_weight = np.sum(after_split_right) / n_total

  return n_total * (parent_impurity - left_weight * left_impurity - right_weight * right_impurity)

In [None]:
information_gain(before_split, A_left, A_right)

In [None]:
information_gain(before_split, B_left, B_right)

### OK, now back to the perovskites

In [None]:
before_mu_split = (np.count_nonzero(y_train == -1), np.count_nonzero(y_train == 1))

In [None]:
before_mu_split

In [None]:
mu_boundary = 0.427

# boolean masks over the training rows (column 1 of X_train is mu)
mu_above = X_train[:, 1] > mu_boundary
mu_below = X_train[:, 1] <= mu_boundary
label_is_neg = y_train == -1
label_is_pos = y_train == 1

# (count with label -1, count with label +1) on each side of the mu boundary
after_mu_split_right = (np.count_nonzero(mu_above & label_is_neg),
                        np.count_nonzero(mu_above & label_is_pos))

after_mu_split_left = (np.count_nonzero(mu_below & label_is_neg),
                       np.count_nonzero(mu_below & label_is_pos))


In [None]:
after_mu_split_left

In [None]:
after_mu_split_right

In [None]:
before_t_split = after_mu_split_right

In [None]:
t_boundary = 0.835

# only rows in the mu > mu_boundary branch are split on t (column 0 of X_train)
in_mu_branch = X_train[:, 1] > mu_boundary
t_above = in_mu_branch & (X_train[:, 0] > t_boundary)
t_below = in_mu_branch & (X_train[:, 0] <= t_boundary)

# (count with label -1, count with label +1) on each side of the t boundary
after_t_split_right = (np.count_nonzero(t_above & (y_train == -1)),
                       np.count_nonzero(t_above & (y_train == 1)))

after_t_split_left = (np.count_nonzero(t_below & (y_train == -1)),
                      np.count_nonzero(t_below & (y_train == 1)))


In [None]:
# pass the children in the documented (left, right) order of information_gain
IG_mu = information_gain(before_mu_split, after_mu_split_left, after_mu_split_right)

In [None]:
IG_mu

In [None]:
# pass the children in the documented (left, right) order of information_gain
IG_t = information_gain(before_t_split, after_t_split_left, after_t_split_right)

In [None]:
IG_t

### Alternatively, sklearn nicely does this for us already

In [None]:
# same order as the X_train columns the tree was fitted on
features = ['t', 'mu']
importances = dt.feature_importances_
print({name: value for name, value in zip(features, importances)})

### Why the difference compared to our numbers?

In [None]:
# normalise the two hand-computed gains so they sum to 1, in the order (t, mu)
IG_total = IG_mu + IG_t
our_importances = [IG_t / IG_total, IG_mu / IG_total]
print({name: share for name, share in zip(features, our_importances)})

### Visualize our learned decision boundary

In [None]:
# Use the explicit Axes interface so `ax` stays the Axes for the whole cell
# (it used to be overwritten with the return value of every pyplot call).
fig, ax = plt.subplots(figsize=(3, 2.5))

# just look at original training set
df_train = df.loc[df.is_train == 1]

# plot perovskite (+1) class
perov_rows = df_train.exp_label == 1
t_perov = df_train.loc[perov_rows, 't'].values
mu_perov = df_train.loc[perov_rows, 'mu'].values
ax.scatter(t_perov, mu_perov, color='blue', alpha=0.2, marker='o', lw=0, label='perovskite')

# plot nonperovskite (-1) class
nonperov_rows = df_train.exp_label == -1
t_nonperov = df_train.loc[nonperov_rows, 't'].values
mu_nonperov = df_train.loc[nonperov_rows, 'mu'].values
ax.scatter(t_nonperov, mu_nonperov, color='red', alpha=0.2, marker='^', lw=0, label='nonperovskite')

# plot_boundaries: reuse t_boundary / mu_boundary from the information-gain cells
# instead of repeating the literals; 1000 simply runs each line past the axis
# limits set below
ax.plot([t_boundary, 1000], [mu_boundary, mu_boundary], color='black', ls='--')
ax.plot([t_boundary, t_boundary], [mu_boundary, 1000], color='black', ls='--')

# label
ax.set_ylabel(r'$\mu=\frac{r_B}{r_X}$')
ax.set_xlabel(r'$t=\frac{r_A+r_X}{\sqrt{2}(r_B+r_X)}$')
# axis limits span every row of df, not only the training rows
ax.set_ylim(df['mu'].min(), df['mu'].max())
ax.set_xlim(df['t'].min(), df['t'].max())

# trailing semicolon hides the Legend repr in the cell output
ax.legend();

### Would the information gain have been the same if t made the first split instead of mu?

In [None]:
# the root node is the same whichever feature splits first
before_t_split2 = before_mu_split

# first decision: split every training row on t (column 0 of X_train)
t_above = X_train[:, 0] > t_boundary
t_below = X_train[:, 0] <= t_boundary

after_t_split_right2 = (np.count_nonzero(t_above & (y_train == -1)),
                        np.count_nonzero(t_above & (y_train == 1)))
after_t_split_left2 = (np.count_nonzero(t_below & (y_train == -1)),
                       np.count_nonzero(t_below & (y_train == 1)))

# second decision: split only the t > t_boundary branch on mu (column 1)
before_mu_split2 = after_t_split_right2

mu_above_in_branch = (X_train[:, 1] > mu_boundary) & t_above
mu_below_in_branch = (X_train[:, 1] <= mu_boundary) & t_above

after_mu_split_right2 = (np.count_nonzero(mu_above_in_branch & (y_train == -1)),
                         np.count_nonzero(mu_above_in_branch & (y_train == 1)))
after_mu_split_left2 = (np.count_nonzero(mu_below_in_branch & (y_train == -1)),
                        np.count_nonzero(mu_below_in_branch & (y_train == 1)))

In [None]:
# pass the children in the documented (left, right) order of information_gain
IG_t_2 = information_gain(before_t_split2, after_t_split_left2, after_t_split_right2)
IG_mu_2 = information_gain(before_mu_split2, after_mu_split_left2, after_mu_split_right2)

In [None]:
IG_t_2

In [None]:
IG_mu_2

### Not the same! This is part of the instability of decision trees

# Now, let's consider importances for an ensemble of decision trees (random forest)

## This time, we'll use the ionic features (oxidation states and radii)

In [None]:
ions = ['A', 'B', 'X']

# strip the unit suffix from the radius columns, e.g. 'rA (Ang)' -> 'rA'
radius_renames = dict(('r%s (Ang)' % ion, 'r%s' % ion) for ion in ions)
df = df.rename(columns=radius_renames)
df_train = df[df.is_train == 1]

# feature names in the order nA, nB, nX, rA, rB, rX
features = [prefix + ion for prefix in ('n', 'r') for ion in ions]


In [None]:
features

# Fit a generic random forest to these features (again predicting perovskite vs nonperovskite)

In [None]:
# training arrays built from the ionic features
X_train = df_train[features].to_numpy()
y_train = df_train['exp_label'].to_numpy()

# default forest with a fixed seed
rf = RandomForestClassifier(random_state=44)
rf.fit(X_train, y_train)

# Use sklearn to compute the information gain over this forest of decision trees

In [None]:
# the fitted forest's feature_importances_, printed below the feature names
rf_importances = rf.feature_importances_
print(features, rf_importances, sep='\n')

# Let's visualize these

In [None]:
def plot_importances(features_and_their_importances, ylabel='importance'):
    """
    Draw a bar chart of feature importances, largest first, on the current
    matplotlib axes (create the figure/axes before calling)

    Args:
        features_and_their_importances (dict):
            {feature (str) : importance (float)}
        ylabel (str):
            label for the y axis
    Returns:
        the matplotlib Axes the bars were drawn on
    """
    axis_width = 1.5
    maj_tick_len = 6
    fontsize = 14
    bar_color = 'lightblue'
    align = 'center'
    label = '__nolegend__'  # keeps the bars out of any legend

    n_features = len(features_and_their_importances)
    sorted_features = sorted(features_and_their_importances,
                             key=features_and_their_importances.get,
                             reverse=True)
    sorted_importances = [features_and_their_importances[f] for f in sorted_features]

    plt.bar(range(n_features), sorted_importances,
            color=bar_color, align=align, label=label)
    plt.xticks(range(n_features), sorted_features, rotation=90)
    # leave one bar-width of padding on each side
    plt.xlim([-1, n_features])
    plt.ylabel(ylabel, fontsize=fontsize)
    plt.tick_params('both', length=maj_tick_len, width=axis_width,
                    which='major', right=True, top=True)
    plt.xticks(fontsize=fontsize)
    plt.yticks(fontsize=fontsize)
    plt.tight_layout()

    # plt.tight_layout() returns None, so returning its result handed callers
    # nothing; return the axes that were drawn on instead
    return plt.gca()

In [None]:
# pair each feature name with its forest importance
features_and_their_importances = dict(zip(features, rf_importances))

# default-sized figure with a single axes for the bar chart
fig, ax = plt.subplots()
ax = plot_importances(features_and_their_importances)

# **Hands-on**: Are these reliable??

## Guidelines:
- include a random variable among your features and repeat this process
- train, compute importances, plot

In [None]:
# seed numpy's global generator so the random column is the same on every run
np.random.seed(44)

# one uniform draw per row of df, then re-derive the training subset so that it
# carries the new column
df['random'] = np.random.rand(df.shape[0])
df_train = df[df.is_train == 1]


In [None]:
df.head()

In [None]:
# Appending in place (features += [...]) added another 'random' entry every time
# this cell was re-run; guard it so the cell is idempotent.
if 'random' not in features:
    features = features + ['random']

In [None]:
# rebuild the training matrix from the current feature list and refit
X_train = df_train[features].to_numpy()

rf2 = RandomForestClassifier(random_state=44)
rf2.fit(X_train, y_train)

In [None]:
features_and_their_importances2 = dict(zip(features, rf2.feature_importances_))

In [None]:
# impurity-based importances of the forest refitted with the extra feature
fig, ax = plt.subplots(figsize=(5, 2.5))
ax = plot_importances(features_and_their_importances2)

# Concerning! Let's discuss another way to compute importances

# OK, now let's give permutation importances a try

In [None]:
pm_importances = permutation_importance(rf2, X_train, y_train, random_state=44)['importances_mean']

In [None]:
features_to_pm_importances = dict(zip(features, pm_importances))

In [None]:
# permutation importances computed on the training data
fig, ax = plt.subplots(figsize=(5, 2.5))
ax = plot_importances(features_to_pm_importances)

# Looks better, but still makes me a bit uncomfortable!
- what is this telling us?
- what's something else we can try?

In [None]:
# rows flagged is_train == -1 are used as the test set
is_test_row = df.is_train == -1
X_test = df.loc[is_test_row, features].values
y_test = df.loc[is_test_row, 'exp_label'].values

# mean permutation importance per feature on the test rows
pm_result_test = permutation_importance(rf2, X_test, y_test, random_state=44, n_repeats=5)
pm_importances_test = pm_result_test['importances_mean']
features_to_pm_importances_test = dict(zip(features, pm_importances_test))

In [None]:
# permutation importances computed on the test rows
fig, ax = plt.subplots(figsize=(5, 2.5))
ax = plot_importances(features_to_pm_importances_test)