In [2]:
# Import train_test_split function
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

## Getting and preparing the data

In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
df = pd.read_csv(url, header=None)
# set a seed value
np.random.seed(123) 
# reduce size dataset
df = df.sample(n=10000) 

X = df.iloc[:, :-1]
y = df.iloc[:, -1] - 1   #-1 so that we have classes ranging 0-6


# Split dataset into training set and test set
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2) 
X, X_calib, y, y_calib = train_test_split(X, y, test_size=0.2) 

In [4]:
print(df)

          0    1   2    3    4     5    6    7    8     9   ...  45  46  47  \
257726  2924   48  15  420   69   666  224  207  114   900  ...   1   0   0   
407572  2857  185   8   95   12  2195  222  246  158  2340  ...   0   0   0   
552796  3410  208  13  702    6  1421  211  251  174  4001  ...   0   1   0   
389602  2798   66  17  391   68  2731  234  204   96  1909  ...   0   0   0   
509827  2970   40  12  313  107  2456  221  213  124  2174  ...   0   1   0   
...      ...  ...  ..  ...  ...   ...  ...  ...  ...   ...  ...  ..  ..  ..   
225636  2804   24  13  361   78  1575  213  211  133  1452  ...   0   0   0   
460540  3229  253  13   90    5  3145  191  248  195  1249  ...   0   0   0   
237930  3130   76  13  182  -44  1460  236  216  109  1396  ...   0   0   0   
281956  2362  111  38  212  115  1587  253  171    9   342  ...   0   0   0   
361638  3266   28   7  360  -12  1554  218  225  143  1613  ...   0   0   0   

        48  49  50  51  52  53  54  
257726   0   0

In [5]:
print(f"{len(X)} samples in training set, {len(X_test)} samples in test set.")

6400 samples in training set, 2000 samples in test set.


## Fitting the Random Forest Classifier

In [6]:
clf = RandomForestClassifier(max_depth=2, n_estimators=100, random_state=0)
clf.fit(X, y)
y_test_hat = clf.predict(X_test)

## Making prediction for calibration set, calculating si and q-hat

In [7]:
predictions = clf.predict_proba(X_calib)
si = 1 - predictions[np.arange(len(y_calib)),y_calib] 
qhat = np.quantile(si, 1 - .2)  # we ignore (n+1)/n

In [8]:
print(np.around(predictions[:5], decimals=3))

[[0.307 0.498 0.109 0.004 0.024 0.031 0.028]
 [0.286 0.626 0.032 0.002 0.015 0.017 0.022]
 [0.48  0.432 0.026 0.002 0.011 0.013 0.036]
 [0.324 0.542 0.052 0.003 0.021 0.029 0.029]
 [0.45  0.441 0.036 0.002 0.013 0.018 0.04 ]]


In [9]:
print(si[0:5])


[0.8912802  0.37440534 0.52013382 0.45770397 0.55043747]


## Evaluating using test set

In [11]:
print(np.around(clf.predict_proba(X_test)[:5], decimals=3))

[[0.337 0.529 0.053 0.003 0.019 0.028 0.031]
 [0.175 0.362 0.269 0.026 0.014 0.137 0.016]
 [0.204 0.374 0.239 0.026 0.013 0.124 0.018]
 [0.435 0.444 0.044 0.003 0.014 0.021 0.039]
 [0.434 0.39  0.03  0.002 0.012 0.015 0.117]]


In [12]:
prediction_sets = (clf.predict_proba(X_test) >= 1 - qhat) 
print(prediction_sets[:5])

[[False  True False False False False False]
 [False  True False False False False False]
 [False  True False False False False False]
 [ True  True False False False False False]
 [ True  True False False False False False]]


In [13]:
indices = np.argwhere(prediction_sets == True).tolist()

# Convert the indices to a list
indices_list = [[] for _ in range(len(prediction_sets))]
for index in indices:
    indices_list[index[0]].append(index[1])

In [15]:
scores = 0
for i in range(len(y_test)):
    prediction_set = indices_list[i]
    true_target  = y_test.values[i]
    score = y_test.values[i] in indices_list[i]
    scores += score
    print(indices_list[i], y_test.values[i], y_test.values[i] in indices_list[i])
final_score = scores / len(y_test)
print(f"Accuracy of {final_score * 100}%")

[1] 1 True
[1] 2 False
[1] 2 False
[0, 1] 0 True
[0, 1] 6 False
[0, 1] 0 True
[0, 1] 1 True
[0, 1] 0 True
[1] 2 False
[1] 1 True
[0, 1] 0 True
[0, 1] 0 True
[1] 1 True
[0, 1] 0 True
[0, 1] 0 True
[0, 1] 1 True
[1] 4 False
[0, 1] 0 True
[0, 1] 1 True
[0, 1] 1 True
[0, 1] 6 False
[0, 1] 0 True
[1] 1 True
[0, 1] 1 True
[1] 1 True
[0, 1] 0 True
[1] 1 True
[0, 1] 0 True
[1] 1 True
[1] 1 True
[0, 1] 0 True
[1] 1 True
[0, 1] 0 True
[0, 1] 0 True
[1] 5 False
[0, 1] 1 True
[0, 1] 1 True
[0, 1] 0 True
[1] 1 True
[1] 2 False
[0, 1] 0 True
[1] 1 True
[1] 1 True
[0, 1] 1 True
[1] 1 True
[1] 0 False
[1] 5 False
[1] 1 True
[1] 0 False
[1] 1 True
[1] 0 False
[0, 1] 0 True
[0, 1] 0 True
[1] 5 False
[0, 1] 0 True
[0, 1] 1 True
[1] 1 True
[0, 1] 1 True
[0, 1] 0 True
[1] 1 True
[1] 0 False
[1] 1 True
[0, 1] 0 True
[1] 1 True
[0, 1] 6 False
[0, 1] 1 True
[1] 1 True
[1] 1 True
[1] 1 True
[0, 1] 0 True
[0, 1] 0 True
[1] 2 False
[1] 1 True
[0, 1] 1 True
[1] 5 False
[1] 1 True
[1] 2 False
[0, 1] 1 True
[0, 1] 