# 1. Import Libraries 

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

%load_ext autoreload
%autoreload 2

In [2]:
train_csv_path = r'..\data\train_test_csv\train_data.csv'
test_csv_path = r'..\data\train_test_csv\test_data.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Display the number of imported images
print(f"Number of training images: {len(train_df)}")
print(f"Number of test images: {len(test_df)}")

Number of training images: 2641
Number of test images: 294


In [3]:
train_df.Label.value_counts()

Label
airplanes            720
Motorbikes           718
BACKGROUND_Google    420
Faces_easy           392
Faces                391
Name: count, dtype: int64

In [4]:
test_df.Label.value_counts()

Label
Motorbikes           80
airplanes            80
BACKGROUND_Google    47
Faces                44
Faces_easy           43
Name: count, dtype: int64

In [5]:
train_df.head()

Unnamed: 0,HOG_0,HOG_1,HOG_2,HOG_3,HOG_4,HOG_5,HOG_6,HOG_7,HOG_8,HOG_9,...,HOG_8091,HOG_8092,HOG_8093,HOG_8094,HOG_8095,HOG_8096,HOG_8097,HOG_8098,HOG_8099,Label
0,0.038366,0.034774,0.031683,0.10324,0.138038,0.176418,0.246472,0.246472,0.158125,0.025145,...,0.035912,0.016476,0.035359,0.14758,0.306789,0.306789,0.164016,0.039841,0.038114,BACKGROUND_Google
1,0.241152,0.037368,0.034159,0.025061,0.070951,0.01441,0.003767,0.001928,0.178829,0.337243,...,0.118079,0.005067,0.0073,0.002927,0.005443,0.00369,0.009717,0.003888,0.121595,Motorbikes
2,0.140515,0.0,0.0,0.0,0.000891,0.0,0.0,0.0,0.14055,0.159198,...,0.375181,0.183314,0.098506,0.060341,0.084255,0.042076,0.144795,0.156756,0.375181,airplanes
3,0.379993,0.030309,0.008011,0.000308,0.014956,0.006318,0.015724,0.045974,0.379993,0.379993,...,0.349345,0.023273,0.001012,0.000892,0.031166,0.0,2.3e-05,0.008693,0.349345,Motorbikes
4,0.307204,0.083733,0.018682,0.060296,0.079989,0.028442,0.012159,0.041207,0.307204,0.307204,...,0.268115,0.033532,0.041168,0.218352,0.248037,0.221883,0.268115,0.214999,0.268115,airplanes


In [6]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_df['Label'] = label_encoder.fit_transform(train_df['Label'])
test_df['Label'] = label_encoder.transform(test_df['Label'])

# 2. Creating Labelled Unlabelled Data

In [7]:
from sklearn.model_selection import train_test_split

def partition_and_make_unlabeled(df, test_size=0.5, random_state=None):
    labeled_df1, labeled_df2 = train_test_split(df, test_size=test_size, stratify=df['Label'], random_state=random_state)
    unlabeled_df = labeled_df2.drop('Label', axis=1)
    
    return labeled_df1, labeled_df2, unlabeled_df

In [8]:
labeled_df1, labeled_df2, unlabeled_df = partition_and_make_unlabeled(train_df, test_size=0.80, random_state=42)

In [9]:
unlabeled_df.head()

Unnamed: 0,HOG_0,HOG_1,HOG_2,HOG_3,HOG_4,HOG_5,HOG_6,HOG_7,HOG_8,HOG_9,...,HOG_8090,HOG_8091,HOG_8092,HOG_8093,HOG_8094,HOG_8095,HOG_8096,HOG_8097,HOG_8098,HOG_8099
1485,0.158628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156872,0.180218,...,0.496588,0.496588,0.026402,0.020954,0.004773,0.010718,0.002488,0.004995,0.006333,0.496588
2620,0.037935,0.0,0.0,0.0,0.026298,0.0,0.0,0.0,0.037935,0.018102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2064,0.029575,0.010799,0.002153,0.012303,0.032226,0.000376,0.001213,0.000133,0.01478,0.100381,...,0.120798,0.02641,0.005916,0.001769,0.0,0.05448,0.0,0.003855,0.001288,0.026359
743,0.143465,0.266407,0.266407,0.266407,0.093926,0.047119,0.015492,0.037342,0.083198,0.142346,...,0.141935,0.168858,0.100689,0.041478,0.055842,0.149941,0.128183,0.138132,0.206807,0.250292


# 3. Model Builiding

#### Training Forest and Trees

In [10]:
def train_rf(X, n_estimators=100):
    rf = RandomForestClassifier(n_estimators=n_estimators, criterion='gini', max_depth=5, oob_score = True, random_state = 0)
    rf.fit(X.iloc[:,:-1].values, X.iloc[:,-1].values)
    return rf

In [11]:
def train_tree(X):
    dt = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state = 0)
    dt.fit(X.iloc[:,:-1].values, X.iloc[:,-1].values)
    return dt

# Optimization

In [12]:
def compute_p_star(num_classes):
    return np.ones(num_classes) / num_classes

In [13]:
def draw_random_label(p_star, Xu):
    num_samples = len(Xu)
    labels = np.random.choice(len(p_star), size=num_samples, p=p_star)
    return labels

### Oob error

In [14]:
def oobe(F, X, m):
    oob_error = 1 - F.oob_score_
    return oob_error

### Temperature function 

### Stopping Criteria

In [15]:
def stopping_condition(emF, e0F, m, max_epochs):
    return emF > e0F or m >= max_epochs

### Forest Enhancement

In [16]:
def semi_supervised_random_forests(Xl, Xu, num_trees, max_epochs, alpha=1, T=1, cooling_parameter=1, start_value=1):
    F = train_rf(Xl, num_trees)
    m = 0
    e0F = oobe(F, Xl, m)  
    emF = 0
    while True:
        m += 1
        p_star = compute_p_star(len(np.unique(Xl.Label)))
        for i in range(num_trees):
            y_hat_u = draw_random_label(p_star, Xu)
            Xu_ = Xu.copy()
            Xu_['Label'] = y_hat_u
            Xn = pd.concat([Xl, Xu_], ignore_index=True)
            F.estimators_[i] = train_tree(Xn)
        emF = oobe(F, Xl, m)
        if stopping_condition(emF, e0F, m, max_epochs): 
            break
        e0F = emF
    return F

# 4. Model Training

In [17]:
Xl = labeled_df1.copy()
Xu = unlabeled_df.copy()

num_trees = 50
max_epochs = 10

semi_supervised_forest = semi_supervised_random_forests(Xl, Xu, num_trees, max_epochs, alpha, T, cooling_parameter, start_value)

NameError: name 'alpha' is not defined

# 5. Model Evaluation 

In [None]:
y_pred = semi_supervised_forest.predict(test_df.iloc[:,:-1].values)
y_true = test_df.values[:,-1]
y_true = y_true.astype(int)
y_pred = y_pred.astype(int)

In [None]:
labels_num = label_encoder.inverse_transform(test_df['Label'])
labels = np.unique(labels_num)
labels = list(labels)

In [None]:
def plot_confusion_matrix(y_true, y_pred, label_text, label_num):
    cm = confusion_matrix(y_true, y_pred, labels=label_num)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=label_text, yticklabels=label_text)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix for Semi-supervised Random Forest')
    plt.show()

plot_confusion_matrix(y_true, y_pred, labels, np.unique(train_df.values[:,-1]))

In [None]:
def metrics(y_true, y_pred):

    confusion_matrix = np.zeros((y_true.max() + 1, y_true.max() + 1))
    for i in range(len(y_true)):
        confusion_matrix[y_true[i], y_pred[i]] += 1

    # precision and recall
    precision = []
    recall = []
    for i in range(y_true.max() + 1):
        tp = confusion_matrix[i, i]
        fp = np.sum(confusion_matrix[:, i]) - tp
        fn = np.sum(confusion_matrix[i, :]) - tp
        precision.append(tp / (tp + fp))
        recall.append(tp / (tp + fn))
    precisions = np.mean(precision)
    recalls = np.mean(recall)

    # Calculating accuracy
    accuracy = np.trace(confusion_matrix) / np.sum(confusion_matrix)

    # Calculating F1 score
    f1_score = 2 * (precisions * recalls) / (precisions + recalls) if (precisions + recalls) != 0 else 0

    return precisions, recalls, precision, recall, accuracy, f1_score

precision , recall, precision_list, recall_list, accuracy, f1_score = metrics(y_true, y_pred)

In [None]:
test_size = len(test_df)
data = {
    'Dataset': ['Caltech-101'],
    'Test Size': [test_size]
}

df = pd.DataFrame(data)

In [None]:
df['Precision'] = precision
df['Recall'] = recall
df['Accuracy'] = accuracy
df['F1 Score'] = f1_score
df

In [None]:
df2 = pd.DataFrame()
df2["Category"] = labels
df2["Precision"] = precision_list
df2["Recall"] = recall_list
df2