In [1]:
# import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split  
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# load the dataset into a pandas DataFrame
df = pd.read_csv('dermatology.csv', delimiter='\t')

# convert 'Age' column to numeric for prepare to categorization
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# categorize the 'Age' column as age is continuous
for i in range(len(df)):
    age = df.iloc[i]['Age']
    
    if age == '?':
        df.iloc[i, df.columns.get_loc('Age')] = None 
        continue
        
    category = 0
    if 18 < age <= 35:
        category = 1
    elif age <= 50:
        category = 2
    elif age <= 65:
        category = 3
    elif age <= 125:
        category = 4
    df.iloc[i, df.columns.get_loc('Age')] = category

# clean the dataset for missing and NaN values
df = df.dropna()

# display the first five data entries
print(df.head())

   Erythema  Scathing  Definite Borders  Itching  Koebner   Polygonal  \
0         2         2                 0        3         0          0   
1         3         3                 3        2         1          0   
2         2         1                 2        3         1          3   
3         2         2                 2        0         0          0   
4         2         3                 2        2         2          2   

   Follicular  Oral  Knee  Scalp  ...  Disapperance  Vacuolisation  \
0           0     0     1      0  ...             0              0   
1           0     0     1      1  ...             0              0   
2           0     3     0      0  ...             0              2   
3           0     0     3      2  ...             3              0   
4           0     2     0      0  ...             2              3   

   Spongiosis  Retes  Follicular.1  Perifollicular  Inflamatory  Band-like  \
0           3      0             0               0            

In [3]:
# define the independent (feature) variables
X = df.iloc[:, :-1].values

# define the dependent (target) variable
y = df.iloc[:, -1].values

# split the dataset into 70% training and 30% testing subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [4]:
# initialize a Random Forest Classifier with 50 trees and default tree depth
rfc = RandomForestClassifier(n_estimators=50, random_state=42)

# train the Random Forest model on the training data
rfc.fit(X_train, y_train)

In [5]:
# make predictions for the test set
predictions = rfc.predict(X_test)

# calculate and print the accuracy of the model
print("Accuracy Score:", accuracy_score(y_test, predictions))

# calculate and print the confusion matrix from the predictions of the model
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

Accuracy Score: 0.9636363636363636
Confusion Matrix:
 [[40  0  0  0  0  0]
 [ 0 16  0  0  0  0]
 [ 0  0 19  0  0  0]
 [ 0  4  0 11  0  0]
 [ 0  0  0  0 16  0]
 [ 0  0  0  0  0  4]]
