### SKLearn Dataset Loading

In [1]:
from sklearn.datasets import load_iris

# Load the Iris dataset once and pull out the pieces used by the later cells:
# the feature matrix, the class labels, and the names of both.
iris = load_iris()
X, y = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

# Show the names, then only a short preview of the feature matrix
# (the first 10 rows) rather than the whole array.
print("Feature names:", feature_names)
print("Target names:", target_names)
print("\nFirst 10 rows of X:\n", X[:10])



Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target names: ['setosa' 'versicolor' 'virginica']

First 10 rows of X:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]]


### Splitting Dataset in SKLearn

In [2]:
from sklearn.model_selection import train_test_split

# Keep 70% of the samples for training and hold out 30% for testing.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Number of rows (first dimension) of each array, before and after the split.
n_X, n_X_train, n_X_test = X.shape[0], X_train.shape[0], X_test.shape[0]
n_y, n_y_train, n_y_test = y.shape[0], y_train.shape[0], y_test.shape[0]

print(f'X.shape: {n_X}')
print(f'X_train.shape: {n_X_train}')
print(f'X_test.shape: {n_X_test}')
print(f'y.shape: {n_y}')
print(f'y_train.shape: {n_y_train}')
print(f'y_test.shape: {n_y_test}')

X.shape: 150
X_train.shape: 105
X_test.shape: 45
y.shape: 150
y_train.shape: 105
y_test.shape: 45


### Train the Model in SKLearn

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Fit a 3-nearest-neighbours classifier on the training split, then predict
# the labels of the held-out test samples.
classifier_knn = KNeighborsClassifier(n_neighbors = 3)
classifier_knn.fit(X_train, y_train)
y_pred = classifier_knn.predict(X_test)

# Accuracy: compare the actual labels (y_test) with the predicted labels (y_pred)
print(f'Accuracy: {metrics.accuracy_score(y_test, y_pred)}')

# Two hand-written samples; each has four values, matching the four feature
# columns the classifier was trained on.
prediction_data = [[5, 5, 3, 2], [2, 4, 3, 5]]

preds = classifier_knn.predict(prediction_data)

# Map the predicted class indices to species names in one indexing step.
# .tolist() turns the NumPy string scalars into plain Python str, so the
# printed list reads ['versicolor', ...] instead of [np.str_('versicolor'), ...]
# on NumPy >= 2 (the output on older NumPy is unchanged).
pred_species = iris.target_names[preds].tolist()
print(f'Predictions: {pred_species}')

Accuracy: 0.9777777777777777
Predictions: ['versicolor', 'virginica']


### Model Persistence

In [4]:
import os

import joblib

# Single definition of where the classifier is stored, used for both the
# dump and the load below.
iris_model_path = 'models/iris_classifier_knn.joblib'

# joblib.dump does not create missing directories; without this the cell
# raises FileNotFoundError on a fresh checkout where models/ does not exist.
os.makedirs(os.path.dirname(iris_model_path), exist_ok=True)

# Dump/store the model (classifier_knn)
joblib.dump(classifier_knn, iris_model_path)

# Load the stored model back from disk.
# NOTE: joblib files are pickle-based, so loading one can execute arbitrary
# code -- only load model files that come from a trusted source.
loaded_model = joblib.load(iris_model_path)

# Use the loaded model for another prediction
another_prediction_data = [[4, 3.5, 1.3, 1], [4.1, 2.6, 1.4, 0.3]]
new_preds = loaded_model.predict(another_prediction_data)

# Map class indices to species names; .tolist() yields plain Python str so
# the printed list looks the same on NumPy 1.x and NumPy >= 2.
new_pred_species = iris.target_names[new_preds].tolist()
print(f'Predictions: {new_pred_species}')

Predictions: ['setosa', 'setosa']


### XGBoost Dataset Loading

In [5]:
from numpy import loadtxt
from xgboost import DMatrix

# Read the comma-separated diabetes file into a 2-D array
dataset = loadtxt('../datasets/pima_indians_diabetes.csv', delimiter=',')

# The first eight columns are the input features, column 8 is the label.
# Note: X (and the split variables below) replace the iris arrays of the
# same names defined in the earlier cells.
X, Y = dataset[:, :8], dataset[:, 8]

# Hold out 33% of the rows for testing, with a fixed seed for a repeatable split.
# train_test_split comes from the import in the earlier dataset-splitting cell.
seed, test_size = 7, 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

# Wrap each split, together with its labels, in a DMatrix for XGBoost
dtrain, dtest = DMatrix(X_train, label=y_train), DMatrix(X_test, label=y_test)

### Build (XGBoost) Model

In [6]:
from xgboost import train
from sklearn.metrics import accuracy_score

num_round = 100  # Number of boosting rounds

# Booster settings
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss'         # Logarithmic loss metric
}

# Train a booster on the training DMatrix
model = train(params, dtrain, num_boost_round=num_round)

print(f'Model: {model}')

# Score the test data; each score is rounded to the nearest integer to get
# a 0/1 class label.
pred = model.predict(dtest)
predictions = list(map(round, pred))
print(f'Predictions: {predictions}')

# Share of test rows whose rounded prediction equals the true label, as a percentage
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Model: <xgboost.core.Booster object at 0x12cf0b530>
Predictions: [0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1]
Accuracy: 72.83%


### Save the Model

In [7]:
import os

# save_model does not create missing directories; make sure models/ exists
# so this cell also works on a fresh checkout.
os.makedirs('models', exist_ok=True)

# NOTE(review): XGBoost chooses the on-disk format from the file extension
# (.json / .ubj) and this name has none -- consider adding one, after
# checking whether anything else loads the model under the current name.
model.save_model('models/pima_indians_diabetes_xgboost_binary_logistic')



### If an Error Happens

In [8]:
# Troubleshooting for the error message: You are running 32-bit Python on a 64-bit OS

import platform
import struct

# Check which architecture the running Python interpreter reports
print(platform.architecture())

# Or: the size of a C pointer ("P") in bytes, times 8, gives the interpreter's bitness
pointer_bits = struct.calcsize("P") * 8
print(pointer_bits)

# To solve the problem, run `brew install libomp` in your terminal to install libomp


('64bit', '')
64
