# 1. Regression

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
abalone = fetch_ucirepo(id=1)

# data (as pandas dataframes)
X = abalone.data.features
y = abalone.data.targets

# metadata
print(abalone.metadata)

# variable information
print(abalone.variables)


{'uci_id': 1, 'name': 'Abalone', 'repository_url': 'https://archive.ics.uci.edu/dataset/1/abalone', 'data_url': 'https://archive.ics.uci.edu/static/public/1/data.csv', 'abstract': 'Predict the age of abalone from physical measurements', 'area': 'Biology', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Tabular'], 'num_instances': 4177, 'num_features': 8, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['Rings'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C55C7W', 'creators': ['Warwick Nash', 'Tracy Sellers', 'Simon Talbot', 'Andrew Cawthorn', 'Wes Ford'], 'intro_paper': None, 'additional_info': {'summary': 'Predicting the age of abalone from physical measurements.  The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- 

In [3]:
#import libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 24) #train test split

#feature encoding
encoder = OneHotEncoder() #define encoder
encoded_X_train_feat = encoder.fit_transform(X_train.iloc[:,[0]]).toarray() #fit and transform the first feature in the train set
encoded_X_test_feat = encoder.transform(X_test.iloc[:,[0]]).toarray() #fit and transform the first feature in the test set

encoded_X_train = np.concatenate((encoded_X_train_feat, np.array(X_train)[:,1:]), axis=1) #replace the original feature with the encoded feature in the train set
encoded_X_test = np.concatenate((encoded_X_test_feat, np.array(X_test)[:,1:]), axis=1) #replace the original feature with the encoded feature in the test set

#feature scaling
scaler = StandardScaler() #define scaler
scaled_X_train = scaler.fit_transform(encoded_X_train) #normalize the features in the train set
scaled_X_test = scaler.transform(encoded_X_test) #normalize the features in the test set
scaled_X_train #display result

array([[-0.68028273,  1.47099482, -0.76234128, ..., -1.41822336,
        -1.38337267, -1.50833708],
       [-0.68028273, -0.67981205,  1.31174847, ...,  0.43132268,
         0.16854062,  0.1184727 ],
       [-0.68028273, -0.67981205,  1.31174847, ...,  1.39523362,
         0.99683452,  1.16686122],
       ...,
       [-0.68028273,  1.47099482, -0.76234128, ..., -1.51886371,
        -1.58361955, -1.63125159],
       [ 1.46997706, -0.67981205, -0.76234128, ..., -0.56837149,
        -0.28201485, -0.49609988],
       [-0.68028273,  1.47099482, -0.76234128, ..., -1.317583  ,
        -1.18767686, -1.44687982]])

In [4]:
from sklearn.metrics import mean_absolute_error #, root_mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

regr = svm.SVR(kernel="poly", C=100, gamma="auto", degree=2, epsilon=0.1, coef0=1)
regr.fit(scaled_X_train, y_train)
y_pred = regr.predict(scaled_X_test)

mean_absolute_error(np.round(y_pred), y_test), np.mean(((np.round(y_pred) - np.array(y_test).ravel())**2))**0.5

  y = column_or_1d(y, warn=True)


(1.4425837320574162, 2.256304299271065)

# 3. Clustering

In [50]:
import pandas as pd

data = pd.read_csv('./ObesityDataSet.csv')

In [51]:
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [54]:
X, y = data.iloc[:,:-1], data.iloc[:,[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 24) #train test split

object_mask = X_train.dtypes == 'object'
float_mask = X_train.dtypes == np.float64

#feature encoding
encoder = OneHotEncoder()
encoded_X_train = encoder.fit_transform(X_train[X_train.columns[object_mask]]).toarray()
encoded_X_test = encoder.transform(X_test[X_test.columns[object_mask]]).toarray()

#feature scaling
scaled_X_train = scaler.fit_transform(X_train[X_train.columns[float_mask]])
scaled_X_test = scaler.transform(X_test[X_test.columns[float_mask]])

processed_X_train = np.concatenate([encoded_X_train, scaled_X_train], axis=1)
processed_X_test = np.concatenate([encoded_X_test, scaled_X_test], axis=1)

#label encoding
encoder = OrdinalEncoder()
encoded_y_train = encoder.fit_transform(y_train)
encoded_y_test = encoder.transform(y_test)

processed_X_train

array([[ 0.        ,  1.        ,  0.        , ...,  0.31975174,
         1.13927071, -1.0945577 ],
       [ 1.        ,  0.        ,  0.        , ..., -1.03124695,
        -1.04152247,  0.54020994],
       [ 1.        ,  0.        ,  0.        , ..., -0.93956897,
        -0.0783783 ,  0.27676223],
       ...,
       [ 0.        ,  1.        ,  1.        , ..., -0.02486672,
         2.31881707, -1.0945577 ],
       [ 0.        ,  1.        ,  0.        , ..., -0.26334118,
        -0.02588029, -1.0945577 ],
       [ 0.        ,  1.        ,  1.        , ..., -0.02486672,
         1.14779691, -1.0945577 ]])

In [57]:
encoded_y_train

array([[2.],
       [2.],
       [2.],
       ...,
       [1.],
       [5.],
       [2.]])

In [43]:
processed_X_train.shape, processed_X_test.shape

((1688, 31), (423, 31))

In [62]:
from sklearn.cluster import KMeans

#fit k-means clustering
kmeans = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(processed_X_train)

In [75]:
centers = kmeans.cluster_centers_ #centroid points
closest_cluster = [] #list to save the index of closest clusters for each sample

#iterate over each sample in processed_X_test
for i in range(processed_X_test.shape[0]):
  #find closest centroid point by calculating the euclidean distance and finding the smallest valued index.
  idx = np.argmin([np.linalg.norm(processed_X_test[i] - c) for c in centers])
  #append the closest centroid point to the 'closest_cluster' list
  closest_cluster.append(idx)

closest_cluster[:5]

[1, 2, 2, 3, 0]

In [87]:
y_pred = [] #predict label list

#iterate over each closest cluster index of samples
for cluster_idx in closest_cluster:
  y_list = encoded_y_train[kmeans.labels_ == cluster_idx].ravel() #labels of points associated with a cluster indexed 'cluster_idx'
  value, counts = np.unique(y_list, return_counts=True) #count the number of each labels in the particular cluster
  idx = np.argmax(counts) #index of the largest count
  y_pred.append(value[idx]) #append the most frequent label to the y_pred list

In [91]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, encoded_y_test)

0.3120567375886525

In [117]:
def KMeans_pred(k=4):
  kmeans = KMeans(n_clusters=k, random_state=0, n_init="auto").fit(processed_X_train)

  centers = kmeans.cluster_centers_ #centroid points
  closest_cluster = [] #list to save the index of closest clusters for each sample

  #iterate over each sample in processed_X_test
  for i in range(processed_X_test.shape[0]):
    #find closest centroid point
    idx = np.argmin([np.linalg.norm(processed_X_test[i] - c) for c in centers])
    #append the closest centroid point to the 'closest_cluster' list
    closest_cluster.append(idx)

  y_pred = [] #predict label list

  #iterate over each closest cluster index of samples
  for cluster_idx in closest_cluster:
    y_list = encoded_y_train[kmeans.labels_ == cluster_idx].ravel() #labels of points associated with a cluster indexed 'cluster_idx'
    value, counts = np.unique(y_list, return_counts=True) #count the number of each labels in the particular cluster
    idx = np.argmax(counts) #index of the largest count
    y_pred.append(value[idx]) #append the most frequent label to the y_pred list

  print("Accuracy:", accuracy_score(y_pred, encoded_y_test))

  correctly_classified_num = sum(encoded_y_test == np.array(y_pred).reshape(-1,1))
  print("Number of correctly classified samples:", correctly_classified_num)

In [118]:
KMeans_pred(k=4)

Accuracy: 0.3120567375886525
Number of correctly classified samples: [132]


In [119]:
KMeans_pred(k=5)

Accuracy: 0.40425531914893614
Number of correctly classified samples: [171]


In [120]:
KMeans_pred(k=6)

Accuracy: 0.408983451536643
Number of correctly classified samples: [173]


In [121]:
KMeans_pred(k=7)

Accuracy: 0.408983451536643
Number of correctly classified samples: [173]


In [122]:
KMeans_pred(k=8)

Accuracy: 0.4326241134751773
Number of correctly classified samples: [183]


In [123]:
KMeans_pred(k=9)

Accuracy: 0.41843971631205673
Number of correctly classified samples: [177]


In [124]:
KMeans_pred(k=10)

Accuracy: 0.42080378250591016
Number of correctly classified samples: [178]


Optimal value of k = 8