In [3]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


In [4]:
from ucimlrepo import fetch_ucirepo
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

**Erasing colinearities**

In [5]:
import heapq

In [39]:
def test_collinearity(columns, indices, threshold = 1e-10):
  """
  Helper function, testing whether a certain subset of columns is collinear.
  :param columns: the whole set of columns.
  :param indices: indices belonging to the subset.
  :param threshold: the value of determinant, going below which will be
    considered being numerically collinear.
  :return: True if collinear, False otherwise.
  """
  used_columns = []
  for index in indices:
    used_columns.append(columns[index])

  X = np.column_stack(used_columns)
  XX = X.transpose() @ X
  if np.linalg.det(XX) < threshold:
    return True
  else:
    return False

def remove_collinear(X):
  """
  Removes the minimum number of columns to ensure the result matrix will be
  full rank.
  :param X: a numpy matrix one needs a non-collinear version of.
  :return: a numpy matrix with collinearities removed and a set containing
    indices of removed columns.
  """
  columns = []
  p = len(X[0])
  for i in range(p):
    columns.append(X[:,i])

  columns_used = []
  columns_stashed = set()
  columns_removed = set()
  for i in range(p):
    columns_used.append(i)

  heapq.heapify(columns_used)

  last_removed = -1
  while(True):
    if len(columns_used) == 0:
      break

    if test_collinearity(columns, columns_used):
      last_removed = heapq.heappop(columns_used)
      columns_stashed.add(last_removed)
    else:
      if last_removed == -1:
        # If the whole remaining subset is non-collinear, it's time to stop.

        break
      else:
        # If removing a certain column made the subset non-collinear, it means
        # that this column is a good candidate for removal.

        columns_stashed.remove(last_removed)
        columns_removed.add(last_removed)

        # Returning stashed away columns back to the subset.
        for index in columns_stashed:
          columns_used.append(index)
        heapq.heapify(columns_used)
        columns_stashed.clear()
        last_removed = -1

  # Recreating the matrix
  is_used = [False for i in range(p)]
  for index in columns_used:
    is_used[index] = True

  columns_used = []
  for i in range(p):
    if is_used[i]:
      columns_used.append(columns[i])

  X_clean = np.column_stack(columns_used)
  return X_clean, columns_removed


**Heart Disease dataset** (https://archive.ics.uci.edu/dataset/45/heart+disease)

In [41]:
# Let's perform some EDA:

# fetch dataset
data_heart_disease = fetch_ucirepo(id=45)

# data (as pandas dataframes)
X = data_heart_disease.data.features
y = data_heart_disease.data.targets

print(X.dtypes)

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca          float64
thal        float64
dtype: object


In [42]:
# Checking the amount of missing values.
print('missing values\n')
for name in X.columns:
  print(name+': '+str(X[name].isnull().sum()))

print('\nAmount of data points:',len(X))

missing values

age: 0
sex: 0
cp: 0
trestbps: 0
chol: 0
fbs: 0
restecg: 0
thalach: 0
exang: 0
oldpeak: 0
slope: 0
ca: 4
thal: 2

Amount of data points: 303


As one can see, the number of missing values is miniscule compared to the amount of data points we have, so not to complicate stuff too much, we can just remove the incomplete observations if need be.

In [43]:
# Checking the number of categories for each variable.
print('different values present\n')
for name in X.columns:
  print(name+': '+str(X[name].nunique()))

different values present

age: 41
sex: 2
cp: 4
trestbps: 50
chol: 152
fbs: 2
restecg: 3
thalach: 91
exang: 2
oldpeak: 40
slope: 3
ca: 4
thal: 3


According to the https://archive.ics.uci.edu/dataset/45/heart+disease page, 'cp', 'restecg', 'slope' and 'thal' are categorical variables that have more than 2 possible values, so they should be one-hot-encoded before building the model.

Finally, looking at the target variable:

In [44]:
np.unique(y)

array([0, 1, 2, 3, 4])

We can see that there are 5 possible values for that. In accordance to the experiments mentioned on the dataset's page, we will transform the answers column to have two classes:

0 <- no presence of heart disease (original values: 0)

1 <- presence of heart disease (original values: 1, 2, 3 ,4)


So finally, for a function that returns this dataset in a form palatable for model training:

In [45]:
def fetch_heart_disease():
  # fetch dataset
  data_heart_disease = fetch_ucirepo(id=45)

  # data (as pandas dataframes)
  X = data_heart_disease.data.features
  y = data_heart_disease.data.targets

  # removing missing variables
  X = X.dropna()
  y = y.loc[X.index]

  # one-hot-encoding the detected multi-valued categorical variables
  X = pd.get_dummies(X, columns=['cp', 'restecg', 'slope', 'thal'],
                     drop_first=True, dtype=int)

  # changing the format to numpy arrays
  # (flattening is necessary for y as .values isn't smart enough to notice that
  # y had only one column)
  X = X.values
  y = y.values.flatten()

  # mapping the answers to {0,1}
  y = (y==0)
  y = y.astype(int)

  # removing collinearities
  X, _ = remove_collinear(X)

  return X, y

In [46]:
X, y = fetch_heart_disease()

In [47]:
print(len(X[0]))

18


**Parkinsons dataset** (https://archive.ics.uci.edu/dataset/174/parkinsons)

In terms of the EDA, this time everything is noted on the page.
There are no missing values, every feature is continous and the target variable is indeed a {0,1} binary value, so the only thing we need to do is to fetch the dataset and turn it into an appropriate numpy arrays.

In [48]:
def fetch_parkinsons():
  # fetch dataset
  data_parkinsons = fetch_ucirepo(id=174)

  # data (as pandas dataframes)
  X = data_parkinsons.data.features
  y = data_parkinsons.data.targets

  # changing the format to numpy arrays
  # (flattening is necessary for y as .values isn't smart enough to notice that
  # y had only one column)
  X = X.values
  y = y.values.flatten()

  # removing collinearities
  X, _ = remove_collinear(X)

  return X, y

In [49]:
X, y = fetch_parkinsons()

In [51]:
# Some columns have been removed
print(len(X[0]))

16


**HCV dataset** (https://archive.ics.uci.edu/dataset/571/hcv+data)

In [52]:
# Let's perform some EDA:

# fetch dataset
data_hcv = fetch_ucirepo(id=571)

# data (as pandas dataframes)
X = data_hcv.data.features
y = data_hcv.data.targets

print(X.dtypes)
print(y.dtypes)

Age       int64
Sex      object
ALB     float64
ALP     float64
AST     float64
BIL     float64
CHE     float64
CHOL    float64
CREA    float64
CGT     float64
PROT    float64
ALT     float64
dtype: object
Category    object
dtype: object


In [54]:
print(np.unique(X['Sex']))

['f' 'm']


Looking at the types, it's obvious that we'll need to map the 'Sex' column to {0,1}.

In [55]:
# Checking the amount of missing values.
print('missing values\n')
for name in X.columns:
  print(name+': '+str(X[name].isnull().sum()))

print('\nAmount of data points:',len(X))

missing values

Age: 0
Sex: 0
ALB: 1
ALP: 18
AST: 0
BIL: 0
CHE: 0
CHOL: 10
CREA: 0
CGT: 0
PROT: 1
ALT: 1

Amount of data points: 615


It seems that apart from the 'ALP' and 'CHOL' columns, there aren't that many missing values, so the plan seems to be removing the observations, where 'ALB', 'PROT' or 'ALT' is missing and then using some regressor to predict the values of 'ALP' and 'CHOL' when necessary.

According to the https://archive.ics.uci.edu/dataset/571/hcv+data page, there are no categorical features, only the target variable is categorical, so there is no need for one-hot-encoding.

Speaking of the target variable, it has 5 different values:

In [56]:
np.unique(y)

array(['0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis',
       '2=Fibrosis', '3=Cirrhosis'], dtype=object)

Thus we'll group those categories into:

0 <- all blood donors (original values: 0 and 0s)

1 <- all the nasty stuff (orignal values: 1, 2 and 3)

Now for the fetching function:

In [57]:
from sklearn.linear_model import LinearRegression

In [63]:
def fetch_hcv():
  # fetch dataset
  data_hcv = fetch_ucirepo(id=571)

  # data (as pandas dataframes)
  X = data_hcv.data.features
  y = data_hcv.data.targets

  # mapping the 'Sex' column to numeric values
  X.loc[:,'Sex'] = X['Sex'].map({'m': 0, 'f': 1})
  # mapping the target variable to {0,1}
  y.loc[:,'Category'] = y['Category'].map({'0=Blood Donor': 0,
                                           '0s=suspect Blood Donor': 0,
                                           '1=Hepatitis': 1, '2=Fibrosis': 1,
                                           '3=Cirrhosis': 1})

  # removing rare missing values
  X = X.dropna(subset=['ALB','PROT','ALT'])
  y = y.loc[X.index]

  # Regressing for the remaining missing values
  X_for_lr = X.dropna()
  y_for_lr1 = X_for_lr['ALP'].values
  y_for_lr2 = X_for_lr['CHOL'].values
  X_for_lr = X_for_lr.drop(columns=['ALP','CHOL']).values

  lr = LinearRegression()

  lr.fit(X_for_lr, y_for_lr1)
  ALP_missing = X[X['ALP'].isna()]
  data_for_ALP_predicting = ALP_missing.drop(columns=['ALP', 'CHOL']).values
  ALP_predictions = lr.predict(data_for_ALP_predicting)
  X.loc[X['ALP'].isna(), 'ALP'] = ALP_predictions

  lr.fit(X_for_lr, y_for_lr2)
  CHOL_missing = X[X['CHOL'].isna()]
  data_for_CHOL_predicting = CHOL_missing.drop(columns=['ALP', 'CHOL']).values
  CHOL_predictions = lr.predict(data_for_CHOL_predicting)
  X.loc[X['CHOL'].isna(), 'CHOL'] = CHOL_predictions

  # changing the format to numpy arrays
  # (flattening is necessary for y as .values isn't smart enough to notice that
  # y had only one column)
  X = X.values
  y = y.values.flatten()

  # removing collinearities
  X, _ = remove_collinear(X)

  return X, y

In [64]:
X, y = fetch_hcv()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:,'Sex'] = X['Sex'].map({'m': 0, 'f': 1})
  X.loc[:,'Sex'] = X['Sex'].map({'m': 0, 'f': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.loc[:,'Category'] = y['Category'].map({'0=Blood Donor': 0,
  y.loc[:,'Category'] = y['Category'].map({'0=Blood Donor': 0,


In [66]:
print(len(X[0]))

12
