# TM10007 Assignment template

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [50]:
# import packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# import functions
from worclipo.load_data import load_data
from worclipo.split_set import split_set

data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')
print(type(data))


The number of samples: 115
The number of columns: 494
<class 'pandas.core.frame.DataFrame'>


# setting up the data to be processed

In [51]:
# change lipoma = 1 and liposarcoma = 0 and encode labels
group_names = list(set(data.label))
data.loc[data['label'] == 'lipoma', 'label'] = 1
data.loc[data['label'] == 'liposarcoma', 'label'] = 0
data['label'] = pd.cut(data['label'], bins = 2, labels=group_names)
print(data['label'].unique())
label_diag = LabelEncoder()
data['label'] = label_diag.fit_transform(data['label'])

# assign X to measurements and y to outcome (lipoma/sarcoma)
X = data.drop('label', axis=1)
y = data['label']
test_size = 0.3

['liposarcoma', 'lipoma']
Categories (2, object): ['liposarcoma' < 'lipoma']


In [52]:
# code that splits the data into test and validation sets if this is not done already

split_action = split_set(X,y,test_size)
print(split_action)

# if os.path.exists('./TEST_set.csv'):
#     print('TEST_set.csv already exists')
# else:
#     split_set(X,y,test_size)
#     print('TEST_set.csv does not exist, generating new test and training sets')

TEST_set.csv already exists
None


## import the training set

In [53]:
TRAIN = pd.read_csv('TRAIN_set.csv', index_col=0)
X_train = TRAIN.drop('label', axis=1)
y_train = TRAIN['label']

TRAIN['label'] = pd.cut(TRAIN['label'], bins = 2, labels=group_names)
print(TRAIN['label'].unique())
label_diag = LabelEncoder()
TRAIN['label'] = label_diag.fit_transform(TRAIN['label'])

# split into training and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=10)

# scale the training set
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_valid = sc.transform(X_valid)

# preprocess the data 
if not str in data.values:
    print(f'All values are numerical.')
if np.isnan(X_train).any():
    imp = SimpleImputer(missing_value=np.nan, strategy='mean')
    X_train.iloc[:,:] = imp.fit_transform(X_train)
    X_valid.iloc[:,:] = imp.fit(X_valid)
    print(f'All NaN values replaced by mean value of the column.')

# remove features with zero variance
print(f'Size before removal zero variances: ', X_train.shape)
vt = VarianceThreshold(threshold=0)
X_train = vt.fit_transform(X_train)
X_test = vt.transform(X_valid)
print(f'Size after removal zero variance: ', X_train.shape)

['liposarcoma', 'lipoma']
Categories (2, object): ['liposarcoma' < 'lipoma']


# Classify

In [54]:

# try a classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
rfc = RandomForestClassifier(n_estimators=500)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_valid)

print('rfc', classification_report(y_valid, pred_rfc))
print(confusion_matrix(y_valid, pred_rfc))

rfc               precision    recall  f1-score   support

           0       0.56      0.91      0.69        11
           1       0.83      0.38      0.53        13

    accuracy                           0.62        24
   macro avg       0.69      0.65      0.61        24
weighted avg       0.71      0.62      0.60        24

[[10  1]
 [ 8  5]]
