# TM10007 Assignment template

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [27]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# fetch data
from worclipo.load_data import load_data

data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')
print(type(data))


The number of samples: 115
The number of columns: 494
<class 'pandas.core.frame.DataFrame'>


# setting up the data to be processed

In [28]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
# change lipoma = 0 and liposarcoma = 1 and encode labels
group_names = list(set(data.label))
data.loc[data['label'] == 'lipoma', 'label'] = 0
data.loc[data['label'] == 'liposarcoma', 'label'] = 1
data['label'] = pd.cut(data['label'], bins = 2, labels=group_names)
print(data['label'].unique())
label_diag = LabelEncoder()
data['label'] = label_diag.fit_transform(data['label'])

# assign X to measurements and y to outcome (lipoma/sarcoma)
X = data.drop('label', axis=1)
y = data['label']

# splitting the data into test and train
from sklearn.model_selection import train_test_split
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

# scale the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(X_train)

# try a classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

print(classification_report(y_test, pred_rfc))
print(confusion_matrix(y_test, pred_rfc))

['liposarcoma', 'lipoma']
Categories (2, object): ['lipoma' < 'liposarcoma']
[[ 1.41607388e+00  2.75742741e-01  1.26753139e-01 ... -6.53277864e-01
  -1.52809004e+00 -3.58400285e-01]
 [ 3.53175800e-01  9.54383519e-01  2.21390016e+00 ...  2.75724793e+00
  -6.40585743e-01  1.83972594e+00]
 [-8.72704709e-01  1.61534848e+00 -3.86924431e-01 ... -2.19873633e-03
   1.03768519e+00  2.25059201e-01]
 ...
 [-5.64821198e-02 -7.80807173e-01 -1.38600246e+00 ... -8.05427523e-01
   6.81561972e-01 -2.36181895e+00]
 [ 1.91586875e+00 -1.60574383e+00 -2.09324758e-01 ... -3.62120381e-01
  -1.86358700e-01  4.14637674e-01]
 [ 4.49871492e-01  1.14097099e+00  7.59191985e-01 ... -6.93847070e-01
  -1.52809004e+00 -4.12892080e-01]]
              precision    recall  f1-score   support

           0       0.67      0.83      0.74        12
           1       0.75      0.55      0.63        11

    accuracy                           0.70        23
   macro avg       0.71      0.69      0.69        23
weighted avg   