# From data pre-processing to the use of algorithms

In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
import random
# Seed Python's `random` module and NumPy's global random generator so that
# code drawing from them produces the same numbers on every fresh run.
random.seed(42)
np.random.seed(42)

In [None]:
def calculate_accuracy(y, y_pred):
    """Return the fraction of predictions that match the true labels.

    Args:
        y: True labels; any array-like (list, tuple, NumPy array, ...).
        y_pred: Predicted labels with the same number of elements as ``y``.

    Returns:
        The mean of the element-wise equality of both inputs, i.e. a value
        between 0.0 and 1.0. Empty inputs yield ``nan`` because that is what
        ``np.mean`` returns for an empty array.

    Raises:
        ValueError: If ``y`` and ``y_pred`` differ in number of elements.
    """
    # Convert first: ``==`` on plain lists yields one bool for the whole
    # list rather than an element-wise comparison. Flattening also stops an
    # (n, 1) column from broadcasting against an (n,) vector into (n, n).
    y_true = np.asarray(y).ravel()
    y_hat = np.asarray(y_pred).ravel()
    if y_true.shape != y_hat.shape:
        raise ValueError(
            f"y and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_hat.size}"
        )
    return np.mean(y_true == y_hat)

In [None]:
# Load the breast cancer dataset shipped with scikit-learn into a DataFrame:
# one column per feature plus the class label in a final 'target' column.
data = load_breast_cancer()
# Pass the feature names directly. Wrapping them in a list makes pandas build
# a one-level MultiIndex for the columns instead of a plain, flat Index.
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = pd.Series(data=data.target, index=df.index)
df

## Data pre-processing

### How many rows and columns do we have?

### How many malignant and how many benign rows do we have? (target column)

### Check for null values and decide how to handle them

### What are the data types of the columns? Do we need to one-hot encode anything here?

### Are there values that need to be normalized? If so, then normalize them before putting them into the algorithms

### Feature selection

In [None]:
# We have 30 dimensions in this dataset, which is a lot!
# Let's use a basic random forest from sklearn and keep only the most influential features.
# The random forest implementation has a 'feature_importances_' attribute that you can read.
# Use only the top 10 features.

In [None]:
# Separate the feature matrix X (every column except the last) from the
# labels y (the last column) and convert both to NumPy arrays.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

# Hold out 20% of the rows for testing. The fixed random_state makes the split
# identical every time this cell runs, instead of depending on how much of the
# global NumPy random stream has already been consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Fit a small, seeded random forest (3 trees) on the training split; its
# feature importances are read afterwards to select features.
clf = RandomForestClassifier(random_state=42, n_estimators=3)
clf.fit(X=X_train, y=y_train)

In [None]:
# Rank the features by the importance the fitted forest assigned to them and
# keep only the 10 most important ones, as the exercise text asks for.
# NOTE: this cell overwrites X_train / X_test, so re-running it without
# re-creating the split would index the already-reduced arrays.
feature_importances = clf.feature_importances_
# argsort is ascending, so the last entries are the most important features.
important_features = np.argsort(feature_importances)[-10:]
# Apply the same column selection to both splits so their columns stay aligned.
X_train = X_train[:, important_features]
X_test = X_test[:, important_features]

## Algorithms

In [None]:
# Use the sklearn library to execute the following algorithms
# Print the accuracies of all models by using the calculate_accuracy function from above

### k-NN

### Logistic Regression

### Support Vector Machine

### Random Forest Classifier

## What happens if you use the following dataset?

In [None]:
def create_dataset():
    """Generate a fixed 2-D toy dataset with a circular class boundary.

    Returns:
        A tuple ``(X, y)``. ``X`` holds 100 points with two normally
        distributed coordinates each; ``y`` is 1 for every point whose
        Euclidean distance from the origin is below 0.5 and 0 otherwise.
    """
    # Re-seed the global NumPy generator on every call so the generated
    # points are always the same.
    np.random.seed(42)
    first_coord = np.random.normal(size=100)
    second_coord = np.random.normal(size=100)
    X = np.stack((first_coord, second_coord), axis=1)

    # Label each point by whether it lies inside the circle of radius 0.5.
    labels = []
    for point in X:
        radius = (point[0]**2 + point[1]**2)**0.5
        labels.append(1 if radius < 0.5 else 0)
    y = np.array(labels)

    return X, y

In [None]:
# Build the synthetic dataset, then hold out 20% of its points for testing.
X, y = create_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2)

### k-NN

### Logistic regression

### Support Vector Machine

### Random Forest Classifier