## Basic KNN classifier

In [None]:
# Train and evaluate a k-nearest-neighbours classifier on the iris dataset.
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()

# Keep only the first two feature columns of each sample.
X, y = iris.data[:, :2], iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits so the test data does not influence scaling.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# print(knn.predict_proba(X_test))
# print(y_pred)

# Get score: both lines report accuracy on the test split.
print(knn.score(X_test, y_test))
# accuracy_score expects (y_true, y_pred); print it, because a bare
# expression in the middle of a cell is silently discarded.
print(accuracy_score(y_test, y_pred))

# Print score report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# Confusion matrix (normalize='all' divides every entry by the total count)
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_pred=y_pred, y_true=y_test, normalize='all'))

# Other metrics
# NOTE: these are regression metrics; they treat the class labels 0/1/2 as
# numbers and are shown here only to illustrate the API.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("Mean absolute error: %s." % mean_absolute_error(y_test, y_pred))
print("Mean squared error: %s." % mean_squared_error(y_test, y_pred))
print("R2 score: %s." % r2_score(y_test, y_pred))

## Scaling and Normalization

In [None]:
# Preprocessing transformers: fit on the training split, then apply the
# fitted transformer to both splits.
from sklearn.preprocessing import Normalizer, StandardScaler, Binarizer

# Scaling
# The scaler learns mean/variance from X_train only. Calling fit_transform on
# X_test would re-fit it on the test data (data leakage and inconsistent
# scaling between the two splits), so the test split uses transform().
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Normalization (rescales each sample/row individually)
normalizer = Normalizer().fit(X_train)
X_train_normalized = normalizer.transform(X_train)
X_test_normalized = normalizer.transform(X_test)

# Binarization (values above the threshold become 1, the rest 0)
binarizer = Binarizer(threshold=5).fit(X_train)
X_train_binarized = binarizer.transform(X_train)
X_test_binarized = binarizer.transform(X_test)

## Modelling

In [None]:
# Fit an ordinary least squares model on a tiny synthetic dataset.
# numpy is imported here so the cell runs on its own; elsewhere in the
# notebook it is only imported in a later cell.
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression

# Sample X
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# Sample Y: y = 1 * x_0 + 2 * x_1 + 3
y = np.dot(X, np.array([1, 2])) + 3
reg = LinearRegression().fit(X, y)

# Only the last expression of a notebook cell is displayed automatically,
# so the intermediate results are printed explicitly.
print(reg.score(X, y))  # R^2
print(reg.coef_)  # Show coefficients: 1 and 2 in this case
print(reg.intercept_)  # Show y intercept
print(reg.predict(np.array([[3, 5]])))  # Predict y for x_0 = 3 and x_1 = 5
reg.get_params()

In [None]:
# Other models: import each estimator class, then create an unfitted instance.
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Support vector classifier with a linear kernel
svc = SVC(kernel="linear")

# Gaussian naive Bayes classifier
gnb = GaussianNB()

# k-nearest-neighbours classifier voting over 5 neighbours
knn = KNeighborsClassifier(n_neighbors=5)

# PCA; a float n_components asks for enough components to explain that
# fraction of the variance (here 95%)
pca = PCA(n_components=0.95)

# k-means clustering into 3 clusters with a fixed seed
kmeans = KMeans(n_clusters=3, random_state=0)

## Cross validation

In [None]:
# Cross-validate a linear SVC on the full iris dataset.
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.svm import SVC

X, y = datasets.load_iris(return_X_y=True)
svc = SVC(C=1, kernel='linear', random_state=0)

# Default scoring with 5 folds
scores = cross_val_score(svc, X, y, cv=5)

# Using another scoring metric (macro-averaged F1) with 10 folds
scores = cross_val_score(svc, X, y, scoring="f1_macro", cv=10)

# Using multiple scoring metrics: cross_validate returns a dict with one
# "test_<metric>" entry per requested scorer
scores = cross_validate(
    svc,
    X,
    y,
    scoring=["precision_macro", "recall_macro"],
    cv=10,
)
print(scores['test_recall_macro'])


## Repeated K-fold

In [None]:
# Show the train/test index sets produced by repeated K-fold splitting.
from sklearn.model_selection import RepeatedKFold
import numpy as np

X = np.array([[456, 234], [376, 732], [68, 218], [7456, 7456]])

# Fixed seed so the shuffled folds are reproducible
random_state = 12883823

# 2 folds repeated 2 times -> 4 (train, test) index pairs in total
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state)
for train, test in rkf.split(X):
    print("%s %s" % (train, test))
    # Print the selected rows too; as bare expressions inside a loop,
    # X[train] and X[test] would be evaluated and thrown away.
    print(X[train])
    print(X[test])

## Time series cross validation

In [None]:
# Show the expanding-window splits produced by TimeSeriesSplit.
from sklearn.model_selection import TimeSeriesSplit

# Six samples in time order, two features each
X = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
    [7, 8],
    [9, 10],
    [11, 12],
])
# One target value per sample (not needed by split() below)
y = np.array([1, 2, 3, 4, 5, 6])

tscv = TimeSeriesSplit(n_splits=3)

# Each iteration yields index arrays; print them, then the rows they select
for train, test in tscv.split(X):
    print("%s %s" % (train, test))
    print(X[train])
    print(X[test])

## Make pipeline

In [None]:
# Cross-validate a scaler + classifier pipeline.
# NOTE(review): this cell used to start with the IPython cell magic
# `%%writefile test.txt`, which writes the cell's text to a file instead of
# executing it. The magic is removed so the pipeline example actually runs;
# restore it only if dumping the source to test.txt was intentional.
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load a dataset large enough for 10-fold cross validation.
X, y = load_iris(return_X_y=True)

# Putting the scaler inside the pipeline means it is re-fitted on the
# training part of every fold, so no test-fold statistics leak into scaling.
classifier = make_pipeline(StandardScaler(), SVC(C=1, kernel="linear"))
cross_val_score(classifier, X, y, cv=10)

## Impute missing values

In [None]:
# Fill in missing values with scikit-learn imputers.
from sklearn import impute

# Toy matrix with a single missing entry (second row, second column)
s = np.array([[3, 3], [4, np.nan], [5, 5], [6, 6]])

# Mean imputer; add_indicator=True appends missing-value indicator columns
imp = impute.SimpleImputer(
    missing_values=np.nan,
    strategy='mean',
    add_indicator=True,
)
# Example calls for the mean imputer (left disabled):
# imp.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
# print(imp.fit_transform([[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]))

# KNN imputer; this rebinds `imp`, so the mean imputer above is never applied
imp = impute.KNNImputer(
    n_neighbors=2,
    weights="uniform",
    add_indicator=True,
)
imp.fit_transform(s)

## Make polynomial features

In [None]:
# Expand two input features into all polynomial terms up to degree 2.
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
    [7, 8],
    [9, 10],
    [11, 12],
])
# Last expression of the cell, so the notebook displays the expanded matrix
poly.fit_transform(X)

## Grid param search

In [None]:
# Exhaustive hyperparameter search for a KNN classifier on iris.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Candidate neighbour counts: 1 and 2 (np.arange excludes the stop value)
params = {"n_neighbors": np.arange(1, 3)}

iris = datasets.load_iris()

# First two feature columns only
X = iris.data[:, :2]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# Scale both splits with statistics learned from the training split
scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

knn = neighbors.KNeighborsClassifier()

grid = GridSearchCV(knn, params, scoring='accuracy')
grid.fit(X_train, y_train)

# Best mean cross-validated accuracy and the neighbour count that achieved it
# (author's recorded run: 2 neighbors, score = 0.8027667984189722)
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)

In [None]:
# Randomized hyperparameter search for the KNN classifier on iris.
from sklearn.model_selection import RandomizedSearchCV

# First two feature columns only; same split seed as the grid-search cell
X = iris.data[:, :2]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# Scale both splits with statistics learned from the training split
scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# 4 neighbour counts x 2 weighting schemes = 8 possible combinations
params = {"n_neighbors": range(1,5), "weights": ["uniform", "distance"]}

# Sample 8 candidates, each scored with 4-fold cross validation;
# `knn` is the estimator created in the grid-search cell
rsearch = RandomizedSearchCV(
    knn,
    params,
    n_iter=8,
    cv=4,
    random_state=5,
)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)