# Overview

This is a handy Scikit-Learn "cheat sheet" for Machine Learning with Python, consisting mainly of code examples. It covers the basic steps needed to implement Machine Learning algorithms successfully: loading the data, preprocessing it, creating a model and fitting it to the data, predicting target labels, validating the model, and tuning it further to improve its performance.

* [A Basic Code Example](#basic-code-example)
* [Data Loading](#data-loading)
* [Splitting the Data into Training and Test Data](#splitting)
* [Data Preprocessing](#data-preprocessing)
* [Creating a Model](#model-creation)
    * [Supervised Learning Estimators](#supervised-models)
    * [Unsupervised Learning Estimators](#unsupervised-models)
* [Model Fitting](#model-fitting)
* [Prediction](#prediction) 
* [Evaluating Model Performance](#evaluation)
    * [Regression Metrics](#regression-metrics)
    * [Classification Metrics](#classification-metrics)
    * [Clustering Metrics](#clustering-metrics)
    * [Cross-Validation](#cross-validation)
* [Model Tuning](#tuning)
    * [Grid Search](#grid-search)
    * [Randomized Parameter Optimization](#random-parameter-opt)

# A Basic Code Example <a class="anchor" id='basic-code-example'></a> 

In [1]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris data set and keep only its first two feature columns.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# Hold out a test split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Train a 5-nearest-neighbours classifier and predict the held-out labels.
knn = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Fraction of test labels predicted correctly (shown as the cell output).
accuracy_score(y_test, y_pred)

0.631578947368421

# Data Loading <a class="anchor" id='data-loading'></a> 
For Scikit-Learn models, data needs to be numeric and stored as NumPy arrays or SciPy sparse matrices. Other types that are convertible to numeric arrays, such as Pandas DataFrames, are also acceptable.

In [2]:
import numpy as np

# Toy feature matrix: 10 samples with 5 random features each.
X_sample = np.random.random((10, 5))
# Exactly one label per row of X_sample (10 labels for 10 samples), so the
# two arrays can be used together as features and targets.
y_sample = np.array(['M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'F'])
# Zero out every entry below 0.7 so that most of the matrix is zeros.
X_sample[X_sample < 0.7] = 0

# Splitting the Data into Training and Test Data <a class="anchor" id='splitting'></a> 

In [3]:
from sklearn.model_selection import train_test_split

# Split features and labels into train/test parts in one call.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,  # fixed seed so the split is reproducible
)

# Data Preprocessing <a class="anchor" id='data-preprocessing'></a>

In [4]:
# Standardization
from sklearn.preprocessing import StandardScaler

# Fit on the training split only, then reuse the fitted scaler for both splits.
scaler = StandardScaler()
scaler.fit(X_train)
standardized_X = scaler.transform(X_train)
standardized_X_test = scaler.transform(X_test)

In [5]:
# Normalization
from sklearn.preprocessing import Normalizer

# Note: this rebinds `scaler`, replacing any scaler created earlier.
scaler = Normalizer()
scaler.fit(X_train)
normalized_X = scaler.transform(X_train)
normalized_X_test = scaler.transform(X_test)

In [6]:
# Binarization
from sklearn.preprocessing import Binarizer

# Build the binarizer with a threshold of 0.0, then fit and transform X in one step.
binarizer = Binarizer(threshold=0.0)
binary_X = binarizer.fit_transform(X)

In [7]:
# Encoding Categorical Features
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the labels, then overwrite y with its encoded form.
encoder = LabelEncoder()
encoder.fit(y)
y = encoder.transform(y)

In [8]:
# Imputing Missing Values
from sklearn.impute import SimpleImputer

# Treat NaN entries as missing and fill them using the 'mean' strategy.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit(X_train)
X_train_transformed = imp.transform(X_train)

In [9]:
# Generating Polynomial Features
from sklearn.preprocessing import PolynomialFeatures

# Fit a degree-5 polynomial expansion on X, then apply it to X.
poly = PolynomialFeatures(degree=5).fit(X)
X_transformed = poly.transform(X)

# Creating a Model <a class="anchor" id='model-creation'></a>  
### Creating Supervised Learning Estimators <a class="anchor" id='supervised-models'></a>

In [10]:
# Linear Regression
from sklearn.linear_model import LinearRegression
# The `normalize` argument no longer exists in current scikit-learn releases
# (passing it raises TypeError). If feature scaling is wanted, apply a
# StandardScaler to the inputs before fitting instead.
lr = LinearRegression()

# Support Vector Machines (SVM)
from sklearn.svm import SVC
svc = SVC(kernel='linear')

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

# KNN
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

### Creating Unsupervised Learning Estimators <a class="anchor" id='unsupervised-models'></a> 

In [11]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Principal Component Analysis (PCA)
pca = PCA(n_components=0.95)

# K Means (seeded so that repeated runs give the same clustering)
k_means = KMeans(random_state=0, n_clusters=3)

# Model Fitting <a class="anchor" id='model-fitting'></a> 

In [12]:
# Supervised Learning: the classifiers are trained on the training split.
svc.fit(X_train, y_train)
knn.fit(X_train, y_train)
# NOTE(review): lr is fitted on the full X, y, which also contains the rows
# of X_test. Regression scores computed on X_test are therefore measured on
# data the model has already seen; consider fitting on X_train, y_train.
lr.fit(X, y)

# Unsupervised Learning: only the features are passed, no labels.
k_means.fit(X_train)
# fit_transform returns the transformed training data, not a model object.
pca_model = pca.fit_transform(X_train)

# Prediction <a class="anchor" id='prediction'></a> 

In [13]:
# Supervised Estimators
y_pred_svc = svc.predict(X_test)
y_pred_lr = lr.predict(X_test)
# predict_proba is called here instead of predict, so despite its name
# y_pred_knn holds the output of predict_proba rather than predicted labels.
y_pred_knn = knn.predict_proba(X_test)

# Unsupervised Estimators
y_pred_kmeans = k_means.predict(X_test)

# Evaluating Model Performance <a class="anchor"  id='evaluation'></a>  
### Regression Metrics <a class="anchor" id='regression-metrics'></a>

In [14]:
# Mean Absolute Error
from sklearn.metrics import mean_absolute_error

# Compare the linear-regression predictions with the held-out targets.
mean_absolute_error(y_true=y_test, y_pred=y_pred_lr)

0.3562032677104089

In [15]:
# Mean Squared Error
from sklearn.metrics import mean_squared_error

# Compare the linear-regression predictions with the held-out targets.
mean_squared_error(y_true=y_test, y_pred=y_pred_lr)

0.19980847725160666

In [16]:
# R^2 Score
from sklearn.metrics import r2_score

# Compare the linear-regression predictions with the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred_lr)

0.6481421449374145

### Classification Metrics <a class="anchor" id='classification-metrics'></a> 

In [17]:
# Accuracy Score
from sklearn.metrics import accuracy_score

# Estimator's own score method; the value is not displayed because it is
# not the last expression in the cell.
knn.score(X_test, y_test)
# Metric function applied to the SVC predictions (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred_svc)

0.7631578947368421

In [18]:
# Classification Report
from sklearn.metrics import classification_report

# Print the text report for the SVC predictions on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred_svc))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.73      0.69      0.71        16
           2       0.50      0.56      0.53         9

    accuracy                           0.76        38
   macro avg       0.74      0.75      0.75        38
weighted avg       0.77      0.76      0.77        38



In [19]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix

# Print the confusion matrix for the SVC predictions on the test split.
print(confusion_matrix(y_true=y_test, y_pred=y_pred_svc))

[[13  0  0]
 [ 0 11  5]
 [ 0  4  5]]


### Clustering Metrics <a class="anchor" id='clustering-metrics'></a> 

In [20]:
# Adjusted Rand Index
from sklearn.metrics import adjusted_rand_score

# Compare the k-means cluster assignments with the true test labels.
adjusted_rand_score(labels_true=y_test, labels_pred=y_pred_kmeans)

0.4698674040093006

In [21]:
# Homogeneity
from sklearn.metrics import homogeneity_score

# Compare the k-means cluster assignments with the true test labels.
homogeneity_score(labels_true=y_test, labels_pred=y_pred_kmeans)

0.523405271415338

In [22]:
# V-measure
from sklearn.metrics import v_measure_score

# Compare the k-means cluster assignments with the true test labels.
v_measure_score(labels_true=y_test, labels_pred=y_pred_kmeans)

0.5315718798622953

### Cross-Validation <a class="anchor" id='cross-validation'></a> 

In [23]:
from sklearn.model_selection import cross_val_score

# Score the KNN classifier with 4-fold cross-validation on the training
# split and print one score per fold.
print(cross_val_score(estimator=knn, X=X_train, y=y_train, cv=4))

[0.73333333 0.75       0.81481481 0.77777778]


# Model Tuning <a class="anchor" id='tuning'></a>
### Grid Search <a class="anchor" id='grid-search'></a> 

In [24]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: every combination of these values is evaluated.
params = {"n_neighbors": np.arange(1,10), 
          "metric": ["euclidean", "cityblock"],
          "weights": ["uniform", "distance"]}

# The `iid` argument is not passed: it no longer exists in current
# scikit-learn releases and supplying it raises TypeError.
grid = GridSearchCV(estimator=knn, 
                    param_grid=params, 
                    cv=5)

grid.fit(X_train, y_train)
# Best cross-validated score and the n_neighbors value that achieved it.
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)

0.8128317334839075
9


### Randomized Parameter Optimization <a class="anchor" id='random-parameter-opt'></a> 

In [25]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values from which parameter settings are sampled.
params = {"n_neighbors": np.arange(1,10), 
          "metric": ["euclidean", "cityblock"],
          "weights": ["uniform", "distance"]}

# Try 8 sampled settings with 5-fold cross-validation; the seed fixes which
# settings are drawn. The `iid` argument is not passed: it no longer exists
# in current scikit-learn releases and supplying it raises TypeError.
rsearch = RandomizedSearchCV(estimator=knn, 
                             param_distributions=params,
                             cv=5,
                             n_iter=8,
                             random_state=2)

rsearch.fit(X_train, y_train)
# Best cross-validated score and the n_neighbors value that achieved it.
print(rsearch.best_score_)
print(rsearch.best_estimator_.n_neighbors)

0.804103143233578
7
