<a href="https://colab.research.google.com/github/jessicamadridmejia/Cheat-Sheets/blob/master/3_Scikit_Learn_Cheat_Sheet_Python_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scikit-learn
Scikit-learn is an open source Python library that implements a range of machine learning, preprocessing, cross-validation and visualization algorithms using a unified interface.



## A Basic Example

In [59]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()
X, y = iris.data[:,:2], iris.target

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=33)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)

y_pred = knn.predict(X_test)
accuracy_score(y_pred, y_test)

0.631578947368421

## Loading The Data

Your data needs to be numeric and stored as NumPy arrays or SciPy sparse
matrices. Other types that are convertible to numeric arrays, such as Pandas
DataFrame, are also acceptable.

In [60]:
import numpy as np

X = np.random.random((10,5))
y = np.array(['M','M','F','F','M','F','M','M','F','F'])

X[X < 0.7] = 0

X

array([[0.7584019 , 0.        , 0.99476562, 0.        , 0.78921816],
       [0.        , 0.        , 0.97151424, 0.        , 0.97978104],
       [0.95560261, 0.        , 0.        , 0.        , 0.8434506 ],
       [0.84470202, 0.        , 0.        , 0.        , 0.        ],
       [0.78558866, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.902645  , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.94857593, 0.        ],
       [0.        , 0.        , 0.94810829, 0.        , 0.        ],
       [0.        , 0.        , 0.79047242, 0.        , 0.        ]])

## Training And Test Data

In [0]:
from sklearn.model_selection import  train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

## Preprocessing The Data



### Standarization

In [0]:
from sklearn.preprocessing import StandardScaler

scaler =  StandardScaler().fit(X_train)

standardize_X = scaler.transform(X_train)
standardize_X_test = scaler.transform(X_test)

### Normalization

In [0]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer().fit(X_train)
normalized_X = normalizer.transform(X_train)
normalized_X_test = normalizer.transform(X_test)

### Binarization

In [0]:
from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=0.0).fit(X)
binarizer_X = binarizer.transform(X) 

### Encoding Categorical Features

In [0]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
y = enc.fit_transform(y)

### Imputing Missing Values

In [66]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=0, strategy='mean')
imp.fit_transform(X_train)

array([[0.83524964, 0.79047242, 0.94857593, 0.8844996 ],
       [0.83524964, 0.97151424, 0.94857593, 0.97978104],
       [0.902645  , 0.91891743, 0.94857593, 0.8844996 ],
       [0.83524964, 0.91891743, 0.94857593, 0.8844996 ],
       [0.84470202, 0.91891743, 0.94857593, 0.8844996 ],
       [0.7584019 , 0.99476562, 0.94857593, 0.78921816],
       [0.83524964, 0.91891743, 0.94857593, 0.8844996 ]])

### Generating Polynomial Features

In [67]:
from sklearn.preprocessing import  PolynomialFeatures

poly = PolynomialFeatures(5)
poly.fit_transform(X)

array([[1.        , 0.7584019 , 0.        , ..., 0.        , 0.        ,
        0.30618601],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.90291143],
       [1.        , 0.95560261, 0.        , ..., 0.        , 0.        ,
        0.42687258],
       ...,
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Create Your Model

### Supervised Learning Estimators

#### Linear Regression

In [0]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression(normalize=True)

#### Support Vector Machines (SVM)

In [0]:
from sklearn.svm import  SVC

svc = SVC(kernel='linear')

#### Naive Bayes

In [0]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

#### KNN

In [0]:
from sklearn import neighbors

knn = neighbors.KNeighborsClassifier(n_neighbors=5)

### Unsupervised Learning Estimator


#### Principal Component Analysis (PCA)

In [0]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)

#### K Means

In [0]:
from sklearn.cluster import  KMeans

k_means = KMeans(n_clusters=3, random_state=0)

## Model Fitting

### Supervised learning

In [74]:
# Fit the model to the data
lr.fit(X,y)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### Unsupervised Learning

In [0]:
# Fit the model to the data
k_means.fit(X_train)

# Fit to data, then transform it
pca_model = pca.fit_transform(X_train)

## Prediction

### Supervised Estimators

In [0]:
# Predict labels
y_pred = svc.predict(np.random.random((2,5)))

# Predict labels
y_pred = lr.predict(X_test)

# Estimate probability of a label
y_pred = knn.predict_proba(X_test)

### Unsupervised Estimators

In [0]:
# Predict labels in clustering
y_pred = k_means.predict(X_test)

## Evaluate Your Model's Performance

### Classification Metrics

#### Accuracy Score

In [78]:
# Estimator score method
knn.score(X_test, y_test)

# Metric scoring functions
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

  score = y_true == y_pred


0.0

#### Classification Report

In [0]:
# Precision, recall, f1-score and support
from sklearn.metrics import classification_report
#print(classification_report(y_test, y_pred))

#### Confusion Matrix

In [0]:
from sklearn.metrics import confusion_matrix
#confusion_matrix(y_test, y_pred)

### Regression Metrics

#### Mean Absolute Error

In [81]:
from sklearn.metrics import mean_absolute_error

y_true = [3,-0.5,2]
mean_absolute_error(y_true,y_pred)

2.5

#### Mean Squared Error

In [0]:
from sklearn.metrics import mean_squared_error

#mean_absolute_error(y_test, y_pred)

#### R2 Score

In [83]:
from sklearn.metrics import  r2_score

r2_score(y_true, y_pred)

-1.9615384615384617

### Clustering Metrics

#### Adjusted Rand Index

In [84]:
from sklearn.metrics import  adjusted_rand_score
adjusted_rand_score(y_true,y_pred)

0.0

#### Homogeneity

In [85]:
from sklearn.metrics import homogeneity_score
homogeneity_score(y_true,y_pred)

0.5793801642856952

#### V-measure

In [86]:
from sklearn.metrics import v_measure_score
v_measure_score(y_pred, y_true)

0.7336804366512111

### Cross-Validation

In [87]:
from sklearn.model_selection import  cross_val_score
print(cross_val_score(knn, X_train, y_train, cv=4))

[0.5 0.5 0.5 0. ]




## Tune Your Model

### Grid Search

In [88]:
from sklearn.model_selection import GridSearchCV
params = {'n_neighbors': np.arange(1,3),
          'metric': ['euclidean', 'cityblock']}
grid = GridSearchCV(estimator=knn, param_grid=params)
"""
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.bes_estimator_.n_neighbors)
"""

'\ngrid.fit(X_train, y_train)\nprint(grid.best_score_)\nprint(grid.bes_estimator_.n_neighbors)\n'

### Randomized Parameter Optimization

In [0]:
from sklearn.model_selection import RandomizedSearchCV
params = {'n_neighbors': range(1,5),
          'weights': ['uniform','distance']}

rsearch = RandomizedSearchCV(estimator=knn,param_distributions=params, cv=4,n_iter=8, random_state=5)

#rsearch.fit(X_train, y_train)
#print(rsearch.best_score)