## Machine Learning

![alt text](https://i.sstatic.net/IMVOl.png "Machine Learning")

In [None]:
!pip install scikit-learn seaborn

## Load dataset

In [None]:
from sklearn import datasets

# Load the Iris dataset that ships with scikit-learn. Later cells read its
# `data`, `target` and `feature_names` entries.
iris = datasets.load_iris()
# Bare last expression: display the loaded object as the cell output.
iris

In [None]:
# Preview another bundled dataset (breast cancer); the result is only
# displayed, it is not stored in a variable.
datasets.load_breast_cancer()


In [None]:
# Preview another bundled dataset (diabetes); the result is only displayed,
# it is not stored in a variable.
datasets.load_diabetes()

In [None]:
import matplotlib.pyplot as plt

from sklearn import datasets, svm
from sklearn.inspection import DecisionBoundaryDisplay

# Reload iris and keep only its first two features so that the decision
# regions can be drawn in a plane (a natively two-dimensional dataset would
# make this slicing unnecessary).
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# The data is intentionally left unscaled before fitting the SVMs.
C = 1.0  # SVM regularization parameter

# Plot title -> classifier; insertion order decides the position in the grid.
classifiers = {
    "SVC with linear kernel": svm.SVC(kernel="linear", C=C),
    "LinearSVC (linear kernel)": svm.LinearSVC(C=C, max_iter=10000),
    "SVC with RBF kernel": svm.SVC(kernel="rbf", gamma=0.7, C=C),
    "SVC with polynomial (degree 3) kernel": svm.SVC(
        kernel="poly", degree=3, gamma="auto", C=C
    ),
}
titles = tuple(classifiers)
# `fit` returns the estimator itself, so this is a tuple of fitted models.
models = tuple(estimator.fit(X, y) for estimator in classifiers.values())

# 2x2 grid of axes, one panel per classifier.
fig, axes = plt.subplots(2, 2)
fig.subplots_adjust(wspace=0.4, hspace=0.4)

x_label, y_label = iris.feature_names[0], iris.feature_names[1]

for ax, title, clf in zip(axes.ravel(), titles, models):
    # Shade the plane with the class predicted at every grid point.
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        response_method="predict",
        cmap=plt.cm.coolwarm,
        alpha=0.8,
        ax=ax,
        xlabel=x_label,
        ylabel=y_label,
    )
    # Overlay the training samples, coloured by their true class.
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
    ax.set(xticks=(), yticks=(), title=title)

plt.show()

## Data Visualization

In [None]:
import numpy as np
import pandas as pd

# Put the feature matrix and the class labels side by side in one frame:
# one column per iris feature plus a trailing `target` column.
iris_columns = [*iris.feature_names, "target"]
data = pd.DataFrame(np.c_[iris.data, iris.target], columns=iris_columns)
data


In [None]:
# Encoding of the `target` column:
#   0 -> Iris-setosa
#   1 -> Iris-versicolor
#   2 -> Iris-virginica

# Number of samples per class.
class_counts = data["target"].value_counts()
class_counts

In [None]:

import seaborn as sns

# Pairwise scatter plots of every column, coloured by class, drawn with
# '+' markers in the 'husl' palette.
sns.set_palette("husl")
g = sns.pairplot(data=data, hue="target", markers="+")
plt.show()

In [None]:
# One violin plot per feature, each in its own figure: the distribution of
# that feature split by class, with quartile lines drawn inside the violins.
for feature in iris.feature_names:
    fig, ax = plt.subplots()
    sns.violinplot(data=data, x="target", y=feature, inner="quartile", ax=ax)
    plt.show()
    

## Train and test on the same dataset

In [None]:
# Feature matrix: every column except the label. Target vector: the label.
X = data.drop(columns="target")
y = data["target"]
# Print both shapes, one per line.
print(X.shape, y.shape, sep="\n")

In [None]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes: k = 1 .. 25.
k_range = list(range(1, 26))

# For every k, fit on the full dataset and score on the very same rows.
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X, y)
    scores.append(metrics.accuracy_score(y, knn.predict(X)))

# Accuracy as a function of k.
fig, ax = plt.subplots()
ax.plot(k_range, scores)
ax.set(
    xlabel='Value of k for KNN',
    ylabel='Accuracy Score',
    title='Accuracy Scores for Values of k of k-Nearest-Neighbors',
)
plt.show()

In [None]:
from sklearn.svm import SVC

# Fit a default SVC on the full dataset and predict the same rows.
svc = SVC().fit(X, y)
y_pred = svc.predict(X)
print(metrics.accuracy_score(y, y_pred))

# Row-by-row comparison: true label, match marker, predicted label.
for actual, predicted in zip(y, y_pred):
    print(actual, "==" if actual == predicted else "=/=", predicted)

## Split the dataset into a training set and a testing set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing. `random_state` is fixed so that the
# split, and every accuracy computed from it, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)
# Shapes of the four resulting pieces.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Try every k from 1 to 25: fit on the training split, score on the test split.
k_range = list(range(1, 26))
scores = [
    metrics.accuracy_score(
        y_test,
        KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).predict(X_test),
    )
    for k in k_range
]

# Held-out accuracy as a function of k.
plt.plot(k_range, scores)
plt.gca().set(
    xlabel='Value of k for KNN',
    ylabel='Accuracy Score',
    title='Accuracy Scores for Values of k of k-Nearest-Neighbors',
)
plt.show()

In [None]:
# Refit on all available rows with the chosen neighbourhood size.
knn = KNeighborsClassifier(n_neighbors=12)
knn.fit(X, y)

# Make a prediction for an out-of-sample observation. The model was fitted on
# a DataFrame, so the new row is given the same column names; passing a bare
# nested list makes scikit-learn warn that X has no valid feature names.
new_observation = pd.DataFrame([[6, 3, 4, 2]], columns=X.columns)
knn.predict(new_observation)

## Regression

In [None]:
# Read a housing CSV (presumably the Boston housing data, going by the file
# name) straight from GitHub; this cell needs network access.
data_url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
# on_bad_lines='skip' drops malformed rows instead of raising an error.
# NOTE: this rebinds `data`, which held the iris DataFrame in earlier cells.
data = pd.read_csv(data_url, on_bad_lines='skip')
data

In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_validate, cross_val_predict

# Predictors are every column except `medv`; `medv` is the regression target.
X = data.drop(columns=["medv"])
y = data.medv

# Elastic-net regressor with default hyper-parameters, fitted on all rows.
model = linear_model.ElasticNet()
model.fit(X, y)

# Scoring names evaluated on every cross-validation fold.
scorers = [
    "explained_variance",
    "max_error",
    "neg_mean_absolute_error",
    "neg_mean_squared_error",
    "neg_root_mean_squared_error",
    "neg_median_absolute_error",
    "r2",
]

# 5-fold cross-validation; the returned dict holds one array per timing
# entry and per scorer.
score = cross_validate(model, X, y, cv=5, scoring=scorers)
score



In [None]:
# One row per cross-validation fold, one column per entry of `score`.
df = pd.DataFrame(score)

# Per-column mean and standard deviation across folds, plus a flat
# "cross_validate.<name>.<stat>" mapping of the same numbers.
# The mapping is called `cv_metrics` (not `metrics`) so that it does not
# overwrite the `sklearn.metrics` module imported under that name earlier.
mean = []
std = []
cv_metrics = {}
for name, fold_values in score.items():
    mean.append(fold_values.mean())
    std.append(fold_values.std())
    cv_metrics[f"cross_validate.{name}.mean"] = mean[-1]
    cv_metrics[f"cross_validate.{name}.std"] = std[-1]

# Append the two summary rows below the per-fold rows.
df.loc['mean'] = mean
df.loc['std'] = std
df

In [None]:
# Cross-validated predictions for `model` over X and y, using 5 folds.
preds = cross_val_predict(model, X, y, cv=5)
# Bare last expression: display the predictions as the cell output.
preds

In [None]:
# Measured target values against the cross-validated predictions.
fig, ax = plt.subplots()
ax.scatter(y, preds)
# Dashed diagonal spanning the target range: points on it are predicted exactly.
y_span = [y.min(), y.max()]
ax.plot(y_span, y_span, 'k--', lw=4)
ax.set_xlabel('measured')
ax.set_ylabel('predicted')

In [None]:
from sklearn import linear_model

# Replace the regressor with a default Lasso model fitted on all rows;
# `fit` returns the estimator itself, which is then displayed.
model = linear_model.Lasso().fit(X, y)
model

In [None]:
# 5-fold cross-validated predictions for the current `model`.
preds = cross_val_predict(model, X, y, cv=5)

# Measured against predicted, with a dashed diagonal marking exact predictions.
fig, ax = plt.subplots()
ax.scatter(y, preds)
diagonal = [y.min(), y.max()]
ax.plot(diagonal, diagonal, 'k--', lw=4)
ax.set_xlabel('measured')
ax.set_ylabel('predicted')