# Dataset loading and handling utilities in sklearn

In [None]:
from sklearn.datasets import load_iris

In [None]:
# Load the iris dataset through sklearn.datasets.load_iris and keep it in
# `iris`; the cells below read its entries by key.
iris = load_iris()

In [None]:
# List the keys available on the loaded dataset object (as the last
# expression of the cell, the result is shown as the cell output).
iris.keys()

In [None]:
# Show the 'data' entry; it is used further down as the model input X.
print(iris['data'])

In [None]:
# Show the 'feature_names' entry of the dataset.
print(iris['feature_names'])

In [None]:
# Show the 'target' entry; it is used further down as the labels y.
print(iris['target'])

In [None]:
# Show the 'target_names' entry of the dataset.
print(iris['target_names'])

# Classification & Regression
* ## Classification - output set is discrete - categorizing the input into one of a finite set of labels or classes
* ## Regression - output set is continuous - the output consists of one or more continuous variables

# Classification with Sklearn

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fit a KNeighborsClassifier (default settings) on the whole iris dataset
# and predict labels for those same samples.
# NOTE: predictions are made on the data the model was fitted on, so any
# score derived from `prediction` is a training score, not an estimate of
# performance on unseen samples.
X, y = iris['data'], iris['target']
knn = KNeighborsClassifier()
knn.fit(X, y)
prediction = knn.predict(X)

In [None]:
# Percentage of samples whose predicted label equals the true label.
is_match = prediction == y
correct = is_match.sum()
accuracy = correct / len(y) * 100
report = "Accuracy: {} %".format(accuracy)
print(report)

## Explanation of Nearest Neighbour Classification
<ul>
<li><h3>1. Memorize the dataset</h3></li>
<li><h3>2. For each test sample:</h3>
<ul>
<li><h4>1. Find the most similar <i>training</i> sample.</h4></li>
<li><h4>2. Assign the label of the most similar training sample to the test sample.</h4></li>
<li><h4>3. Generalize to the $k$ nearest samples and have them vote on the label.</h4></li>
</ul>
</li>
<li><h3>3. What is <i>similarity</i>?</h3>
<ul>
<li><h4>1. Depends on <i>distance</i>. Generally, $\text{similarity} \propto \frac{1}{\text{distance}}$</h4></li>
<li><h4>2. Distances can be of multiple types.</h4></li>
<li><h4>3. Choosing a distance metric depends on the distribution of the data.</h4></li>
</ul>
</li>
</ul>

# Regression with sklearn

In [None]:
from sklearn.datasets import load_boston

In [None]:
# Load the Boston housing dataset into `boston` and print its available keys.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell (and the import above it) fails on newer versions.
# Confirm the scikit-learn version this notebook targets, or switch to
# another regression dataset such as fetch_california_housing.
boston = load_boston()
print(boston.keys())

In [None]:
# Show the 'data' entry; it is used further down as the regression input X.
print(boston['data'])

In [None]:
# Show the 'feature_names' entry of the dataset.
print(boston['feature_names'])

In [None]:
# Show the 'DESCR' entry (presumably the dataset's text description).
print(boston['DESCR'])

In [None]:
# Show the 'target' entry; it is used further down as the regression target y.
print(boston['target'])

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a LinearRegression model on the whole Boston dataset and predict the
# target for the same samples that were used for fitting.
X, y = boston['data'], boston['target']
lr = LinearRegression()
lr.fit(X, y)
prediction = lr.predict(X)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R^2 score of the predictions against the true targets (true values first,
# predictions second). `prediction` was computed on the same samples the
# model was fitted on, so this is a training-set score.
r2_score(y, prediction)

## Exercise: Use DecisionTreeClassifier and LogisticRegression on the digits dataset (scikit-learn's small 8x8 handwritten-digit set, similar to MNIST; its loader is imported below) and compare their performance.

In [None]:
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
# enter code here

## Exercise: Use LinearRegression and the Support Vector Regressor (SVR) on the diabetes dataset (provided below) and compare their performance.

In [None]:
from sklearn.svm import SVR
from sklearn.datasets import load_diabetes

In [None]:
# enter code here