In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from matplotlib.colors import ListedColormap

In [None]:
def generateColors(n):
    """Return a list of ``n`` random colours as ``'#RRGGBB'`` hex strings.

    Each of the three channels is drawn with ``random.randint(0, 255)``, so
    the result differs between runs unless ``random.seed`` has been called.
    """
    return [
        '#%02X%02X%02X' % tuple(random.randint(0, 255) for _ in range(3))
        for _ in range(n)
    ]

In [None]:
# Load the Iris dataset from a local copy of the UCI file.
# Remote alternative: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
url = "iris.data"
# Column names are supplied explicitly via `names`.
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
data = pd.read_csv(url, names=names)
# Make it a binary problem by removing one of the classes.
data = data[data['class'] != 'Iris-virginica']

# Features: petal length and petal width (columns 2 and 3), not the sepal columns.
X = data[['petal-length', 'petal-width']]
# Target: the class label (last column).
y = data['class']

### Let's look at our data

In [None]:
# Show the first five feature rows (five is the default for head()).
X.head()

In [None]:
# Summary statistics for each selected feature column.
X.describe()

### Let's plot the petal length vs petal width

In [None]:
# Raw petal measurements, one point per flower (no class colouring yet).
plt.scatter(X.iloc[:,0], X.iloc[:,1])
# Label the axes so the figure can be read on its own.
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.show()

In [None]:
# Log-transform both features and see what the scatter looks like.
plt.scatter(np.log(X.iloc[:,0]), np.log(X.iloc[:,1]))
# Label the axes so it is clear the values are on a log scale.
plt.xlabel('log(petal-length)')
plt.ylabel('log(petal-width)')
plt.show()

### Encode the labels

In [None]:
# Encode the string class labels as integers.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Read the labels from `data` instead of `y`: `y` is overwritten with a NumPy
# array below, so re-running this cell would otherwise fail on `y.head`.
class_labels = data['class']
print("Labels before Transforming")
print(class_labels.head(5))
y_label_encoder = LabelEncoder()
y = y_label_encoder.fit_transform(class_labels)
print("Labels after Transforming")
print(y[:5])

In [None]:
# Quick look at the data, coloured by encoded class. The palette comes from
# generateColors, so the colours are random and change on every run.
plt.scatter(X['petal-length'], X['petal-width'], c=y, cmap = ListedColormap(generateColors(100)))
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.show()

### Let's split the dataset into training and testing

In [None]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Let's do some feature scaling

In [None]:
# Feature Scaling: the scaler is fitted on the training set only.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
# The test set is transformed with the training-set fit, never refitted.
X_test = sc.transform(X_test)

### Let's see if it made a difference?

In [None]:
# Scaled training set, coloured by class. The palette comes from
# generateColors, so the colours are random and change on every run.
plt.scatter(X_train[:,0], X_train[:,1], c=y_train, cmap = ListedColormap(generateColors(100)))
plt.xlabel('petal-length (scaled)')
plt.ylabel('petal-width (scaled)')
plt.show()

Do you think it made a difference?

### Let's fit a logistic regression to the data and see if we can classify

In [None]:
# Fit a logistic regression model on the scaled training set.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(
    solver='lbfgs',
    random_state=0,
)
classifier.fit(X_train, y_train)

In [None]:
# Predict a class for every row of the scaled test set with the fitted classifier.
y_hat = classifier.predict(X_test)

In [None]:
# Visualising the Testing set results: decision regions plus the test points
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
# One colour per class; built once and shared by the regions and the points.
class_cmap = ListedColormap(('red', 'green'))
# Dense grid (step 0.01) spanning the test points with a margin of 1 per side.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Classify every grid node and shade the plane by predicted class.
grid_pred = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.5, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous and is read as data values when a class has exactly 4 points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = y_label_encoder.classes_[j])
plt.title('Logistic Regression (Testing set) - Iris Dataset')
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.legend()
plt.show()

### Let's now try with KNN

In [None]:
# Fit a 5-nearest-neighbours classifier on the scaled training set.
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

In [None]:
# Visualising the Testing set results: decision regions plus the test points
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
# One colour per class; built once and shared by the regions and the points.
class_cmap = ListedColormap(('red', 'green'))
# Dense grid (step 0.01) spanning the test points with a margin of 1 per side.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Classify every grid node and shade the plane by predicted class.
grid_pred = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.5, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous and is read as data values when a class has exactly 4 points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = y_label_encoder.classes_[j])
plt.title('KNN (Testing set) - Iris Dataset')
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.legend()
plt.show()

### Assignment: Try with decision tree and random forest

In [None]:
from sklearn.tree import DecisionTreeClassifier
# A depth-1 tree makes a single split. random_state is fixed (as in the
# logistic regression cell) so that tied candidate splits are resolved the
# same way on every run and the plot below is reproducible.
classifier = DecisionTreeClassifier(max_depth=1, random_state=0)
classifier.fit(X_train, y_train)

In [None]:
# Visualising the Testing set results: decision regions plus the test points
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
# One colour per class; built once and shared by the regions and the points.
class_cmap = ListedColormap(('red', 'green'))
# Dense grid (step 0.01) spanning the test points with a margin of 1 per side.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Classify every grid node and shade the plane by predicted class.
grid_pred = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.5, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous and is read as data values when a class has exactly 4 points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = y_label_encoder.classes_[j])
plt.title('Decision Tree (Testing set) - Iris Dataset')
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.legend()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Ten depth-1 trees. random_state is fixed (as in the logistic regression
# cell) so the forest, and the plot below, are the same on every run.
classifier = RandomForestClassifier(max_depth=1, n_estimators=10, random_state=0)
classifier.fit(X_train, y_train)

In [None]:
# Visualising the Testing set results: decision regions plus the test points
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
# One colour per class; built once and shared by the regions and the points.
class_cmap = ListedColormap(('red', 'green'))
# Dense grid (step 0.01) spanning the test points with a margin of 1 per side.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Classify every grid node and shade the plane by predicted class.
grid_pred = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.5, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous and is read as data values when a class has exactly 4 points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = y_label_encoder.classes_[j])
plt.title('RandomForestClassifier (Testing set) - Iris Dataset')
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.legend()
plt.show()

# Assignment: try classifying the 3 flower classes

In [None]:
# Reload the Iris dataset from a local copy of the UCI file.
# Remote alternative: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
url = "iris.data"
# Column names are supplied explicitly via `names`.
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
data = pd.read_csv(url, names=names)
# No class is removed this time: every class in the file is kept.

# Features: petal length and petal width (columns 2 and 3), not the sepal columns.
X = data[['petal-length', 'petal-width']]
# Target: the class label (last column).
y = data['class']

### Let's plot the petal length vs petal width

In [None]:
# Raw petal measurements, one point per flower (no class colouring yet).
plt.scatter(X.iloc[:,0], X.iloc[:,1])
# Label the axes so the figure can be read on its own.
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.show()

In [None]:
# Encode the string class labels as integers.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Read the labels from `data` instead of `y`: `y` is overwritten with a NumPy
# array below, so re-running this cell would otherwise fail on `y.head`.
class_labels = data['class']
print("Labels before Transforming")
print(class_labels.head(5))
y_label_encoder = LabelEncoder()
y = y_label_encoder.fit_transform(class_labels)
print("Labels after Transforming")
print(y[:5])

In [None]:
# Quick look at the data, coloured by encoded class. The palette comes from
# generateColors, so the colours are random and change on every run.
plt.scatter(X['petal-length'], X['petal-width'], c=y, cmap = ListedColormap(generateColors(100)))
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.show()

### Let's split the dataset into training and testing

In [None]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

### Let's do some feature scaling

In [None]:
# Feature Scaling: the scaler is fitted on the training set only.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
# The test set is transformed with the training-set fit, never refitted.
X_test = sc.transform(X_test)

### Before we predict we should look at our test data and see if we can classify them

In [None]:
# Scaled test set, coloured by true class. The palette comes from
# generateColors, so the colours are random and change on every run.
plt.scatter(X_test[:,0], X_test[:,1], c=y_test, cmap = ListedColormap(generateColors(100)))
plt.xlabel('petal-length (scaled)')
plt.ylabel('petal-width (scaled)')
plt.show()

### Let's fit a logistic regression to the data and see if we can classify

In [None]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
# Exercise: instantiate the logistic regression classifier, as in the binary
# section above. A working default is filled in so the notebook runs end to
# end. `multi_class` is not passed: 'auto' is already the default, and the
# argument is deprecated in recent scikit-learn releases.
classifier = LogisticRegression(solver='lbfgs')
classifier.fit(X_train, y_train)

In [None]:
# Predict a class for every row of the scaled test set; y_hat is scored
# against y_test in the accuracy cell further down.
y_hat = classifier.predict(X_test)

In [None]:
# Visualising the Testing set results: decision regions plus the test points
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
# One colour per class; built once and shared by the regions and the points.
class_cmap = ListedColormap(('red', 'green', 'blue'))
# Dense grid (step 0.01) spanning the test points with a margin of 1 per side.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Classify every grid node and shade the plane by predicted class.
grid_pred = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.5, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous and is read as data values when a class has exactly 4 points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = y_label_encoder.classes_[j])
plt.title('Logistic Regression (Testing set) 3 classes - Iris Dataset')
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows predicted correctly, expressed as a percentage.
accuracy_pct = accuracy_score(y_test, y_hat) * 100
print('Our accuracy is: {}%'.format(accuracy_pct))

### Try with random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Exercise: try forests with different tree depths and numbers of trees.
# The template's suggested starting point (one tree of depth 1) is filled in
# so the notebook runs end to end; random_state keeps the result reproducible.
classifier = RandomForestClassifier(max_depth=1, n_estimators=1, random_state=0)
classifier.fit(X_train, y_train)

In [None]:
# Predict a class for every row of the scaled test set; y_hat is plotted and
# scored against y_test in the cells further down.
y_hat = classifier.predict(X_test)

In [None]:
# Visualising the Testing set results: decision regions plus the test points
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
# One colour per class; built once and shared by the regions and the points.
class_cmap = ListedColormap(('red', 'green', 'blue'))
# Dense grid (step 0.01) spanning the test points with a margin of 1 per side.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Classify every grid node and shade the plane by predicted class.
grid_pred = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.5, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous and is read as data values when a class has exactly 4 points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = y_label_encoder.classes_[j])
plt.title('Random Forest (Testing set) 3 classes - Iris Dataset')
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.legend()
plt.show()

### Why do you think that a random forest with 1 tree and a depth of 1 gives us this?

In [None]:
# Scaled test set, coloured by the predicted class (y_hat), not the true one.
# The palette comes from generateColors, so the colours are random and change
# on every run.
plt.scatter(X_test[:,0], X_test[:,1], c=y_hat, cmap = ListedColormap(generateColors(100)))
plt.xlabel('petal-length (scaled)')
plt.ylabel('petal-width (scaled)')
plt.show()

### We can check the accuracy score

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows predicted correctly, expressed as a percentage.
accuracy_pct = accuracy_score(y_test, y_hat) * 100
print('Our accuracy is: {}%'.format(accuracy_pct))

In [None]:
# Fit a 5-nearest-neighbours classifier on the scaled training set.
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

In [None]:
# Predict a class for every row of the scaled test set; y_hat is scored
# against y_test in the accuracy cell further down.
y_hat = classifier.predict(X_test)

In [None]:
# Visualising the Testing set results: decision regions plus the test points
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
# One colour per class; built once and shared by the regions and the points.
class_cmap = ListedColormap(('red', 'green', 'blue'))
# Dense grid (step 0.01) spanning the test points with a margin of 1 per side.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Classify every grid node and shade the plane by predicted class.
grid_pred = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.5, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous and is read as data values when a class has exactly 4 points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = y_label_encoder.classes_[j])
plt.title('KNN (Testing set) 3 classes - Iris Dataset')
plt.xlabel('petal-length')
plt.ylabel('petal-width')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows predicted correctly, expressed as a percentage.
accuracy_pct = accuracy_score(y_test, y_hat) * 100
print('Our accuracy is: {}%'.format(accuracy_pct))

# If you have gotten this far... Try using SVM from sklearn... Google is your friend 

# One last cool example: the XOR problem!

In [None]:
# pandas, numpy and matplotlib are already imported in the first cell of the
# notebook, so they are not re-imported here.
# XOR truth table: one row per input pair, y is 1 only when X1 and X2 differ.
xor = pd.DataFrame(
    [(0., 0., 0.), (0., 1., 1.), (1., 0., 1.), (1., 1., 0.)],
    columns=['X1', 'X2', 'y'],
)
xor

In [None]:
# Fit logistic regression on the four XOR rows: the first two columns are the
# inputs (X1, X2) and the third column (y) is the target.
from sklearn.linear_model import LogisticRegression
# `multi_class` is not passed: 'auto' is already the default, and the
# argument is deprecated in recent scikit-learn releases.
classifier = LogisticRegression(solver='lbfgs')
classifier.fit(xor.iloc[:,:2], xor.iloc[:,2])

In [None]:
# Visualising the classifier's decision regions together with the four XOR points
from matplotlib.colors import ListedColormap
X_set, y_set = xor.iloc[:,:2].values, xor.iloc[:,2].values
# One colour per class; built once and shared by the regions and the points.
class_cmap = ListedColormap(('red', 'green'))
# Dense grid (step 0.01) spanning the points with a margin of 1 per side.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Classify every grid node and shade the plane by predicted class.
grid_pred = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.5, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous and is read as data values when a class has exactly 4 points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = j)
# Title and axis labels name the XOR data; the previous text was copied from
# the Iris plots and described the wrong dataset and features.
plt.title('Logistic Regression - XOR problem')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend()
plt.show()

In [None]:
# Predict the class of each of the four XOR input rows with the fitted
# classifier; compare the output with the `y` column of `xor`.
classifier.predict(xor.iloc[:,:2])