# Iris Dataset

### Load Libraries

In [None]:
from pandas import read_csv
from pandas.plotting import scatter_matrix
from numpy import mean
from numpy import std
import matplotlib.pyplot as plt
from IPython.display import Image, display
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.decomposition import PCA
import time
import pickle

### Load Dataset

In [None]:
# column labels handed to read_csv, in the order the columns appear in the file
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
full_path = 'iris.csv'
dataframe = read_csv(full_path, names=names)

# (template, value) pairs printed in order: shape, first 20 rows,
# describe() statistics, and the number of rows per class
summaries = (
    ("rows and columns of imported data: {}\n", dataframe.shape),
    ("first 20 entries of data:\n{}\n", dataframe.head(20)),
    ("some extra information on the data:\n{}\n", dataframe.describe()),
    ("class distribution:\n{}\n", dataframe.groupby('class').size()),
)
for template, summary in summaries:
    print(template.format(summary))

### Visualize the Data

In [None]:
# show the reference picture stored next to the notebook (iris_features.png)
iris_picture = Image(filename='iris_features.png')
display(iris_picture)

# one histogram per numeric column
dataframe.hist()
plt.show()

# pairwise scatter plots of the columns, drawn as a separate figure
scatter_matrix(dataframe)
plt.show()

### Compare Algorithms

In [None]:
# Hold out 20% of the rows for the final check; the remaining 80% is used
# for cross-validated model comparison below.
array = dataframe.values
X = array[:, :4]  # first four columns are the inputs
y = array[:, 4]   # fifth column is the label
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.20, random_state=1, shuffle=True
)

# Candidate classifiers as (display name, estimator) pairs
models = [
    ('Logistic Regression', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('SVM', SVC(gamma='auto')),
]

# Score every candidate with seeded, shuffled 10-fold stratified CV on the
# training split and report mean accuracy followed by its standard deviation.
for label, estimator in models:
    splitter = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    fold_scores = cross_val_score(
        estimator, X_train, Y_train, cv=splitter, scoring='accuracy'
    )
    print('{:20} training results:{:.3f} {:.3f}'.format(label, fold_scores.mean(), fold_scores.std()))


### Evaluate Using Single Algorithm

In [None]:
# distinct labels found in the class column, sorted; used as axis labels below
class_names = sorted(set(array[:,4]))

# fit the chosen classifier on the training split, then score the held-out split
model = SVC(gamma='auto')
model.fit(X_train, Y_train)
test_predictions = model.predict(X_test)
print(accuracy_score(Y_test, test_predictions))

# confusion matrix
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 (ConfusionMatrixDisplay.from_estimator is the replacement) —
# confirm which scikit-learn version this notebook targets.
plot_confusion_matrix(
    model,
    X_test,
    Y_test,
    display_labels=class_names,
    cmap=plt.cm.Blues,
)

### Save Trained Model to File

In [None]:
# serialise the fitted classifier with pickle; the file is opened in binary
# write mode because pickle produces bytes
model_filename = "sklearn_model.pkl"
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

### Import Trained Model from File

In [None]:
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a source you trust.
with open("sklearn_model.pkl", 'rb') as model_file:
    saved_model = pickle.load(model_file)

# score the restored classifier on the held-out split
test_predictions = saved_model.predict(X_test)
restored_accuracy = accuracy_score(Y_test, test_predictions)
print(restored_accuracy)

# Spam Filtering

### Visualize the Data

In [None]:
# location of the spam data; it is read with header=None, i.e. the first
# line of the file is treated as data rather than column names
full_path = 'spambase.csv'
dataframe = read_csv(full_path, header=None)

# dataset dimensions
print("rows and columns of imported data: {}\n".format(dataframe.shape))
# first 5 rows
print("first 5 entries of data:\n{}\n".format(dataframe.head(5)))

# class balance: count the values of the last column (used as the label)
# and report each one's share of all rows
target = dataframe.values[:,-1]
counter = Counter(target)
total_rows = len(target)
for label, count in counter.items():
    share = count / total_rows * 100
    print('Class=%d, Count=%d, Percentage=%.3f%%' % (label, count, share))

### Define Functions to Keep Things Clean

In [None]:
def load_dataset(full_path):
    """Read a header-less CSV file and split it into inputs and target.

    Args:
        full_path: Path of the CSV file; its first line is read as data.

    Returns:
        A pair ``(X, y)`` of numpy arrays: ``X`` holds every column except
        the last, ``y`` holds the last column.
    """
    values = read_csv(full_path, header=None).values
    # everything but the final column is input; the final column is the output
    return values[:, :-1], values[:, -1]

def get_models():
    """Build the classifiers to compare, together with their display names.

    Returns:
        A pair ``(models, names)`` of equal-length lists, where ``names[i]``
        is the label for ``models[i]``.
    """
    # KNN is the only entry wrapped in a pipeline: a StandardScaler step
    # runs before the classifier
    scaled_knn = Pipeline(steps=[('s', StandardScaler()), ('m', KNeighborsClassifier())])
    catalogue = [
        ('KNN', scaled_knn),
        ('CART', DecisionTreeClassifier()),
        ('Random Forest', RandomForestClassifier(n_estimators=100)),
        ('Logistic Regression', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ]
    names = [label for label, _ in catalogue]
    models = [estimator for _, estimator in catalogue]
    return models, names

def evaluate_model(X, y, model):
    """Cross-validate ``model`` on ``(X, y)`` and return the per-fold accuracies.

    Args:
        X: Input features.
        y: Target labels.
        model: Estimator passed to ``cross_val_score``.

    Returns:
        The array of accuracy scores produced by ``cross_val_score``, one per
        fold of the 10-fold split.
    """
    # shuffled, stratified 10-fold split with a fixed seed
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    # n_jobs=-1 asks for all available processors
    return cross_val_score(model, X, y, scoring='accuracy', cv=folds, n_jobs=-1)

def timer(start, end):
    """Format the interval between two timestamps as ``HH:MM:SS.ss``.

    Args:
        start: Start time in seconds.
        end: End time in seconds.

    Returns:
        The elapsed time ``end - start`` as a string with zero-padded hours
        and minutes and seconds shown to two decimal places.
    """
    # Round to the displayed precision *before* splitting into fields, so a
    # remainder such as 59.999 s carries into the minutes (and hours) instead
    # of being printed as "60.00".
    elapsed = round(end - start, 2)
    hours, rem = divmod(elapsed, 3600)
    minutes, seconds = divmod(rem, 60)
    return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)

### Try Different Models on Dataset

In [None]:
# load the dataset
X, y = load_dataset(full_path)
# define models (two parallel lists: estimators and their display names)
models, names = get_models()

# Cross-validate each model and report mean accuracy, standard deviation and
# elapsed time. The loop variables are deliberately not called `model`, so a
# notebook-level `model` from an earlier cell is left untouched.
for model_name, candidate in zip(names, models):
    # perf_counter is monotonic, so the measured duration is unaffected by
    # system clock adjustments
    start = time.perf_counter()
    scores = evaluate_model(X, y, candidate)
    end = time.perf_counter()
    print('{:20} mean:{:.3f} std dev:({:.3f}) time elapsed:{}'.format(model_name, mean(scores), std(scores), timer(start, end)))

### Use PCA to Reduce Dimensions

In [None]:
# reduce dimensions to 25
# NOTE(review): PCA is fitted on the full dataset before cross-validation, so
# every fold's projection has seen the held-out rows; wrap PCA and the model
# in a Pipeline if leakage-free scores are needed.
X_reduced = PCA(n_components=25).fit_transform(X)

# Cross-validate each model on the reduced inputs and report mean accuracy,
# standard deviation and elapsed time. The loop variables are deliberately not
# called `model`, so a notebook-level `model` from an earlier cell is left
# untouched.
for model_name, candidate in zip(names, models):
    # perf_counter is monotonic, so the measured duration is unaffected by
    # system clock adjustments
    start = time.perf_counter()
    scores = evaluate_model(X_reduced, y, candidate)
    end = time.perf_counter()
    print('{:20} mean:{:.3f} std dev:({:.3f}) time elapsed:{}'.format(model_name, mean(scores), std(scores), timer(start, end)))

### Other Algorithms in Sklearn

https://scikit-learn.org/stable/supervised_learning.html

### UCI Datasets

https://archive.ics.uci.edu/ml/datasets.php