# <center>IEE 520: Fall 2019</center>

# <center>Python overview (8/27/19)</center>

## <center>Klim Drobnyh (klim.drobnyh@asu.edu)</center>

## <center>1. Artificial data</center>

### <center>First, let's generate data with 2 classes: the first class is centered around the point (2, 2), and the second one is centered around (-2, 0).</center>

In [None]:
# For compatibility with Python 2
from __future__ import print_function

import numpy as np

# To maintain consistent results
np.random.seed(520)

import matplotlib.pyplot as plt

# To display all the plots inline
%matplotlib inline

In [None]:
# Enlarge the default figure size to 20x10 (matplotlib measures figsize in
# inches) so that the plots below are easier to read
plt.rcParams["figure.figsize"] = (20, 10)

In [None]:
# Each class contributes the same number of points
n = 10000
# Draw every point from a standard normal distribution first;
# rows [0, n) will become class 0 and rows [n, 2n) will become class 1
X = np.random.normal(0, 1.0, (2*n, 2))
# Move class 0 towards its expected mean (2, 2): shift both coordinates
X[:n] += 2
# Move class 1 towards its expected mean (-2, 0): shift the first coordinate only
X[n:, 0] -= 2
# Per-class views of the data, handy for the checks that follow
X_0, X_1 = X[:n], X[n:]
# Labels: n zeros (class 0) followed by n ones (class 1)
Y = np.repeat([0.0, 1.0], n)

### <center>Now, let's check the data: calculate some statistics and plot the data. Does it look correct?</center>

In [None]:
# Per-feature mean first, then per-feature standard deviation;
# each statistic is printed for class 0 and then for class 1
for statistic in (np.mean, np.std):
    for class_points in (X_0, X_1):
        print(statistic(class_points, axis=0))

In [None]:
def plot(X, Y):
    """Scatter-plot two-class 2D data and show the figure.

    Rows of X whose label in Y equals 0 are drawn as red circles and rows
    whose label equals 1 as blue circles; columns 0 and 1 of X are used as
    the horizontal and vertical coordinates.
    """
    # Nearly transparent markers, so that overlapping points add up visually
    opacity = 0.05
    for label, marker in ((0, 'ro'), (1, 'bo')):
        points = X[Y == label, :]
        plt.plot(points[:, 0], points[:, 1], marker, alpha=opacity)
    plt.title('Simulated data')
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.show()


# Scatter plot of the whole data set, followed by a histogram of the labels
plot(X, Y)
plt.hist(Y)
plt.show()

### <center>Now, let's create a simple classifier for that data</center>

#### <center>First, let's divide the data into train and test</center>

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the shuffled samples for testing; the fixed random_state
# makes the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, shuffle=True, random_state=520)

# Same pair of figures (scatter plot, then label histogram) for the
# training part first and for the test part second
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    plot(features, labels)
    plt.hist(labels)
    plt.show()

#### <center>Second, create and fit a classifier. Here we will use a logistic regression</center>

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver and a fixed random_state
classifier = LogisticRegression(random_state=520, solver='liblinear')
# Fit the model on the training part only
classifier.fit(X_train, Y_train)
# Predicted class labels for the test samples
predicted = classifier.predict(X_test)
# predict_proba returns one column per class; column 1 is kept as the
# score of class 1 (the labels in this data set are 0 and 1)
predicted_proba = classifier.predict_proba(X_test)[:, 1]

In [None]:
# Show, in order: the true test labels, the predicted labels and the
# predicted scores
for values in (Y_test, predicted, predicted_proba):
    print(values)

#### <center>Third, we need to evaluate the model</center>

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc

# Confusion matrix of the test predictions (true labels are passed first,
# predictions second); it is visualized in the next cell
cm = confusion_matrix(Y_test, predicted)
# The accuracy score is multiplied by 100 to print it as a percentage
print('Accuracy:', accuracy_score(Y_test, predicted)*100)

In [None]:
import seaborn as sn
plt.figure()
# Draw the confusion matrix as a heatmap; annot=True writes the value of
# every cell into it and fmt='g' is the number format of those annotations
ax = sn.heatmap(cm, annot=True, fmt='g')
# Horizontal axis is labelled as the predicted class, vertical as the true class
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# False positive rates, true positive rates and the matching thresholds,
# computed from the true test labels and the predicted scores
# (thr is not used below)
fpr, tpr, thr = roc_curve(Y_test, predicted_proba)
# Area under the ROC curve
roc_auc = auc(fpr, tpr)

plt.figure()
# The ROC curve itself
plt.plot(fpr, tpr)
plt.title('ROC curve (area = %f)' % roc_auc)
# Diagonal from (0, 0) to (1, 1), drawn as a reference line
plt.plot([0, 1], [0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

## <center>2. Real data. Iris dataset</center>

The Iris flower data set or Fisher's Iris data set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper **"The use of multiple measurements in taxonomic problems"** as an example of linear discriminant analysis.

The data set consists of 50 samples from each of three species of Iris (**Iris setosa**, **Iris virginica** and **Iris versicolor**). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. Based on the combination of these four features, Fisher developed a linear discriminant model to distinguish the species from each other.

### <center>First, let's load the dataset</center>

In [None]:
from sklearn import datasets
# Load the Iris data set bundled with scikit-learn; its 'data', 'target'
# and 'feature_names' entries are used in the cells below
iris = datasets.load_iris()

### <center>Using pandas to write dataset to file and read it again</center>

In [None]:
import pandas as pd

# Build one table: the feature columns followed by a 'target' column that
# holds the class labels (np.c_ appends the label vector as the last column)
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                  columns=iris['feature_names'] + ['target'])
# NOTE: the former `df.style.hide_index()` call was dropped. Its result was
# discarded, so it never affected the output, and Styler.hide_index no longer
# exists in pandas 2.x, where the call raises AttributeError.
# Preview the first five rows
print(df.head())

In [None]:
# Save data to file; with the default settings the row index is written as
# well, as the first column of the CSV
df.to_csv('data_iris.csv')

In [None]:
# Read the file back; index_col=0 uses the first CSV column as the row
# index instead of treating it as a data column
df_loaded = pd.read_csv('data_iris.csv', index_col=0)
# Preview the first five rows
print(df_loaded[:5])

### <center>Visualizing the data</center>

In [None]:
# Class labels of all samples
Y = df_loaded['target']
# Columns 0 and 1 of the table as a plain array: the two features to plot
X = df_loaded.values[:, 0:2]

In [None]:
def plot_iris(X, Y, xlabel=None, ylabel=None):
    """Scatter-plot two Iris features, one color per class, and show the figure.

    Parameters:
        X: 2D array; columns 0 and 1 are used as the horizontal and vertical
            coordinates.
        Y: class labels aligned with the rows of X. Rows labelled 0, 1 and 2
            are drawn as red, blue and green circles respectively.
        xlabel, ylabel: optional axis labels. When omitted, the names of
            columns 0 and 1 of the global ``df`` are used, which is what this
            function did before the parameters existed. Passing them lets the
            same function plot any other pair of features.
    """
    # Resolve the defaults at call time, so the global df is only needed
    # when the caller did not provide labels
    if xlabel is None:
        xlabel = df.columns[0]
    if ylabel is None:
        ylabel = df.columns[1]
    alpha = 0.55
    for label, marker in ((0, 'ro'), (1, 'bo'), (2, 'go')):
        points = X[Y == label, :]
        plt.plot(points[:, 0], points[:, 1], marker, alpha=alpha)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title('IRIS dataset')
    plt.show()


plot_iris(X, Y)

In [None]:
# Same labels as before, now with columns 2 and 3 of the table as features
Y = df_loaded['target']
X = df_loaded.values[:, 2:4]


def plot_iris(X, Y, xlabel=None, ylabel=None):
    """Scatter-plot two Iris features, one color per class, and show the figure.

    Parameters:
        X: 2D array; columns 0 and 1 are used as the horizontal and vertical
            coordinates.
        Y: class labels aligned with the rows of X. Rows labelled 0, 1 and 2
            are drawn as red, blue and green circles respectively.
        xlabel, ylabel: optional axis labels. When omitted, the names of
            columns 2 and 3 of the global ``df`` are used, which is what this
            function did before the parameters existed. Passing them lets the
            same function plot any other pair of features.
    """
    # Resolve the defaults at call time, so the global df is only needed
    # when the caller did not provide labels
    if xlabel is None:
        xlabel = df.columns[2]
    if ylabel is None:
        ylabel = df.columns[3]
    alpha = 0.55
    for label, marker in ((0, 'ro'), (1, 'bo'), (2, 'go')):
        points = X[Y == label, :]
        plt.plot(points[:, 0], points[:, 1], marker, alpha=alpha)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title('IRIS dataset')
    plt.show()


plot_iris(X, Y)

### <center>Splitting into train and test</center>

In [None]:
# All columns except the last one are the features
# (taken from the in-memory df, not from the reloaded df_loaded)
X = df.values[:, :-1]
# The last column is the 'target' column, i.e. the class labels
Y = df.values[:, -1]

In [None]:
# Hold out 10% of the shuffled samples for testing, with a fixed random_state
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, shuffle=True, random_state=520)
# Label histograms, one figure each: all data, training part, test part
for labels in (Y, Y_train, Y_test):
    plt.hist(labels)
    plt.show()

### <center>Training the classifier</center>

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with its default settings
classifier = GaussianNB()
# Fit the model on the training part only
classifier.fit(X_train, Y_train)
# Predicted class labels for the test samples
predicted = classifier.predict(X_test)
# NOTE: the Iris labels take three values, so column 1 of predict_proba is
# the probability of a single class only; predicted_proba is not used in the
# test-set evaluation cells that follow
predicted_proba = classifier.predict_proba(X_test)[:, 1]

### <center>Evaluating the model</center>

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the test predictions (true labels are passed first,
# predictions second): one row per true class
cm = confusion_matrix(Y_test, predicted)
# To normalize the confusion matrix, divide every row by its own total, so
# that each row shows how the samples of one true class were distributed.
# keepdims=True keeps the row totals as a column vector; without it NumPy
# broadcasting would divide column j by the total of row j instead.
cm = cm.astype('float') / cm.sum(axis=1, keepdims=True)
# The accuracy score is multiplied by 100 to print it as a percentage
print('Accuracy:', accuracy_score(Y_test, predicted)*100)

In [None]:
# Visualize the confusion matrix computed in the previous cell as an
# annotated heatmap (annot=True writes the value of every cell into it)
import seaborn as sn
plt.figure()
ax = sn.heatmap(cm, annot=True, fmt='g')
# Horizontal axis is labelled as the predicted class, vertical as the true class
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Repeat the evaluation on the training data, to compare it with the
# test-set results
predicted = classifier.predict(X_train)
# Column 1 of predict_proba only (a single class); not used below
predicted_proba = classifier.predict_proba(X_train)[:, 1]
from sklearn.metrics import accuracy_score, confusion_matrix

# One row per true class (true labels are passed first, predictions second)
cm = confusion_matrix(Y_train, predicted)
# To normalize the confusion matrix, divide every row by its own total, so
# that each row shows how the samples of one true class were distributed.
# keepdims=True keeps the row totals as a column vector; without it NumPy
# broadcasting would divide column j by the total of row j instead.
cm = cm.astype('float') / cm.sum(axis=1, keepdims=True)
# The accuracy score is multiplied by 100 to print it as a percentage
print('Accuracy:', accuracy_score(Y_train, predicted)*100)
# Visualize the normalized confusion matrix as an annotated heatmap
import seaborn as sn
plt.figure()
ax = sn.heatmap(cm, annot=True, fmt='g')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()