In [None]:
pip install plotly

In [None]:
pip install plotly_express

In [None]:
pip install sklearn

In [None]:
# Bar chart of the fitted forest's feature importances
def plt_feature_importance():
    """Build a bar chart of feature importances, largest first.

    Reads the notebook-level ``rfc`` (its ``feature_importances_``) and
    ``feat_col`` (used as the index), expresses each importance as a
    percentage and sorts in descending order.

    Returns:
        The figure created by ``px.bar``; the caller decides when to show it.
    """
    importance_pct = pd.Series(rfc.feature_importances_, index=feat_col) * 100
    feature_imp = importance_pct.sort_values(ascending=False)
    axis_titles = {'x':'Features','y':'Importance Percentage %'}
    return px.bar(feature_imp,
                  x=feature_imp.index,
                  y=feature_imp.values,
                  labels=axis_titles)

In [None]:
# Accuracy of the predicted test classes, as a percentage
def get_acc(y_test,prediction):
    """Return ``metrics.accuracy_score(y_test, prediction)`` multiplied by 100.

    Args:
        y_test: true labels.
        prediction: predicted labels, in the same order as ``y_test``.
    """
    return metrics.accuracy_score(y_test, prediction) * 100

In [None]:
# Min-max scale the feature matrix
def scale_data(X):
    """Rescale ``X`` with a default-configured ``MinMaxScaler`` fitted on ``X``.

    Args:
        X: feature table/array to scale.

    Returns:
        The transformed data as returned by the scaler.
    """
    from sklearn.preprocessing import MinMaxScaler
    return MinMaxScaler().fit_transform(X)

In [None]:
# Plot algorithm decision boundaries
def plot_boundaries(iris,algo):
    """Fit a classifier on two iris features and draw its decision regions.

    The model is trained on the standardized ``SepalWidthCm`` and
    ``PetalLengthCm`` columns with the label-encoded ``Species`` column as
    target. Its predictions over a dense grid are drawn as a heatmap with the
    training points plotted on top. The figure is displayed with
    ``py.iplot``; nothing is returned.

    Relies on the notebook-level names ``np``, ``go`` and ``py``.

    Args:
        iris: DataFrame providing the ``SepalWidthCm``, ``PetalLengthCm`` and
            ``Species`` columns.
        algo: ``'rfst'`` draws two random forests side by side (a shallow and
            a deep one), ``'svm'`` draws a ``LinearSVC``; any other value
            draws a decision tree.
    """
    from plotly import subplots
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    shallow_depth, deep_depth = 5, 100  # max_depth of the two forests compared for 'rfst'
    h = .02  # step size in the mesh

    X = StandardScaler().fit_transform(iris[['SepalWidthCm','PetalLengthCm']])
    # A local encoder keeps this function from refitting the notebook-level `le`.
    labels = LabelEncoder().fit_transform(iris.Species)

    # Classifiers are imported here so the function does not depend on which
    # notebook cells have already been executed.
    if algo=='rfst':
        from sklearn.ensemble import RandomForestClassifier
        models = [RandomForestClassifier(max_depth=depth,
                                         n_estimators=10,
                                         random_state=0).fit(X, labels)
                  for depth in (shallow_depth, deep_depth)]
        # Titles are derived from the depths so they always match the fitted models.
        titles = ("Random Forest (Depth = %d)" % shallow_depth,
                  "Random Forest (Depth = %d)" % deep_depth)
    elif algo=='svm':
        from sklearn.svm import LinearSVC
        models = [LinearSVC(penalty='l2', loss='squared_hinge',
                            dual=True, tol=0.001, C=1, multi_class='ovr',
                            fit_intercept=True, intercept_scaling=1,
                            class_weight=None, verbose=0,
                            random_state=0, max_iter=1000).fit(X, labels)]
        titles = ("Support vector Machine", "")
    else:
        # Any other value falls back to a decision tree and is titled as such.
        from sklearn.tree import DecisionTreeClassifier
        models = [DecisionTreeClassifier(random_state=0).fit(X, labels)]
        titles = ("Decision Tree", "")

    # Evaluation grid covering the data with a margin of 1 on every side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    grid_x = np.arange(x_min, x_max, h)
    grid_y = np.arange(y_min, y_max, h)
    xx, yy = np.meshgrid(grid_x, grid_y)
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    fig = subplots.make_subplots(rows=1, cols=2, subplot_titles=titles)

    # One panel per fitted model: predicted regions first, training points on top.
    for col, model in enumerate(models, start=1):
        Z = model.predict(grid_points).reshape(xx.shape)

        regions = go.Heatmap(x=grid_x, y=grid_y, z=Z,
                             colorscale='Viridis',
                             showscale=False)

        points = go.Scatter(x=X[:, 0], y=X[:, 1],
                            mode='markers',
                            showlegend=True,
                            marker=dict(size=10,
                                        color=labels,
                                        colorscale='Viridis',
                                        line=dict(color='black', width=1))
                            )

        fig.add_trace(regions, row=1, col=col)
        fig.add_trace(points, row=1, col=col)

    # Hide grid lines and ticks on both subplot axes.
    hidden_axis = dict(showgrid=False,
                       zeroline=False,
                       showticklabels=False,
                       ticks='',
                       autorange=True)
    for suffix in ('1', '2'):
        fig['layout']['xaxis' + suffix].update(**hidden_axis)
        fig['layout']['yaxis' + suffix].update(**hidden_axis)

    py.iplot(fig)

In [None]:
# plot confusion matrices for any given algorithm
def confusion_matrix(y_test,prediction,algo,acc):
    """Draw an annotated Actual-vs-Predicted heatmap and return its figure.

    Each call draws on its own new figure, so calling the function several
    times in one cell does not overlay the heatmaps on the same axes.

    Args:
        y_test: true labels.
        prediction: predicted labels, in the same order as ``y_test``.
        algo: ``'svm'``, ``'dt'`` or ``'rf'`` selects the plot title; any
            other value leaves the plot untitled.
        acc: accuracy value appended to the title followed by ``%``.

    Returns:
        The matplotlib figure holding the heatmap.
    """
    algo_names = {
        'svm': "Support Vector Machine",
        'dt': "Decision Tree",
        'rf': "Random Forest",
    }

    df = pd.DataFrame({'Actual':y_test,'Predicted':prediction},
                      columns=['Actual','Predicted'])
    # Rows are the actual classes, columns the predicted ones.
    counts = pd.crosstab(df['Actual'], df['Predicted'],
                         rownames=['Actual'], colnames=['Predicted'])

    fig, ax = plt.subplots()
    sn.heatmap(counts, annot=True, cbar=False, ax=ax)
    if algo in algo_names:
        ax.set_title(algo_names[algo]+" with Accuracy="+str(acc)+"%")
    return fig

# <U>The Iris Flower</U>

<img src="irisall.png">

The Iris flower has 3 different species:
* Iris-virginica
* Iris-versicolor
* Iris-setosa

We need to find a way to tell these Iris species apart. We have data on the physical dimensions of each flower: the length and width of the sepals (the leaf-like parts below the petals) and of the petals. Using these measurements, we need to determine which species each flower belongs to.

Let's get an insight over the dataset

In [None]:
import pandas as pd
iris = pd.read_csv('Iris.csv')
iris.sample(frac=0.04)

<code>As we can see in the above table, the four columns after the Id hold the features of the flower, and the last column holds the species of the flower that we need to predict.</code>

### How do we solve this ?

Let's import all the libraries that we might require to solve this problem.

In [None]:
import plotly_express as px
import numpy as np
from plotly import graph_objects as go
import sklearn as skl
import plotly.offline as py
import seaborn as sn
from sklearn import metrics
import matplotlib.pyplot as plt

# 1.  Statistical Analysis

In [None]:
iris.describe()

# Visualize the dataset

Columns of the iris dataset

In [None]:
iris.columns

Go ahead and try changing the x, y, z values down below with any of the feature names.

In [None]:
px.scatter_3d(iris,x='SepalLengthCm',y='SepalWidthCm',z='PetalWidthCm',color='Species')

## 2. <U>Statistical Method</U>

Let's separate the labels from the features.

In [None]:
y = iris.Species
X = iris.drop(columns=['Id','Species'])

Here, X is the set of features and y is the target variable that we will predict.

We split the Data into two parts, two thirds of the data will be used in training and the rest one third for testing the model.

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Separate out the different species

In [None]:
versicolor = y_train=="Iris-versicolor"
setosa = y_train=="Iris-setosa"
virginica = y_train=="Iris-virginica"
df_versicolor = X_train[versicolor]
df_setosa = X_train[setosa]
df_virginica = X_train[virginica]

Let us find out the mean and standard deviation of the lengths and widths of the petal and sepal for each species of the flower.

In [None]:
df_setosa.describe().iloc[1:3]

In [None]:
df_versicolor.describe().iloc[1:3]

In [None]:
df_virginica.describe().iloc[1:3]

Using the above information, we set a limit range (generally mean ± standard deviation) for the various features.

In [None]:
# Apply the hard thresholds to every test row, in row order.
statpred = []
for petal_length, petal_width in zip(X_test.PetalLengthCm, X_test.PetalWidthCm):
    if petal_width <= 0.35:
        label = "Iris-setosa"
    elif petal_length <= 4.7 and petal_width <= 1.51:
        label = "Iris-versicolor"
    else:
        # Everything outside the two ranges above is treated as virginica.
        label = "Iris-virginica"
    statpred.append(label)

In the statistical approach, we predict the labels with a simple if-else condition.

In [None]:
# Count the test rows whose rule-based prediction differs from the true label.
c = sum(1 for position, predicted in enumerate(statpred)
        if y_test.iloc[position] != predicted)

### 2.1 Accuracy of Statistic Model

In [None]:
compare = pd.DataFrame(data = y_test.reset_index(drop=True))
compare['Predicted'] = statpred
compare['Correct?'] = compare.Species==compare.Predicted
compare.head(10)

In [None]:
# `c` counts errors over the predicted (test) rows only, so the accuracy must
# be taken over that same number of rows rather than over the whole dataset.
n_predicted = len(statpred)
acur = (n_predicted-c)*100/n_predicted
print(np.round(acur,2),"%")

## How can we still improve this ? 

Let us try the Machine Learning Approach

<img src='machinelearning.png' height="50" width="500">

# 3. Data Pre-processing

To apply any Machine learning algorithm to our data, we must prepare our data accordingly.

For the model to be built, our data must be standardized and normalized.

Change the labels from string to integer

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y = le.fit_transform(iris.Species)
X = iris.drop(columns=['Id','Species'])

You can see below how the species of the flower are converted to integers.

In [None]:
display(y)

Collect all feature columns being used

In [None]:
feat_col = X.columns

Scale the data to be in a fixed range

In [None]:
X = scale_data(X)

## 4. <U>Train Test Split</U>

We split the data in two parts, one would be used to train our model and the other to test our model's prediction.

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## 5. <U>Support Vector Machine</U> (SVM)

A SVM classifies the data by drawing hyperplanes in the available dimensional space.

<img src="svm.png"/>

#### 5.1 Train Model

In [None]:
from sklearn.svm import LinearSVC

svm = LinearSVC(penalty='l2', loss='squared_hinge',
                dual=True, tol=0.001, C=10,multi_class='ovr',
                fit_intercept=True, intercept_scaling=1, class_weight=None,verbose=0
                , random_state=0, max_iter=1000)
svm.fit(X_train,y_train)

#### 5.2 Predict

In [None]:
svmpred = svm.predict(X_test)

#### 5.3 Accuracy

In [None]:
accsvm = get_acc(y_test,svmpred)
print(accsvm,'%')

#### 5.4 Confusion Matrix

In [None]:
cmsvm = confusion_matrix(y_test,svmpred,'svm',accsvm)

#### 5.5 Decision Boundaries

In [None]:
plot_boundaries(iris,'svm')

## 6. <U>Decision Tree Classification</U>

<img src='dt.jpg'>

Decision tree is a Supervised Machine Learning technique where the data is continuously split according to a certain parameter.

#### 6.1 Train the model

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=0)
dtmodel = dtc.fit(X_train,y_train)

#### 6.2 Predict

In [None]:
prediction = dtmodel.predict(X_test)

In [None]:
prediction

The predictions are made in numbers as the species were assigned. But they can be converted back to their labels.

In [None]:
dtpreds = le.inverse_transform(prediction)
display(dtpreds)

#### 6.3 Accuracy

In [None]:
accdt = get_acc(y_test,prediction)
print(accdt,'%')

#### 6.4 Confusion Matrix

In [None]:
cmdt = confusion_matrix(y_test,prediction,"dt",accdt)

#### 6.5 Decision Boundaries

In [None]:
plot_boundaries(iris,'dt')

## 7. <U>Random Forest Classification</U>

Random forest, like its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction and the class with the most votes becomes our model’s prediction.

<img src='pic1.jpeg'/>

<U><code>The reason for this wonderful effect is that the trees protect each other from their individual errors.</code></U>

#### 7.1 Train Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=100, random_state=0)
rfc.fit(X_train, y_train)

#### 7.2 Predict

In [None]:
preds = rfc.predict(X_test)
display(preds)

#### 7.3 Accuracy

In [None]:
accrf = get_acc(y_test,preds)
print(accrf,'%')

#### 7.4 Confusion Matrix

In [None]:
cmrf = confusion_matrix(y_test,preds,'rf',accrf)

#### 7.5 Decision Boundaries

In [None]:
plot_boundaries(iris,'rfst')

## 8. <U>Feature Importances</U>

Features that have been referred to the most to predict the class.

In [None]:
fig = plt_feature_importance()
fig.show()

## 9. <U>Conclusion</U>

In [None]:
display(cmsvm,cmrf,cmdt)

## 10. <U>Feature adding</U>

Let us add two new features to our dataset. We can calculate the area (length x breadth) of sepal and petal and add them as new features.

In [None]:
iris['SepalArea'] = iris['SepalLengthCm']*iris['SepalWidthCm']
iris['PetalArea'] = iris['PetalLengthCm']*iris['PetalWidthCm']

In [None]:
iris.sample(frac=0.035)

Now let's try running the models again.

In [None]:
y = le.fit_transform(iris.Species)
X = iris.drop(columns=['Id','Species'])
feat_col = X.columns
X = scale_data(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Support Vector Machine (SVM)

In [None]:
svm.fit(X_train,y_train)
new_svm = svm.predict(X_test)

In [None]:
svmacc = get_acc(y_test,new_svm)
print("Previous Accuracy: ",accsvm,"%")
print("New Accuracy: ",svmacc,"%")

In [None]:
new_cmsvm = confusion_matrix(y_test,new_svm,"svm",svmacc)

### Decision Tree

In [None]:
dtc.fit(X_train,y_train)
new_dtree = dtc.predict(X_test)

In [None]:
dtacc = get_acc(y_test,new_dtree)
print("Previous Accuracy: ",accdt,"%")
print("New Accuracy: ",dtacc,"%")

In [None]:
new_cmdt = confusion_matrix(y_test,new_dtree,"dt",dtacc)

### Random Forest

In [None]:
rfc.fit(X_train, y_train)
new_rfst = rfc.predict(X_test)

In [None]:
rfacc = get_acc(y_test,new_rfst)
print("Previous Accuracy: ",accrf,"%")
print("New Accuracy: ",rfacc,"%")

In [None]:
new_cmrf = confusion_matrix(y_test,new_rfst,"rf",rfacc)

## New Feature Importances

In [None]:
fig = plt_feature_importance()
fig.show()