## Machine Learning on Datasets

In [1]:
#Importing libraries

#there are a bunch of sun-libraries that will also be imported over here instead
#of being imported randomly throughout the cooding experience
import pandas as pd # type: ignore
import numpy as np # type: ignore
import seaborn as sns # type: ignore
from matplotlib import pyplot as plt # type: ignore 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from ipywidgets import interact
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
import ipywidgets as widgets
from sklearn import linear_model
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

## Reading the data

In [2]:
data=pd.read_csv('IRIS.csv')

Doing basic functions on the data

In [3]:
display(data.describe())
#gives basic understanding of the dataset

display(data.info())
#NOTE: there exists 2 columns
#1) ID column that needs to be dropped because of its increasing numeric value
#2) Species column that contains object which should be label encoded

display(data.size)
#displays the total amount entries in every column included.

display(data.shape)
#tells us the division of rows and columns of the table
#i.e. 6 features with 150 rows of entries

display(data["Species"].value_counts())
#displays all the values on different species

display(data.isna().sum())
#checks the total number of NaN values
#consists 0 therefore no cleaning required

display(data.isnull().sum())
#checks the total number of NULL values
#consists 0 therefore no cleaning required

display(data.duplicated().sum())
#checks for number of duplicates in the dataset

display(data.drop_duplicates(inplace=True))
#removes that duplicate from the dataset

Unnamed: 0,ID,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.057333,3.758,1.199333
std,43.445368,0.828066,0.435866,1.765298,0.762238
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            150 non-null    int64  
 1   Sepal.Length  150 non-null    float64
 2   Sepal.Width   150 non-null    float64
 3   Petal.Length  150 non-null    float64
 4   Petal.Width   150 non-null    float64
 5   Species       150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


None

900

(150, 6)

Species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

ID              0
Sepal.Length    0
Sepal.Width     0
Petal.Length    0
Petal.Width     0
Species         0
dtype: int64

ID              0
Sepal.Length    0
Sepal.Width     0
Petal.Length    0
Petal.Width     0
Species         0
dtype: int64

np.int64(0)

None

# Data Preprocessing

Removing Unnecessary columns

In [4]:
columns_to_drop = ['ID']  # These columns have no variance
data = data.drop(columns=columns_to_drop)

Outliers Removal

In [5]:
def remove_outliers(data):
    df_clean = data.copy()
    initial_rows = len(df_clean)
    
    for col in df_clean.select_dtypes(exclude=['object']).columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        df_clean = df_clean[(df_clean[col] >= Q1 - 1.5 * IQR) & 
                           (df_clean[col] <= Q3 + 1.5 * IQR)]
    
    removed = initial_rows - len(df_clean)
    display(f"Removed {removed} outliers ({removed/initial_rows:.2%} of data)")
    return df_clean
data=remove_outliers(data)

'Removed 4 outliers (2.67% of data)'

Feature Sepration

In [6]:
numeric_features = data.select_dtypes(exclude=['object']).columns
categorical_features = data.select_dtypes(include=['object']).columns

print("\nNumeric Features:", list(numeric_features))
print("Categorical Features:", list(categorical_features))


Numeric Features: ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']
Categorical Features: ['Species']


Encoding

In [7]:
numeric_data=data.select_dtypes(exclude=['object'])
categorical_feature=data.select_dtypes(include=['object'])

encoder=LabelEncoder()
for col in categorical_feature.columns:
    data[col]=encoder.fit_transform(categorical_feature[col])

Scalling

In [8]:
from sklearn.preprocessing import StandardScaler
def scaled_data(data):
    data_copy = data.copy()
    scaler = StandardScaler()
    data_copy[numeric_features] = scaler.fit_transform(data_copy[numeric_features]) 
    return data_copy
data_scaled = scaled_data(data)

# Applying Regression

<img src="Diagrams/Logistic Regression.png" width="50%">

Linear Regression

In [None]:
#importing data
X=data[['Sepal.Length','Sepal.Width','Petal.Length']].values
Y=data[['Petal.Width']]

#choosing min, max of each column
min_sl, max_sl = X[:, 0].min(), X[:, 0].max()
min_sw, max_sw = X[:, 1].min(), X[:, 1].max()
min_pl, max_pl = X[:, 2].min(), X[:, 2].max()

#model defining and fitting
model=LinearRegression()
model.fit(X,Y)

#function defination for input and display of petal width
def Petal_Width(Sl, Sw, Pl):
    prediction = model.predict([[Sl, Sw, Pl]])
    if(prediction<0):
        print("Vaues give Negative Value. Try a realistic Value")
    else:
        print(f'A flower with {Sl} Sepal Length, {Sw} Sepal Width, {Pl} Petal Length will have Petal Width: {float(prediction[0]):.2f}')

#using libraries and in built function in the library to display the outcome
i = interact(Petal_Width, Sl=(min_sl,max_sl,0.1),
                          Sw=(min_sw,max_sw,0.1),
                          Pl=(min_pl,max_pl,0.1)
            )
display(i)

#there is a resudal error in each linear regression problem.
print ('Residual sum of squares: %.2f' % np.mean((model.predict(X)- Y) ** 2))

interactive(children=(FloatSlider(value=6.0, description='Sl', max=7.9, min=4.3), FloatSlider(value=3.0, descr…

<function __main__.Petal_Width(Sl, Sw, Pl)>

Residual sum of squares: 0.04


Logistic Regression

In [10]:
def Regularization_Logistic(Regu,type):

    #importing data
    X = data[['Sepal.Length','Sepal.Width']].values #only 2 values because we are plotting 2D
    Y = data[['Species']].values

    h = .02  #smoothness of graph

    if type == 'l1':
        logreg = linear_model.LogisticRegression(C=Regu, penalty=type, solver='liblinear')#solver best for l1
    else:
        logreg = linear_model.LogisticRegression(C=Regu, penalty=type, solver='lbfgs')#solver best for l2
    
    #training data
    logreg.fit(X, Y)

    #addition and substraction of stuff to center the points of the graph
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

    #creating the underlying graph
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    #coloring the square based on what we think the type of flower should be there
    Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    #plotting every color in the plot, coloring to show different flowers
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # Plotting data point with color 
    plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)

    #Labels
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')

    #fitting the points
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())

    plt.show()

    #cunfusion matrix
    expected = Y
    predicted = logreg.predict(X)
    # summarize the fit of the model
    print(metrics.confusion_matrix(expected, predicted))

In [11]:
i = interact(Regularization_Logistic, Regu=(0.01,100,0.01), type=widgets.Dropdown(options=['l1', 'l2']))

interactive(children=(FloatSlider(value=50.0, description='Regu', min=0.01, step=0.01), Dropdown(description='…

# Naive bayes

<img src="Diagrams/Naive Bayes.png" width="50%">

In [12]:
def Naive_bayes(Model_Type):
        #importing data
        X = data[['Sepal.Length','Sepal.Width']].values #only 2 values because we are plotting 2D
        Y = data[['Species']].values        
        h = .2  #smoothing of graph

        #calling of different logistic regression models
        if(Model_Type=='Gaussian'):
            model =  GaussianNB()
        elif (Model_Type=='Multinomial'):
                model =  MultinomialNB()                         
        else:
                model =  BernoulliNB()  

        #fitting the model      
        model.fit(X, Y)

        #plotting the decision boundries
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.figure(1, figsize=(4, 3))
        plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

        # Plot also the training points
        plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
        plt.xlabel('Sepal length')
        plt.ylabel('Sepal width')
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.show()
        
        #seeing cunfusion matrix and classification report
        expected = Y
        predicted = model.predict(X)
        print(metrics.classification_report(expected, predicted))
        print(metrics.confusion_matrix(expected, predicted))

In [13]:
i = interact(Naive_bayes, Model_Type=widgets.Dropdown(options=['Gaussian', 'Multinomial', 'Bernoulli'], description='Model Type'))

interactive(children=(Dropdown(description='Model Type', options=('Gaussian', 'Multinomial', 'Bernoulli'), val…

# Decision Tree

<img src="Diagrams/Decision Tree.png" width="50%" style="background-color: white;">

In [14]:
def Decision_Tree(Type, Depth):
    # Select features using proper pandas syntax
    X = data[['Sepal.Length', 'Sepal.Width']].values
    Y = data['Species'].values
    
    h = .02  # step size in the mesh
    
    # Create and fit the Decision Tree model
    model = DecisionTreeClassifier(criterion=Type, max_depth=Depth)
    model.fit(X, Y)
    
    # Create visualization
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    
    # Plot decision boundary
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
    
    # Plot training points
    plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
    plt.xlabel('Sepal Length')
    plt.ylabel('Sepal Width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title(f'Decision Tree (Criterion: {Type}, Max Depth: {Depth})')
    plt.show()
    
    # Print classification metrics
    predicted = model.predict(X)
    print("\nConfusion Matrix:")
    print(metrics.confusion_matrix(Y, predicted))

In [15]:
i = interact(Decision_Tree, Type=widgets.Dropdown(options=['gini', 'entropy']), Depth=(1, 10))

interactive(children=(Dropdown(description='Type', options=('gini', 'entropy'), value='gini'), IntSlider(value…

# Random Forest

<img src="Diagrams/Random Forest.png" width="50%">

In [16]:
def Random_forest(n_estimators):
    # Select features and target
    X = data.iloc[:, :-1].values  # All columns except last
    y = data.iloc[:, -1].values   # Last column
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    
    # Validate n_estimators
    if n_estimators == 0:
        print("Error: n_estimators cannot be 0")
        return
    
    # Create and train the model
    forest = RandomForestRegressor(
        n_estimators=n_estimators, 
        criterion='squared_error',
        random_state=0
    )
    
    # Fit the model
    forest.fit(X_train, y_train)
    
    # Calculate and display scores
    train_score = forest.score(X_train, y_train)
    test_score = forest.score(X_test, y_test)
    
    print(f"Training Score: {train_score:.4f}")
    print(f"Testing Score: {test_score:.4f}")
    
    return forest

# Create interactive widget
i = interact(
    Random_forest,
    n_estimators=widgets.IntSlider(
        min=1,
        max=1000,
        step=1,
        value=100,
        description='Trees'
    )
)

interactive(children=(IntSlider(value=100, description='Trees', max=1000, min=1), Output()), _dom_classes=('wi…