In [None]:
import pandas as pd
import numpy as np

# Remote location of the iris measurements file that pandas downloads below.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# Column labels attached to the data while reading (see `names=` below).
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

# header=None: the first line is read as data, not as column names.
# na_values='?': any field equal to '?' is parsed as a missing value (NaN).
df = pd.read_csv(url, header=None, names=columns, na_values='?')

# Quick sanity check of the load: show the first five rows.
print(df.head())


   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [None]:
def equal_width_binning(data, num_bins):
    """Split the value range of ``data`` into ``num_bins`` equal-width intervals.

    Args:
        data: 1-D array-like of numeric values (list, NumPy array, pandas Series).
        num_bins (int): Number of intervals to produce; must be at least 1.

    Returns:
        list[tuple]: ``num_bins`` tuples ``(lower, upper)`` in ascending order.
        The upper bound of each bin is the lower bound of the next one, the
        first lower bound is the minimum of ``data`` and the last upper bound
        is exactly the maximum of ``data``.

    Raises:
        ValueError: If ``num_bins`` is smaller than 1 or ``data`` is empty.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")
    if np.size(data) == 0:
        raise ValueError("data must contain at least one value")

    min_val = np.min(data)
    max_val = np.max(data)
    bin_width = (max_val - min_val) / num_bins

    # Compute every boundary once. The final boundary is pinned to max_val
    # rather than min_val + num_bins * bin_width so that floating-point
    # rounding can never leave the maximum outside the last bin.
    edges = [min_val + i * bin_width for i in range(num_bins)]
    edges.append(max_val)

    # Pair consecutive boundaries into (lower, upper) intervals.
    return list(zip(edges[:-1], edges[1:]))

# Take the petal_width column with missing values removed and split its
# value range into 5 equal-width intervals.
data_column = df['petal_width'].dropna()
bins = equal_width_binning(data_column, 5)
print("Equal Width Bins:", bins)

Equal Width Bins: [(0.1, 0.58), (0.58, 1.06), (1.06, 1.54), (1.54, 2.02), (2.02, 2.5)]


In [None]:
def equal_frequency_binning(data, num_bins):
    """Compute bin edges so that each bin holds roughly the same number of values.

    The values are sorted and the edge of bin ``i`` is the sorted value at
    position ``i * n // num_bins`` (``n`` = number of values). When ``n`` is a
    multiple of ``num_bins`` this is a fixed stride of ``n // num_bins``;
    otherwise the leftover values are spread over the bins instead of all
    being pushed into the last one.

    Args:
        data: 1-D array-like of sortable values (list, NumPy array, pandas Series).
        num_bins (int): Number of bins; must be at least 1.

    Returns:
        list: ``num_bins + 1`` edges in ascending order. The first edge is the
        minimum of ``data`` and the last edge is the maximum.

    Raises:
        ValueError: If ``num_bins`` is smaller than 1 or ``data`` is empty.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    sorted_data = np.sort(data)
    n = len(sorted_data)
    if n == 0:
        raise ValueError("data must contain at least one value")

    # Integer arithmetic keeps the start positions exact and makes bin sizes
    # differ by at most one element.
    bins = [sorted_data[(i * n) // num_bins] for i in range(num_bins)]
    # Close the last bin with the largest value.
    bins.append(sorted_data[-1])
    return bins

# Compute 5 equal-frequency bin edges for the same petal_width values.
# Note: this rebinds `bins`, replacing the equal-width result assigned earlier.
bins = equal_frequency_binning(data_column, 5)
print("Equal Frequency Bins:", bins)


Equal Frequency Bins: [np.float64(0.1), np.float64(0.2), np.float64(1.2), np.float64(1.5), np.float64(1.9), np.float64(2.5)]


In [None]:
def min_max_normalization(data):
    """Rescale ``data`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        data: Numeric array-like (list, NumPy array, pandas Series).

    Returns:
        The rescaled values, produced by arithmetic on ``data`` (a pandas
        Series stays a Series; a list becomes a NumPy array). If every value
        is identical the range is zero and all values map to 0.0 instead of
        NaN.
    """
    min_val = np.min(data)
    max_val = np.max(data)
    value_range = max_val - min_val

    shifted = data - min_val
    if value_range == 0:
        # Constant input: dividing would be 0/0. `shifted` is already all
        # zeros; multiplying by 0.0 only promotes it to floating point.
        return shifted * 0.0
    return shifted / value_range

# Rescale the petal_width values and print the resulting Series.
normalized_data = min_max_normalization(data_column)
print("Normalized Data:", normalized_data)


Normalized Data: 0      0.041667
1      0.041667
2      0.041667
3      0.041667
4      0.041667
         ...   
145    0.916667
146    0.750000
147    0.791667
148    0.916667
149    0.708333
Name: petal_width, Length: 150, dtype: float64


In [None]:
from scipy.stats import chi2

def chi_square_test(observed):
    """Pearson chi-squared test of independence for a contingency table.

    Expected counts are derived from the row and column totals
    (``row_total * col_total / grand_total``) and compared with the observed
    counts. No continuity correction is applied.

    Args:
        observed: 2-D array-like of observed counts (rows x columns).

    Returns:
        tuple: ``(chi_squared_stat, p_value)`` where the p-value is the upper
        tail probability of the chi-squared distribution with
        ``(rows - 1) * (cols - 1)`` degrees of freedom.

    Raises:
        ValueError: If ``observed`` is not two-dimensional, or if any row or
            column sums to zero (an expected count would be zero and the
            statistic undefined).
    """
    observed = np.asarray(observed)
    if observed.ndim != 2:
        raise ValueError("observed must be a 2-D contingency table")

    row_totals = observed.sum(axis=1)
    col_totals = observed.sum(axis=0)
    if np.any(row_totals == 0) or np.any(col_totals == 0):
        raise ValueError(
            "every row and column of observed must have a non-zero total"
        )
    grand_total = observed.sum()
    expected = np.outer(row_totals, col_totals) / grand_total

    chi_squared_stat = np.sum(((observed - expected) ** 2) / expected)

    rows, cols = observed.shape
    # Named `dof` rather than `df` so it cannot be mistaken for the notebook's
    # DataFrame of the same name.
    dof = (rows - 1) * (cols - 1)

    # The survival function evaluates the upper tail directly; `1 - cdf`
    # rounds to exactly 0.0 once the cdf is within machine epsilon of 1.
    p_value = chi2.sf(chi_squared_stat, dof)

    return chi_squared_stat, p_value

# Worked example on a 2x2 table of counts.
observed = np.array([[10, 20], [30, 40]])
chi_squared_stat, p_value = chi_square_test(observed)
print("Chi-Squared Statistic:", chi_squared_stat)
print("P-Value:", p_value)

Chi-Squared Statistic: 0.7936507936507936
P-Value: 0.37299848361348686


In [None]:
def confusion_matrix(true, pred):
    """
    Calculates a confusion matrix.

    Classes are the union of the labels found in ``true`` and ``pred``,
    placed in sorted order. Entry ``[i, j]`` counts the samples whose true
    label is class ``i`` and whose predicted label is class ``j``.

    Args:
        true (sequence): True labels (list, tuple or 1-D NumPy array).
        pred (sequence): Predicted labels, same length as ``true``.

    Returns:
        np.ndarray: The confusion matrix as a square integer NumPy array.

    Raises:
        ValueError: If ``true`` and ``pred`` have different lengths.
    """
    if len(true) != len(pred):
        raise ValueError(
            "true and pred must have the same length "
            f"(got {len(true)} and {len(pred)})"
        )

    # A set union works for any iterable of hashable labels. `true + pred`
    # would add NumPy arrays element-wise instead of concatenating them.
    classes = sorted(set(true) | set(pred))
    num_classes = len(classes)
    class_to_index = {cls: i for i, cls in enumerate(classes)}
    mat = np.zeros((num_classes, num_classes), dtype=int)

    # Every label is in class_to_index by construction, so no membership
    # check is needed here.
    for t, p in zip(true, pred):
        mat[class_to_index[t], class_to_index[p]] += 1
    return mat

# Worked example with string labels. Rows are true classes and columns are
# predicted classes, both in sorted order ('cat', 'dog').
true_labels = ['cat', 'dog', 'cat', 'cat', 'dog']
pred_labels = ['cat', 'dog', 'dog', 'cat', 'dog']
cm = confusion_matrix(true_labels, pred_labels)
print("\nConfusion Matrix (with string labels):\n", cm)


Confusion Matrix (with string labels):
 [[2 1]
 [0 2]]


In [36]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Scale the four measurement columns with StandardScaler, then project the
# scaled matrix onto two principal components.
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
x = StandardScaler().fit_transform(df[features].to_numpy())

pca = PCA(n_components=2)
principal_components = pca.fit_transform(x)

# Put the two component columns next to the species label of each row;
# concat with axis=1 lines the two frames up on their row index.
pca_df = pd.concat(
    [
        pd.DataFrame(
            data=principal_components,
            columns=['principal_component_1', 'principal_component_2'],
        ),
        df[['species']],
    ],
    axis=1,
)

print("\nDataFrame after PCA:\n")
display(pca_df.head())


DataFrame after PCA:



Unnamed: 0,principal_component_1,principal_component_2,species
0,-2.264542,0.505704,Iris-setosa
1,-2.086426,-0.655405,Iris-setosa
2,-2.36795,-0.318477,Iris-setosa
3,-2.304197,-0.575368,Iris-setosa
4,-2.388777,0.674767,Iris-setosa
