In [1]:
import pandas as pd
iris_data = pd.read_csv('Iris.csv')
print("First 5 rows of the dataset:")
print(iris_data.head())
shape = iris_data.shape
print("\nShape of the dataset (rows, columns):", shape)
statistics = iris_data.describe()
print("\nStatistical summary of the dataset:")
print(statistics)

First 5 rows of the dataset:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa

Shape of the dataset (rows, columns): (150, 6)

Statistical summary of the dataset:
               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.8000

In [2]:
X = iris_data.drop('Species', axis=1)
Y = iris_data['Species']
print("Shape of X (features):", X.shape)
print("Shape of Y (output):", Y.shape)

Shape of X (features): (150, 5)
Shape of Y (output): (150,)


In [11]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y)

print("Encoded Y (Species):", Y_encoded)

Encoded Y (Species): [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [12]:
from sklearn.model_selection import train_test_split

ratios = [(0.6, 0.4), (0.5, 0.5), (0.7, 0.3), (0.8, 0.2), (0.55, 0.45), (0.55, 0.25)]

for train_ratio, test_ratio in ratios:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y_encoded, test_size=test_ratio, random_state=42)
    print(f"Train set shape ({train_ratio*100}% of data): X_train={X_train.shape}, Y_train={Y_train.shape}")
    print(f"Test set shape ({test_ratio*100}% of data): X_test={X_test.shape}, Y_test={Y_test.shape}")

Train set shape (60.0% of data): X_train=(90, 5), Y_train=(90,)
Test set shape (40.0% of data): X_test=(60, 5), Y_test=(60,)
Train set shape (50.0% of data): X_train=(75, 5), Y_train=(75,)
Test set shape (50.0% of data): X_test=(75, 5), Y_test=(75,)
Train set shape (70.0% of data): X_train=(105, 5), Y_train=(105,)
Test set shape (30.0% of data): X_test=(45, 5), Y_test=(45,)
Train set shape (80.0% of data): X_train=(120, 5), Y_train=(120,)
Test set shape (20.0% of data): X_test=(30, 5), Y_test=(30,)
Train set shape (55.00000000000001% of data): X_train=(82, 5), Y_train=(82,)
Test set shape (45.0% of data): X_test=(68, 5), Y_test=(68,)
Train set shape (55.00000000000001% of data): X_train=(112, 5), Y_train=(112,)
Test set shape (25.0% of data): X_test=(38, 5), Y_test=(38,)


In [13]:
seeds = [3, 9, 12, 33]
for seed in seeds:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y_encoded, test_size=0.4, random_state=seed)
    print(f"Random Seed {seed}:")
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    print(f"Y_train shape: {Y_train.shape}, Y_test shape: {Y_test.shape}")

Random Seed 3:
X_train shape: (90, 5), X_test shape: (60, 5)
Y_train shape: (90,), Y_test shape: (60,)
Random Seed 9:
X_train shape: (90, 5), X_test shape: (60, 5)
Y_train shape: (90,), Y_test shape: (60,)
Random Seed 12:
X_train shape: (90, 5), X_test shape: (60, 5)
Y_train shape: (90,), Y_test shape: (60,)
Random Seed 33:
X_train shape: (90, 5), X_test shape: (60, 5)
Y_train shape: (90,), Y_test shape: (60,)


In [15]:
import pandas as pd

df = pd.read_csv("wine.csv")
print("Shape of the data:", df.shape)
print("Statistical values:")
print(df.describe())

Shape of the data: (178, 14)
Statistical values:
             Wine     Alcohol  Malic.acid         Ash         Acl          Mg  \
count  178.000000  178.000000  178.000000  178.000000  178.000000  178.000000   
mean     1.938202   13.000618    2.336348    2.366517   19.494944   99.741573   
std      0.775035    0.811827    1.117146    0.274344    3.339564   14.282484   
min      1.000000   11.030000    0.740000    1.360000   10.600000   70.000000   
25%      1.000000   12.362500    1.602500    2.210000   17.200000   88.000000   
50%      2.000000   13.050000    1.865000    2.360000   19.500000   98.000000   
75%      3.000000   13.677500    3.082500    2.557500   21.500000  107.000000   
max      3.000000   14.830000    5.800000    3.230000   30.000000  162.000000   

          Phenols  Flavanoids  Nonflavanoid.phenols     Proanth   Color.int  \
count  178.000000  178.000000            178.000000  178.000000  178.000000   
mean     2.295112    2.029270              0.361854    1.590899

In [16]:
X = df.drop(columns=['Wine'])
Y = df['Wine']

print("Shape of X (features):", X.shape)
print("Shape of Y (target):", Y.shape)

Shape of X (features): (178, 13)
Shape of Y (target): (178,)


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=12)
gnb = GaussianNB()
gnb.fit(X_train, Y_train)

Y_pred = gnb.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

accuracy = accuracy_score(Y_test, Y_pred)
print("\nAccuracy:", accuracy)

precision = precision_score(Y_test, Y_pred, average='weighted')
print("Precision:", precision)

recall = recall_score(Y_test, Y_pred, average='weighted')
print("Recall:", recall)

f1 = f1_score(Y_test, Y_pred, average='weighted')
print("F1-Score:", f1)

from sklearn.preprocessing import label_binarize
Y_test_bin = label_binarize(Y_test, classes=[1, 2, 3])
Y_pred_bin = label_binarize(Y_pred, classes=[1, 2, 3])
n_classes = Y_test_bin.shape[1]

roc_auc = roc_auc_score(Y_test_bin, Y_pred_bin, average='macro')
print("AUC:", roc_auc)


Accuracy: 0.9444444444444444
Precision: 0.9474703461631565
Recall: 0.9444444444444444
F1-Score: 0.9445027013654466
AUC: 0.9597465886939571
