## Data Preprocessing:
- Load the Iris dataset.
- Perform data exploration and visualization.
- Check for missing values and handle them if any.
- Split the dataset into features and target variables.

In [None]:
# Load the necessary libraries:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the Iris data from the local file 'iris.csv' into a DataFrame.
df = pd.read_csv('iris.csv')
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Count the missing entries column by column (axis=0 sums down the rows).
df.isnull().sum(axis=0)

In [None]:
# Total number of missing cells across the whole DataFrame:
# sum the boolean mask directly instead of summing the per-column counts.
df.isnull().values.sum()

In [None]:
# Replace the generic column headers with descriptive names (modifies df in place).
column_names = {
    'Column1': 'SepalLengthCm',
    'Column2': 'SepalWidthCm',
    'Column3': 'PetalLengthCm',
    'Column4': 'PetalWidthCm',
    'Column5': 'Species',
}
df.rename(columns=column_names, inplace=True)

In [None]:
# Show the DataFrame to confirm the renamed columns.
df

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics for the DataFrame's columns.
df.describe()

In [None]:
# Features are every column except the last; the target is the last column
# (presumably 'Species', given the rename above in this notebook).
# Both selections are anchored to the end of the frame so they stay
# complementary even if the number of feature columns changes.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [None]:
# Split the dataset into a train set (80%) and a test set (20%).
from sklearn.model_selection import train_test_split

# random_state makes the split reproducible between runs; stratify=y keeps the
# class proportions the same in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Report the array shapes of both subsets, one line per subset.
shape_report = (
    ('The shape of the train data:', 'X_train:', X_train, 'y_train:', y_train),
    ('The shape of the test data:', 'X_test:', X_test, 'y_test:', y_test),
)
for heading, x_label, x_part, y_label, y_part in shape_report:
    print(heading, x_label, x_part.shape, y_label, y_part.shape)

## Unsupervised Learning: Clustering and Outlier Detection:
- Apply K-means clustering algorithm to cluster the data.
- Visualize the clusters.
- Detect outliers using appropriate techniques such as isolation forest or DBSCAN.
- Evaluate the clustering results.

In [None]:
# Import kmeans library
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Fit K-means with three clusters on the full feature matrix.
# random_state fixes the centroid initialisation so the cluster labels (and the
# colours in the plots that use them) are reproducible; n_init is set explicitly
# because its default value differs between scikit-learn releases.
k_means = KMeans(n_clusters=3, n_init=10, random_state=42)
k_means.fit(X)

In [None]:
# Sepal length vs. sepal width, one colour per true species.
# Each entry is (value in the Species column, marker colour, legend label).
species_styles = [
    ('Iris-setosa', 'red', 'Iris - Setosa'),
    ('Iris-versicolor', 'green', 'Iris - Versicolor'),
    ('Iris-virginica', 'blue', 'Iris - Virginica'),
]

# The first call (ax=None) creates the axes; later calls draw onto the same one.
ax = None
for species_name, point_color, legend_label in species_styles:
    ax = df[df['Species'] == species_name].plot.scatter(
        x='SepalLengthCm', y='SepalWidthCm',
        color=point_color, label=legend_label, ax=ax,
    )
ax.set_title("Scatter Plot")

In [None]:
# Single-column DataFrames holding the two sepal measurements.
X_sepal_length = df[['SepalLengthCm']]
X_sepal_width = df[['SepalWidthCm']]

In [None]:
# Visualize the K-means clusters (n_clusters = 3) on sepal length vs. sepal width
sepal_length = X_sepal_length['SepalLengthCm']
sepal_width = X_sepal_width['SepalWidthCm']

plt.figure(figsize=(10, 5))
# Each point is coloured by the cluster label K-means assigned to it.
plt.scatter(sepal_length, sepal_width, c=k_means.labels_)
plt.xlabel('SepalLengthCm')
plt.ylabel('SepalWidthCm');

In [None]:
# Single-column DataFrames holding the two petal measurements.
y_Petal_length = df[['PetalLengthCm']]
y_Petal_width = df[['PetalWidthCm']]

In [None]:
# Visualize the K-means clusters (n_clusters = 3) on petal length vs. petal width
petal_length = y_Petal_length['PetalLengthCm']
petal_width = y_Petal_width['PetalWidthCm']

plt.figure(figsize=(10, 5))
# Each point is coloured by the cluster label K-means assigned to it.
plt.scatter(petal_length, petal_width, c=k_means.labels_)
plt.xlabel('PetalLengthCm')
plt.ylabel('PetalWidthCm');

In [None]:
# Detect outliers with DBSCAN (points labelled -1 are treated as noise).
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# DBSCAN's eps is a distance in feature space, so put every feature on the same
# scale first. The scaler is fitted once, on the full feature matrix X.
feature_scaler = StandardScaler().fit(X)

# eps=0.5 / min_samples=5 are scikit-learn's defaults. A much tighter setting
# (eps=0.0375 with min_samples=20) on unscaled measurements presumably flags
# (nearly) every point as noise -- TODO: tune eps, e.g. with a k-distance plot.
dbscan_params = dict(metric='euclidean', eps=0.5, min_samples=5, algorithm='auto')

# DBSCAN has no separate predict step: every fit_predict call re-clusters from
# scratch. Use one estimator per data subset so that DBSModel.labels_ always
# describes the full dataset rather than whichever subset was fitted last.
y_pred_train = DBSCAN(**dbscan_params).fit_predict(feature_scaler.transform(X_train))
y_pred_test = DBSCAN(**dbscan_params).fit_predict(feature_scaler.transform(X_test))

DBSModel = DBSCAN(**dbscan_params)
DBSModel.fit(feature_scaler.transform(X))

print('DBScanModel labels are:', DBSModel.labels_)
print('DBScanModel Train data are:', y_pred_train)
print('DBScanModel Test data are:', y_pred_test)

In [None]:
# Summarise the DBSCAN result: label -1 is counted as noise,
# every other distinct label counts as one cluster.
dbscan_labels = DBSModel.labels_
cluster_ids = set(dbscan_labels) - {-1}
nclusters = len(cluster_ids)
n_noise = sum(1 for label in dbscan_labels if label == -1)

print('Estimated number of clusters: %d' % nclusters)
print('Estimated number of noises: %d' % n_noise)

## Supervised Learning: Baseline Model:
- Choose an appropriate evaluation metric based on the problem (classification)
- Split the dataset into training and testing sets.
- Build a baseline model (e.g., logistic regression or decision tree) using default parameters.
- Evaluate the baseline model's performance.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Accuracy is the headline metric for this classification task.
evaluation_metric = 'accuracy'

# Hold out 30% of the data for testing. random_state keeps the split
# reproducible; stratify=y keeps the class proportions identical in the train
# and test subsets, so the per-class scores below are not skewed by an
# unlucky draw on this small dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Baseline: logistic regression with default hyperparameters.
baseline_model = LogisticRegression()
baseline_model.fit(X_train, y_train)

# Score the baseline on the held-out test set.
y_pred = baseline_model.predict(X_test)

evaluation_result = accuracy_score(y_test, y_pred)

print(f"Baseline Model Performance ({evaluation_metric}): {evaluation_result}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

## Model Comparison:
- Select 3-4 machine learning algorithms (e.g., SVM, Random Forest, Gradient Boosting) suitable for the problem.
- Implement each algorithm and evaluate its performance using cross-validation
- Compare the performance of algorithms based on evaluation metrics
- Select the best-performing algorithm.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Candidate classifiers, keyed by display name. The tree ensembles are
# randomised, so they get a fixed random_state: otherwise the cross-validation
# scores (and therefore the "best" model) can change from run to run.
models = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression()
}

evaluation_metric = 'accuracy'
num_folds = 5

# Cross-validate every candidate on the training split only, so the test set
# stays untouched for the final evaluation. results maps name -> per-fold scores.
results = {}
for model_name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring=evaluation_metric)
    results[model_name] = scores

# Report mean and standard deviation of the fold scores for each model.
print(f"Mean {evaluation_metric} Scores:")
for model_name, scores in results.items():
    print(f"{model_name}: {np.mean(scores):.4f} (±{np.std(scores):.4f})")

In [None]:
# Pick the model name with the highest mean cross-validated score
# (on a tie, the name that appears first in results wins).
mean_scores = {name: np.mean(fold_scores) for name, fold_scores in results.items()}
best_model = max(mean_scores, key=mean_scores.get)
print("\nBest Performing Model:", best_model)

## Model Tuning and Ensemble:
- Perform hyperparameter tuning on the best-performing algorithm using Grid Search or Random Search.
- Evaluate the tuned model's performance.
- Implement an ensemble of the top-performing algorithms and compare its performance with the tuned model.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Search space for the SVM: regularisation strength, kernel type and kernel
# coefficient. Every combination is tried.
# NOTE(review): the SVM is tuned unconditionally here; the `best_model` name
# computed earlier in the notebook is not consulted -- confirm this is intended.
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

svm_clf = SVC()

# Exhaustive grid search, cross-validated on the training split.
# fit() returns the fitted search object itself.
grid_search_svm = GridSearchCV(
    svm_clf,
    param_grid_svm,
    scoring=evaluation_metric,
    cv=num_folds,
).fit(X_train, y_train)

# Pull out the winning configuration, its estimator and its mean CV score.
best_svm_model = grid_search_svm.best_estimator_
best_params_svm = grid_search_svm.best_params_
best_score_svm = grid_search_svm.best_score_

print("Best parameters for SVM:", best_params_svm)
print("Best mean accuracy score for SVM:", best_score_svm)

# Evaluate the tuned SVM on the held-out test set.
tuned_svm_predictions = best_svm_model.predict(X_test)
accuracy_tuned_svm = accuracy_score(y_test, tuned_svm_predictions)
print("Accuracy of the tuned SVM model:", accuracy_tuned_svm)


# Members of the voting ensemble as (name, estimator) pairs. The randomised
# tree ensembles get a fixed random_state so that the ensemble accuracy is
# reproducible and can be compared fairly with the tuned SVM.
ensemble_models = [
    ('Tuned SVM', best_svm_model),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression())
]

# voting='hard': the ensemble predicts the class chosen by the majority of members.
ensemble = VotingClassifier(estimators=ensemble_models, voting='hard')

ensemble.fit(X_train, y_train)

# Evaluate the ensemble on the same held-out test set as the tuned SVM.
ensemble_predictions = ensemble.predict(X_test)
accuracy_ensemble = accuracy_score(y_test, ensemble_predictions)

print("Accuracy of the ensemble model:", accuracy_ensemble)