In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import os

# Load datasets
eda_train_infect = pd.read_csv(os.path.join('Dataset','eda_train_infect.csv'))
eda_test_infect = pd.read_csv(os.path.join('Dataset','eda_test_infect.csv'))
eda_train_uninfect = pd.read_csv(os.join.path('Dataset','eda_train_uninfect.csv'))
eda_test_uninfect = pd.read_csv(os.join.path('Dataset','eda_test_infect.csv'))

# Labeling infected samples as 1 and uninfected samples as 0
eda_train_infect["Target"] = 1
eda_test_infect["Target"] = 1
eda_train_uninfect["Target"] = 0
eda_test_uninfect["Target"] = 0


In [4]:
# Display the first few rows of the eda_train_infect dataframe
eda_train_infect.head()

Unnamed: 0,ImageFileName,Red_Channel_Distribution,Green_Channel_Distribution,Blue_Channel_Distribution,Infected_Edge_Density,Infected_MeanBlobSize,Infected_MaxBlobSize,Target
0,C100P61ThinF_IMG_20150918_144104_cell_162.png,163.5904,119.1104,119.3664,0.1744,1.333333,2.0,1
1,C100P61ThinF_IMG_20150918_144104_cell_164.png,164.2032,119.9104,116.0448,0.1696,0.0,0.0,1
2,C100P61ThinF_IMG_20150918_144104_cell_165.png,137.8736,104.5488,100.8528,0.168,1.5,2.0,1
3,C100P61ThinF_IMG_20150918_144104_cell_167.png,149.536,110.4432,105.256,0.184,1.333333,2.0,1
4,C100P61ThinF_IMG_20150918_144104_cell_168.png,158.2176,113.8064,109.9968,0.1696,1.0,1.0,1


In [5]:
# Removing 'ImageFileName' column from all datasets
eda_test_infect.drop(columns='ImageFileName',inplace=True)
eda_train_infect.drop(columns='ImageFileName',inplace=True)
eda_test_uninfect.drop(columns='ImageFileName',inplace=True)
eda_train_uninfect.drop(columns='ImageFileName',inplace=True)

# Renaming columns for the infected datasets 
eda_train_infect.rename(columns={
    'ImageFileName': 'ImageFileName',
    'Red_Channel_Distribution': 'Red_Channel_Distribution',
    'Green_Channel_Distribution': 'Green_Channel_Distribution',
    'Blue_Channel_Distribution': 'Blue_Channel_Distribution',
    'Infected_Edge_Density': 'Edge_Density',
    'Infected_MeanBlobSize': 'MeanBlobSize',
    'Infected_MaxBlobSize': 'MaxBlobSize'
}, inplace=True)

eda_test_infect.rename(columns={
    'ImageFileName': 'ImageFileName',
    'Red_Channel_Distribution': 'Red_Channel_Distribution',
    'Green_Channel_Distribution': 'Green_Channel_Distribution',
    'Blue_Channel_Distribution': 'Blue_Channel_Distribution',
    'Infected_Edge_Density': 'Edge_Density',
    'Infected_MeanBlobSize': 'MeanBlobSize',
    'Infected_MaxBlobSize': 'MaxBlobSize'
}, inplace=True)

In [6]:
# Renaming columns for the uninfected datasets 
eda_train_uninfect.rename(columns={
    'ImageFileName': 'ImageFileName',
    'Red_Channel_Distribution': 'Red_Channel_Distribution',
    'Green_Channel_Distribution': 'Green_Channel_Distribution',
    'Blue_Channel_Distribution': 'Blue_Channel_Distribution',
    'Uninfected_Edge_Density': 'Edge_Density',
    'Uninfected_MeanBlobSize': 'MeanBlobSize',
    'Uninfected_MaxBlobSize': 'MaxBlobSize'
}, inplace=True)

eda_test_uninfect.rename(columns={
    'ImageFileName': 'ImageFileName',
    'Red_Channel_Distribution': 'Red_Channel_Distribution',
    'Green_Channel_Distribution': 'Green_Channel_Distribution',
    'Blue_Channel_Distribution': 'Blue_Channel_Distribution',
    'Uninfected_Edge_Density': 'Edge_Density',
    'Uninfected_MeanBlobSize': 'MeanBlobSize',
    'Uninfected_MaxBlobSize': 'MaxBlobSize'
}, inplace=True)

In [7]:
# Display the few rows of the eda_train_uninfect dataframe
eda_test_uninfect

Unnamed: 0,Red_Channel_Distribution,Green_Channel_Distribution,Blue_Channel_Distribution,Infected_Edge_Density,Infected_MeanBlobSize,Infected_MaxBlobSize,Target
0,162.9456,121.9120,117.5216,0.1376,1.000000,1.0,0
1,143.6416,89.4432,96.8192,0.2064,1.333333,2.0,0
2,138.9328,90.3376,95.4704,0.2560,0.000000,0.0,0
3,156.3904,107.8592,104.9968,0.1824,1.000000,1.0,0
4,126.0768,72.1520,87.3760,0.2512,2.000000,2.0,0
...,...,...,...,...,...,...,...
745,151.2352,109.8304,111.5184,0.1232,1.333333,2.0,0
746,120.3968,76.0768,88.9936,0.1600,1.714286,2.0,0
747,128.5920,82.7344,95.1584,0.2128,2.000000,2.0,0
748,141.0784,88.2256,100.7808,0.1984,1.000000,1.0,0


In [8]:
# Display the few rows of the eda_train_uninfect dataframe
eda_test_infect

Unnamed: 0,Red_Channel_Distribution,Green_Channel_Distribution,Blue_Channel_Distribution,Edge_Density,MeanBlobSize,MaxBlobSize,Target
0,162.9456,121.9120,117.5216,0.1376,1.000000,1.0,1
1,143.6416,89.4432,96.8192,0.2064,1.333333,2.0,1
2,138.9328,90.3376,95.4704,0.2560,0.000000,0.0,1
3,156.3904,107.8592,104.9968,0.1824,1.000000,1.0,1
4,126.0768,72.1520,87.3760,0.2512,2.000000,2.0,1
...,...,...,...,...,...,...,...
745,151.2352,109.8304,111.5184,0.1232,1.333333,2.0,1
746,120.3968,76.0768,88.9936,0.1600,1.714286,2.0,1
747,128.5920,82.7344,95.1584,0.2128,2.000000,2.0,1
748,141.0784,88.2256,100.7808,0.1984,1.000000,1.0,1


In [9]:
# Concatenating infected datasets into 1 dataframe
final_df = pd.concat([eda_train_infect,eda_train_uninfect],axis = 0)

In [10]:
# Split the dataset into features (X) and the target variable (y)
X = final_df.drop('Target',axis=1).values 
y = final_df['Target'].values

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [13]:
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)


In [14]:
y_pred = classifier.predict(X_test)


In [15]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# You can also print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.83
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       324
           1       0.85      0.83      0.84       376

    accuracy                           0.83       700
   macro avg       0.83      0.83      0.83       700
weighted avg       0.83      0.83      0.83       700



#### Analysis: The RF model predicts accuracy at 83%. 

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Assuming final_df is your dataset; if it's not available, this code won't run as is.
# For the sake of demonstration, let's create a dummy dataset.
import pandas as pd
import numpy as np


# Standardize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lr_model.predict(X_test)

# Evaluate the regression model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2


(0.16996057165680944, 0.3163852762030748)

#### Analysis: 

The MSE value suggests the average squared error your model makes in its predictions. Lower is better.
The R2 value of  0.3164 suggests that the model explains approximately 31.64% of the variance in the dependent variable that is predictable from the independent variables. This means that there's still a significant portion of the variance in the target variable that the model isn't capturing. Ideally, you'd want this value to be higher.

Hence more models need to be explored.

In [17]:
#SVC Model
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report



# Split the dataset into features (X) and the target variable (y)
X = final_df.drop('Target', axis=1)
y = final_df['Target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the SVM model
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.80
              precision    recall  f1-score   support

           0       0.74      0.88      0.80       324
           1       0.87      0.73      0.79       376

    accuracy                           0.80       700
   macro avg       0.80      0.80      0.80       700
weighted avg       0.81      0.80      0.80       700



### Analysis = 

The model performs reasonably well with an accuracy of 80%. Both classes have similar precision, recall, and F1-score values, indicating that the model doesn't have a strong bias toward one class over the other. The slightly higher precision for class 1 suggests that when the model predicts class 1, it's more often correct compared to when it predicts class 0. However, the recall for class 0 is higher, indicating the model is better at capturing all the actual instances of class 0 compared to class 1.

In [16]:
#SVC model with hyperparamter

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into features (X) and the target variable (y)
X = final_df.drop('Target', axis=1)
y = final_df['Target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define a parameter grid to search over
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'gamma': ['scale', 'auto'] + [0.001, 0.01, 0.1, 1]  # Kernel coefficient
}

# Create an SVM classifier
svm_classifier = SVC(random_state=42)

# Create a GridSearchCV object to find the best combination of hyperparameters
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best estimator found by GridSearchCV for predictions
best_svm_classifier = grid_search.best_estimator_
y_pred = best_svm_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.1s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   0.1s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   0.1s
[CV] END ....................C=0.1, gamma=scale

### Analysis: 

The SVM model with hyperparameters C=10, gamma=0.1, and kernel='rbf' performs well on the test data, achieving an accuracy of 85%. The model has good precision and recall for both classes, suggesting a balanced performance. The results of the grid search highlight the importance of hyperparameter tuning in achieving optimal model performance.

### Precision:
For class 0: 84% of the instances predicted as class 0 were actually class 0.

For class 1: 86% of the instances predicted as class 1 were actually class 1.

### Recall:
For class 0: 85% of actual instances of class 0 were correctly identified by the model.

For class 1: 84% of actual instances of class 1 were correctly identified.

In [18]:
# SVC + PCA 
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into features (X) and the target variable (y)
X = final_df.drop('Target', axis=1)
y = final_df['Target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Choose the number of components to capture 95% of variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Create an SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the SVM model on the reduced dimensionality data
svm_classifier.fit(X_train_pca, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test_pca)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with PCA: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy with PCA: 0.79
              precision    recall  f1-score   support

           0       0.72      0.88      0.79       324
           1       0.87      0.71      0.78       376

    accuracy                           0.79       700
   macro avg       0.80      0.79      0.79       700
weighted avg       0.80      0.79      0.79       700



### Analysis:

1. Accuracy:

The model achieves an accuracy of 79% on the test set. This means it correctly predicts the class for 79% of the test samples.

2. Precision:

Class 0: Of all the instances predicted as class 0, 72% were actually class 0. This means that when the model predicts an instance to be of class 0, it is correct 72% of the time.
Class 1: Of all the instances predicted as class 1, 87% were actually class 1. This suggests that the model is more reliable when it predicts an instance as class 1 compared to class 0.

The use of PCA with SVC has yielded an accuracy of 79%. While this is decent, there's room for improvement. The model is more precise in predicting class 1, but its recall is lower for the same class, meaning it misses out on a significant number of true class 1 instances. Conversely, the model has a higher recall for class 0, indicating it is good at identifying true class 0 instances, but its precision for class 0 is lower, suggesting there are false positives.

In the context of PCA, it's worth noting that PCA reduces the dimensionality of the data by capturing the most significant variance in fewer components. Depending on how much of the variance is retained after PCA, some information might be lost, which can affect the model's performance.

In [18]:
#Decision Tree 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into features (X) and the target variable (y)
X = final_df.drop('Target', axis=1)
y = final_df['Target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Decision Tree model
tree_classifier = DecisionTreeClassifier(random_state=42)
tree_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = tree_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.74
              precision    recall  f1-score   support

           0       0.72      0.70      0.71       324
           1       0.75      0.77      0.76       376

    accuracy                           0.74       700
   macro avg       0.74      0.73      0.74       700
weighted avg       0.74      0.74      0.74       700



### Analysis : 

1. Accuracy:

The Decision Tree model correctly predicts the outcome 74% of the time on the test set.
2. Precision:

Class 0: Of all the instances predicted as class 0, 72% were actually class 0. This means that when the model predicts an instance to be of class 0, it is correct 72% of the time.
Class 1: Of all the instances predicted as class 1, 75% were actually class 1. This suggests the model is slightly more reliable when predicting class 1 compared to class 0.

The Decision Tree model has an overall accuracy of 74%. While the precision and recall for both classes are relatively balanced, the model performs slightly better in predicting class 1 over class 0, as evidenced by the higher F1-score for class 1. But the overall accuracy is much lower.

In [19]:
# Random Forest With Gradient boosting
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report


# Assuming you have 'target' column containing labels (0 for not infected, 1 for infected)
X = final_df.drop('Target', axis=1)
y = final_df['Target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create individual classifiers
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
gradient_boosting = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Create a Voting Classifier (Ensemble of classifiers)
ensemble_classifier = VotingClassifier(estimators=[
    ('rf', random_forest),
    ('gradient_boosting', gradient_boosting)
], voting='hard')  # Use 'hard' for majority voting

# Fit the ensemble model on the training data
ensemble_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ensemble_classifier.predict(X_test)

# Evaluate the ensemble classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.83
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       324
           1       0.87      0.81      0.84       376

    accuracy                           0.83       700
   macro avg       0.83      0.84      0.83       700
weighted avg       0.84      0.83      0.83       700



### Analysis:

1. Accuracy:

The model correctly predicts the outcome 83% of the time on the test set.

2. Precision:

Class 0: Of all the instances predicted as class 0, 80% were actually class 0. This means the model is correct 80% of the time when it predicts an instance to belong to class 0.

Class 1: The model is slightly more reliable when predicting class 1, with 87% of the instances predicted as class 1 being actual class 1 instances.

The Random Forest with Gradient Boosting model demonstrates a commendable accuracy of 83%. The precision and recall values are relatively balanced for both classes, signifying that the model doesn't show a strong bias towards any particular class. The slightly higher precision for class 1 indicates that when the model predicts an instance to be of class 1, it's more often correct compared to when it predicts class 0. On the other hand, the recall for class 0 is higher, suggesting the model is more adept at capturing all the actual instances of class 0 compared to class 1.

In [20]:
#SVC+RF+neural network

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report


# Assuming you have 'target' column containing labels (0 for not infected, 1 for infected)
X = final_df.drop('Target', axis=1)
y = final_df['Target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create individual classifiers
svc_classifier = SVC(kernel='linear', random_state=42, probability=True)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
neural_network = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

# Create a Voting Classifier (Ensemble of classifiers)
ensemble_classifier = VotingClassifier(estimators=[
    ('svc', svc_classifier),
    ('rf', random_forest),
    ('nn', neural_network)
], voting='soft')  # Use 'soft' for weighted voting based on probabilities

# Fit the ensemble model on the training data
ensemble_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ensemble_classifier.predict(X_test)

# Evaluate the ensemble classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.77      0.84      0.80       324
           1       0.85      0.78      0.81       376

    accuracy                           0.81       700
   macro avg       0.81      0.81      0.81       700
weighted avg       0.81      0.81      0.81       700



### Analysis:

1. Accuracy:

The combined model correctly predicts the outcome 81% of the time on the test set.

2. Precision:

Class 0: Of all the instances predicted as class 0, 77% were actually class 0. This means when the ensemble predicts an instance to be of class 0, it's correct 77% of the time.

Class 1: The ensemble is more precise in its predictions for class 1, with 85% of the instances predicted as class 1 being actual class 1 instances.

This ensemble approach, combining predictions from an SVC, RF, and a neural network, has resulted in an accuracy of 81%. The precision and recall values are relatively balanced for both classes, suggesting that the model has a balanced performance across the two classes.

### IMAGE PIXEL - ML MODEL TRIAL , apart from CNN

K-NN

In [33]:
from PIL import Image
# load training data from csv
train_path = os.path.join('Dataset','eda_train_uninfect.csv')
train_eda_uninf_df = pd.read_csv(train_path)

In [21]:
# Show the first 5 rows of the dataset
train_eda_uninf_df.head()

Unnamed: 0,ImageFileName,Red_Channel_Distribution,Green_Channel_Distribution,Blue_Channel_Distribution,Uninfected_Edge_Density,Uninfected_MeanBlobSize,Uninfected_MaxBlobSize
0,C100P61ThinF_IMG_20150918_144104_cell_128.png,146.592,108.2592,102.9936,0.1408,1.0,1.0
1,C100P61ThinF_IMG_20150918_144104_cell_131.png,155.7472,115.0592,110.232,0.1328,1.0,1.0
2,C100P61ThinF_IMG_20150918_144104_cell_144.png,158.4128,117.5648,112.1712,0.1344,0.0,0.0
3,C100P61ThinF_IMG_20150918_144104_cell_21.png,158.5968,118.3712,114.6064,0.128,0.0,0.0
4,C100P61ThinF_IMG_20150918_144104_cell_34.png,156.4768,116.3568,113.4704,0.1376,1.0,2.0


In [22]:
# creating a column in the dataframe for a column target of value 0 to uninfected dataset, uninfected = 0 
train_eda_uninf_df["Target"]= 0 

In [23]:
#Rename the column names to be universal
train_eda_uninf_df.rename(columns={
    'ImageFileName': 'ImageFileName',
    'Uninfected_Edge_Density': 'Edge_Density',
    'Uninfected_MeanBlobSize': 'MeanBlobSize',
    'Uninfected_MaxBlobSize': 'MaxBlobSize',
    'Red_Channel_Distribution': 'Red_Channel_Distribution',
    'Green_Channel_Distribution': 'Green_Channel_Distribution',
    'Blue_Channel_Distribution': 'Blue_Channel_Distribution',
    'Target': 'Target'
}, inplace=True)

In [24]:
#Check if the column names were renamed
train_eda_uninf_df.head()

Unnamed: 0,ImageFileName,Red_Channel_Distribution,Green_Channel_Distribution,Blue_Channel_Distribution,Edge_Density,MeanBlobSize,MaxBlobSize,Target
0,C100P61ThinF_IMG_20150918_144104_cell_128.png,146.592,108.2592,102.9936,0.1408,1.0,1.0,0
1,C100P61ThinF_IMG_20150918_144104_cell_131.png,155.7472,115.0592,110.232,0.1328,1.0,1.0,0
2,C100P61ThinF_IMG_20150918_144104_cell_144.png,158.4128,117.5648,112.1712,0.1344,0.0,0.0,0
3,C100P61ThinF_IMG_20150918_144104_cell_21.png,158.5968,118.3712,114.6064,0.128,0.0,0.0,0
4,C100P61ThinF_IMG_20150918_144104_cell_34.png,156.4768,116.3568,113.4704,0.1376,1.0,2.0,0


In [25]:
#checking length of the dataset
len(train_eda_uninf_df)

1750

In [26]:
# reading the training EDA data from the csv 
train_eda_inf_df = pd.read_csv('Dataset','eda_train_infect.csv')

In [27]:
# creating a column target of value 1 to uninfected dataset, infected = 1
train_eda_inf_df["Target"]= 1

In [28]:
#Check the infected training dataset with the added column
train_eda_inf_df.head()

Unnamed: 0,ImageFileName,Red_Channel_Distribution,Green_Channel_Distribution,Blue_Channel_Distribution,Infected_Edge_Density,Infected_MeanBlobSize,Infected_MaxBlobSize,Target
0,C100P61ThinF_IMG_20150918_144104_cell_162.png,163.5904,119.1104,119.3664,0.1744,1.333333,2.0,1
1,C100P61ThinF_IMG_20150918_144104_cell_164.png,164.2032,119.9104,116.0448,0.1696,0.0,0.0,1
2,C100P61ThinF_IMG_20150918_144104_cell_165.png,137.8736,104.5488,100.8528,0.168,1.5,2.0,1
3,C100P61ThinF_IMG_20150918_144104_cell_167.png,149.536,110.4432,105.256,0.184,1.333333,2.0,1
4,C100P61ThinF_IMG_20150918_144104_cell_168.png,158.2176,113.8064,109.9968,0.1696,1.0,1.0,1


In [29]:
#Rename the column names to be universal
train_eda_inf_df.rename(columns={
    'ImageFileName': 'ImageFileName',
    'Infected_Edge_Density': 'Edge_Density',
    'Infected_MeanBlobSize': 'MeanBlobSize',
    'Infected_MaxBlobSize': 'MaxBlobSize',
    'Red_Channel_Distribution': 'Red_Channel_Distribution',
    'Green_Channel_Distribution': 'Green_Channel_Distribution',
    'Blue_Channel_Distribution': 'Blue_Channel_Distribution',
    'Target': 'Target'
}, inplace=True)

In [34]:
# Define the path to the folder containing your images
folder_path = os.path.join('Dataset', 'clean', 'train', 'infected_processed')

# List all image files in the folder
image_files = [f for f in os.listdir(folder_path) if f.endswith('.png')]  # Adjust the file extension as needed

# Initialize empty lists to store image pixel data for different rotations and unrotated images
left_rotated_pixels_list = []
right_rotated_pixels_list = []
rotated_180_pixels_list = []
original_pixels_list = []

# Iterate through each image file
for image_file in image_files:
    # Open the image using PIL
    image = Image.open(os.path.join(folder_path, image_file))

    # Rotate the image 90 degrees to the left (counter-clockwise)
    left_rotated_image = image.rotate(90, expand=True)

    # Rotate the image 90 degrees to the right (clockwise)
    right_rotated_image = image.rotate(-90, expand=True)

    # Rotate the image 180 degrees
    rotated_180_image = image.rotate(180, expand=True)

    # Convert the rotated images to NumPy arrays
    left_rotated_array = np.array(left_rotated_image)
    right_rotated_array = np.array(right_rotated_image)
    rotated_180_array = np.array(rotated_180_image)

    # Flatten the arrays to 1D arrays (lists of pixel values)
    left_rotated_pixels = left_rotated_array.flatten()
    right_rotated_pixels = right_rotated_array.flatten()
    rotated_180_pixels = rotated_180_array.flatten()

    # Convert the original image to a NumPy array and flatten it
    original_array = np.array(image)
    original_pixels = original_array.flatten()

    # Append the pixel values to the respective lists
    left_rotated_pixels_list.append(left_rotated_pixels)
    right_rotated_pixels_list.append(right_rotated_pixels)
    rotated_180_pixels_list.append(rotated_180_pixels)
    original_pixels_list.append(original_pixels)

# Create DataFrames for left, right, 180-degree rotated, and original images
left_rotated_df = pd.DataFrame(left_rotated_pixels_list)
right_rotated_df = pd.DataFrame(right_rotated_pixels_list)
rotated_180_df = pd.DataFrame(rotated_180_pixels_list)
original_df = pd.DataFrame(original_pixels_list)

# merging the various dataframes 
train_inf_pixel = pd.concat([original_df,left_rotated_df,right_rotated_df,rotated_180_df],axis=0)

In [35]:
#Check the dataframe of the merged rotated images dataframes
train_inf_pixel.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1865,1866,1867,1868,1869,1870,1871,1872,1873,1874
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
#Check the length of trained dataframe
len(train_inf_pixel)

7000

In [37]:
# Normalize the data in the training infected pixel database 
train_inf_pixel = train_inf_pixel/255

# Engineering a column named 'Target' in the infected training dataframe and assigning the value 1
train_inf_pixel['Target']= 1

In [38]:
#Check the dataframe 
train_inf_pixel.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1866,1867,1868,1869,1870,1871,1872,1873,1874,Target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [39]:
# Define the path to the folder containing your images
folder_path = os.path.join('Dataset', 'clean', 'train', 'uninfected_processed')

# List all image files in the folder
image_files = [f for f in os.listdir(folder_path) if f.endswith('.png')]  # Adjust the file extension as needed

# Initialize empty lists to store image pixel data for different rotations and unrotated images
left_rotated_pixels_list = []
right_rotated_pixels_list = []
rotated_180_pixels_list = []
original_pixels_list = []

# Iterate through each image file
for image_file in image_files:
    # Open the image using PIL
    image = Image.open(os.path.join(folder_path, image_file))

    # Rotate the image 90 degrees to the left (counter-clockwise)
    left_rotated_image = image.rotate(90, expand=True)

    # Rotate the image 90 degrees to the right (clockwise)
    right_rotated_image = image.rotate(-90, expand=True)

    # Rotate the image 180 degrees
    rotated_180_image = image.rotate(180, expand=True)

    # Convert the rotated images to NumPy arrays
    left_rotated_array = np.array(left_rotated_image)
    right_rotated_array = np.array(right_rotated_image)
    rotated_180_array = np.array(rotated_180_image)

    # Flatten the arrays to 1D arrays (lists of pixel values)
    left_rotated_pixels = left_rotated_array.flatten()
    right_rotated_pixels = right_rotated_array.flatten()
    rotated_180_pixels = rotated_180_array.flatten()

    # Convert the original image to a NumPy array and flatten it
    original_array = np.array(image)
    original_pixels = original_array.flatten()

    # Append the pixel values to the respective lists
    left_rotated_pixels_list.append(left_rotated_pixels)
    right_rotated_pixels_list.append(right_rotated_pixels)
    rotated_180_pixels_list.append(rotated_180_pixels)
    original_pixels_list.append(original_pixels)

# Create DataFrames for left, right, 180-degree rotated, and original images
left_rotated_df = pd.DataFrame(left_rotated_pixels_list)
right_rotated_df = pd.DataFrame(right_rotated_pixels_list)
rotated_180_df = pd.DataFrame(rotated_180_pixels_list)
original_df = pd.DataFrame(original_pixels_list)


In [40]:
# merging the various dataframes 
train_uninf_pixel = pd.concat([original_df,left_rotated_df,right_rotated_df,rotated_180_df],axis=0)

In [41]:
#Check the dataframe 
train_uninf_pixel.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1865,1866,1867,1868,1869,1870,1871,1872,1873,1874
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
#Check the length of the Dataframe
len(train_uninf_pixel)

7000

In [43]:
# Normalize the data in the training uninfected pixel database 
train_uninf_pixel = train_uninf_pixel/255

## Engineering a column named 'Target' in the infected training dataframe and assigning the value 0
train_uninf_pixel["Target"]= 0

In [44]:
# concatenating the 2 infected and uninfeacted trained data frame containing pixel data done for ML
train_df = pd.concat([train_inf_pixel,train_uninf_pixel],axis = 0)

In [45]:
#Check the dataframe 
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1866,1867,1868,1869,1870,1871,1872,1873,1874,Target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12549,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [46]:
# Splitting the training data, data to X_train and y_train 
X = train_df.drop(columns = "Target").values
y = train_df['Target'].values


In [47]:
# reshaping the X_train data into shape (-1,25,25,3)
X = X.reshape(-1,25,25,3)

In [48]:
# reshaping the array to fit into a CNN ML model
X.shape

(14000, 25, 25, 3)

In [49]:
#Check the length of y
len(y)

14000

In [50]:
# To split the data into training, validating and testing datasets - 2 steps are required
# First, split the data into a temporary set (X_temp, X_test, y_temp, y_test)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Then, split the temporary set into training and validation sets (X_train, X_val, y_train, y_val)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

IMAGE DATASET MODEL TRIALS - K-NN

In [51]:
# K-NN model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Flatten the data
X_train_flattened = X_train.reshape(X_train.shape[0], -1)
X_val_flattened = X_val.reshape(X_val.shape[0], -1)
X_test_flattened = X_test.reshape(X_test.shape[0], -1)

# Standardize the data
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train_flattened)
X_val_standardized = scaler.transform(X_val_flattened)
X_test_standardized = scaler.transform(X_test_flattened)

# Initialize the K-NN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=2) 

# Train the classifier
knn_classifier.fit(X_train_standardized, y_train)

# Predict on the validation set and calculate accuracy
y_val_pred = knn_classifier.predict(X_val_standardized)
val_accuracy = accuracy_score(y_val, y_val_pred)

val_accuracy


0.5717857142857142

In [52]:
from sklearn.model_selection import cross_val_score

# Assuming X_train_standardized and y_train are already defined from previous steps
# This code will search for the best value of k (n_neighbors) using cross-validation

# Define a range of potential k values
k_values = list(range(1, 21))  # This searches k from 1 to 20; you can adjust the range as needed

# Perform cross-validation for each k value
cv_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_standardized, y_train, cv=10, scoring='accuracy')  # 5-fold cross-validation
    cv_scores.append(scores.mean())

# Determine the best k value
optimal_k = k_values[cv_scores.index(max(cv_scores))]

optimal_k


3

In [53]:

# Flatten the data
X_train_flattened = X_train.reshape(X_train.shape[0], -1)
X_val_flattened = X_val.reshape(X_val.shape[0], -1)
X_test_flattened = X_test.reshape(X_test.shape[0], -1)

# Standardize the data
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train_flattened)
X_val_standardized = scaler.transform(X_val_flattened)
X_test_standardized = scaler.transform(X_test_flattened)

# Initialize the K-NN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=3) 

# Train the classifier
knn_classifier.fit(X_train_standardized, y_train)

# Predict on the validation set and calculate accuracy
y_val_pred = knn_classifier.predict(X_val_standardized)
val_accuracy = accuracy_score(y_val, y_val_pred)

val_accuracy


0.5935714285714285