In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [47]:
# Assuming your dataset is in a CSV file
data = pd.read_csv('eda.csv')
data.head()


Unnamed: 0,Uninfected_Edge_Density,Infected_Edge_Density,Parasitized_Avg_Colors,Uninfected_Avg_Colors,Infected_MeanBlobSize,Infected_MaxBlobSize,Uninfected_MeanBlobSize,Uninfected_MaxBlobSize
0,0.1408,0.1744,133.491832,118.653807,5.027778,13.888889,2.074074,7.444444
1,0.1328,0.1696,132.62104,126.433029,5.833333,13.888889,1.585859,7.444444
2,0.1344,0.168,113.769025,128.820641,4.866667,13.888889,3.148148,13.888889
3,0.128,0.184,121.016622,129.87628,3.148148,13.888889,6.37037,17.111111
4,0.1376,0.1696,126.731932,128.073351,2.611111,7.444444,4.222222,13.888889


In [48]:
data.columns

Index(['Uninfected_Edge_Density', 'Infected_Edge_Density',
       'Parasitized_Avg_Colors', 'Uninfected_Avg_Colors',
       'Infected_MeanBlobSize', 'Infected_MaxBlobSize',
       'Uninfected_MeanBlobSize', 'Uninfected_MaxBlobSize'],
      dtype='object')

In [49]:
infected_df = data[["Infected_Edge_Density","Parasitized_Avg_Colors",'Infected_MeanBlobSize','Infected_MaxBlobSize']]

In [50]:
infected_df = infected_df.copy()

infected_df.rename(columns={
    "Infected_Edge_Density": "Edge_Density",
    "Parasitized_Avg_Colors": "Avg_Colors",
    "Infected_MeanBlobSize": "MeanBlobSize",
    "Infected_MaxBlobSize": "MaxBlobSize"
}, inplace=True)

In [51]:
infected_df["target"]= 0
infected_df

Unnamed: 0,Edge_Density,Avg_Colors,MeanBlobSize,MaxBlobSize,target
0,0.1744,133.491832,5.027778,13.888889,0
1,0.1696,132.621040,5.833333,13.888889,0
2,0.1680,113.769025,4.866667,13.888889,0
3,0.1840,121.016622,3.148148,13.888889,0
4,0.1696,126.731932,2.611111,7.444444,0
...,...,...,...,...,...
495,0.1584,125.966542,1.537037,7.444444,0
496,0.1392,126.543157,9.055556,17.111111,0
497,0.1792,118.729682,2.841270,13.888889,0
498,0.1424,108.920481,2.611111,13.888889,0


In [35]:
uninfected_df = data[['Uninfected_Edge_Density',
       'Uninfected_Avg_Colors','Uninfected_MeanBlobSize', 'Uninfected_MaxBlobSize']]

In [52]:
uninfected_df = uninfected_df.copy()

uninfected_df.rename(columns = {"Uninfected_Edge_Density": "Edge_Density",
    "Uninfected_Avg_Colors": "Avg_Colors",
    "Uninfected_MeanBlobSize": "MeanBlobSize",
    "Uninfected_MaxBlobSize": "MaxBlobSize"}, inplace=True)

In [53]:
uninfected_df["target"]= 1
uninfected_df

Unnamed: 0,Edge_Density,Avg_Colors,MeanBlobSize,MaxBlobSize,target
0,0.1408,118.653807,2.074074,7.444444,1
1,0.1328,126.433029,1.585859,7.444444,1
2,0.1344,128.820641,3.148148,13.888889,1
3,0.1280,129.876280,6.370370,17.111111,1
4,0.1376,128.073351,4.222222,13.888889,1
...,...,...,...,...,...
495,0.1360,134.838038,1.716049,7.444444,1
496,0.1392,127.553836,3.148148,13.888889,1
497,0.1296,138.398527,3.577778,13.888889,1
498,0.1248,133.434947,4.222222,13.888889,1


In [55]:
final_df = pd.concat([infected_df,uninfected_df],axis=0)
final_df

Unnamed: 0,Edge_Density,Avg_Colors,MeanBlobSize,MaxBlobSize,target
0,0.1744,133.491832,5.027778,13.888889,0
1,0.1696,132.621040,5.833333,13.888889,0
2,0.1680,113.769025,4.866667,13.888889,0
3,0.1840,121.016622,3.148148,13.888889,0
4,0.1696,126.731932,2.611111,7.444444,0
...,...,...,...,...,...
495,0.1360,134.838038,1.716049,7.444444,1
496,0.1392,127.553836,3.148148,13.888889,1
497,0.1296,138.398527,3.577778,13.888889,1
498,0.1248,133.434947,4.222222,13.888889,1


In [61]:
# Split the dataset into features (X) and the target variable (y)
X = final_df.drop('target',axis=1).values # Assuming 'Label' is the column that indicates infected or non-infected
y = final_df['target'].values

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [64]:
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)


In [65]:
y_pred = classifier.predict(X_test)


In [66]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# You can also print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.87      0.78      0.82        96
           1       0.82      0.89      0.85       104

    accuracy                           0.84       200
   macro avg       0.84      0.84      0.84       200
weighted avg       0.84      0.84      0.84       200



In [82]:
#SVC Model
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report



# Split the dataset into features (X) and the target variable (y)
X = final_df.drop('target', axis=1)
y = final_df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the SVM model
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.89      0.77      0.83        96
           1       0.81      0.91      0.86       104

    accuracy                           0.84       200
   macro avg       0.85      0.84      0.84       200
weighted avg       0.85      0.84      0.84       200



In [73]:
#SVC model with hyperparamter

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into features (X) and the target variable (y)
X = final_df.drop('target', axis=1)
y = final_df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define a parameter grid to search over
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'gamma': ['scale', 'auto'] + [0.001, 0.01, 0.1, 1]  # Kernel coefficient
}

# Create an SVM classifier
svm_classifier = SVC(random_state=42)

# Create a GridSearchCV object to find the best combination of hyperparameters
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best estimator found by GridSearchCV for predictions
best_svm_classifier = grid_search.best_estimator_
y_pred = best_svm_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ...................C=0.1, gamma=auto, 

In [83]:
# PCA 
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into features (X) and the target variable (y)
X = final_df.drop('target', axis=1)
y = final_df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Choose the number of components to capture 95% of variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Create an SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the SVM model on the reduced dimensionality data
svm_classifier.fit(X_train_pca, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test_pca)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with PCA: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy with PCA: 0.84
              precision    recall  f1-score   support

           0       0.89      0.77      0.83        96
           1       0.81      0.91      0.86       104

    accuracy                           0.84       200
   macro avg       0.85      0.84      0.84       200
weighted avg       0.85      0.84      0.84       200



In [85]:
#Decision Tree 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into features (X) and the target variable (y)
X = final_df.drop('target', axis=1)
y = final_df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Decision Tree model
tree_classifier = DecisionTreeClassifier(random_state=42)
tree_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = tree_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.78      0.72      0.75        96
           1       0.76      0.81      0.78       104

    accuracy                           0.77       200
   macro avg       0.77      0.76      0.76       200
weighted avg       0.77      0.77      0.76       200



In [86]:
# Ensemble ML
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report


# Assuming you have 'target' column containing labels (0 for not infected, 1 for infected)
X = final_df.drop('target', axis=1)
y = final_df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create individual classifiers
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
gradient_boosting = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Create a Voting Classifier (Ensemble of classifiers)
ensemble_classifier = VotingClassifier(estimators=[
    ('rf', random_forest),
    ('gradient_boosting', gradient_boosting)
], voting='hard')  # Use 'hard' for majority voting

# Fit the ensemble model on the training data
ensemble_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ensemble_classifier.predict(X_test)

# Evaluate the ensemble classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.81      0.82      0.82        96
           1       0.83      0.83      0.83       104

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.83      0.82      0.83       200



In [81]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report


# Assuming you have 'target' column containing labels (0 for not infected, 1 for infected)
X = final_df.drop('target', axis=1)
y = final_df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create individual classifiers
svc_classifier = SVC(kernel='linear', random_state=42, probability=True)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
neural_network = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

# Create a Voting Classifier (Ensemble of classifiers)
ensemble_classifier = VotingClassifier(estimators=[
    ('svc', svc_classifier),
    ('rf', random_forest),
    ('nn', neural_network)
], voting='soft')  # Use 'soft' for weighted voting based on probabilities

# Fit the ensemble model on the training data
ensemble_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ensemble_classifier.predict(X_test)

# Evaluate the ensemble classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.83      0.84      0.84        96
           1       0.85      0.84      0.84       104

    accuracy                           0.84       200
   macro avg       0.84      0.84      0.84       200
weighted avg       0.84      0.84      0.84       200

