# Part A, Task 1: Data loading and preprocessing

In [8]:
import os

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the dataset.
# The CSV location can be overridden with the BANKNOTE_CSV environment variable so the
# notebook can run on another machine; without it, the original local path is used.
DATA_PATH = os.environ.get(
    "BANKNOTE_CSV",
    "C:\\Users\\harip\\OneDrive\\Desktop\\Data analytics\\machine leaning\\assessment1\\dataset\\BankNote_Authentication.csv",
)
df = pd.read_csv(DATA_PATH)

# Display the first few entries for a snapshot of the data structure
print(df.head())




   variance  skewness  curtosis  entropy  class
0   3.62160    8.6661   -2.8073 -0.44699      0
1   4.54590    8.1674   -2.4586 -1.46210      0
2   3.86600   -2.6383    1.9242  0.10645      0
3   3.45660    9.5228   -4.0112 -3.59440      0
4   0.32924   -4.4552    4.5718 -0.98880      0


In [9]:
# Count the NaN entries in every column of the raw frame and report the per-column totals
# (isna() is the same check as isnull()).
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 variance    0
skewness    0
curtosis    0
entropy     0
class       0
dtype: int64


In [24]:
# Assuming there are no missing values, we proceed to outlier removal.
# Z-scores are computed on the numeric predictor columns only: 'class' is the target
# label, and scoring it could discard every row of a rare class as an "outlier".
outlier_columns = [
    col for col in df.select_dtypes(include=[np.number]).columns if col != 'class'
]
z_scores = stats.zscore(df[outlier_columns])
abs_z_scores = np.abs(z_scores)

# Keep only the rows whose every predictor lies within 3 standard deviations of its mean.
# .copy() gives an independent frame, so later assignments do not touch `df`.
# Note: the surviving rows keep their original index labels (the index now has gaps).
df_clean = df[(abs_z_scores < 3).all(axis=1)].copy()


In [25]:

# Standardise every predictor column; 'class' is the target and is left untouched
numerical_features = list(filter(lambda column: column != 'class', df_clean.columns))
scaler = StandardScaler()
# Fit the scaler on the predictors and write the rescaled values back into the same columns
scaled_values = scaler.fit_transform(df_clean[numerical_features])
df_clean.loc[:, numerical_features] = scaled_values
# NOTE(review): the scaler is fitted on all rows of df_clean, i.e. before the data is
# split into train and test sets - consider fitting on the training subset only.


In [12]:
# Encoding categorical variables, if present.
# Initialize the OneHotEncoder. The `sparse` argument was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4, so try the new spelling first and fall back
# to the old one on releases that do not know it.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Identify categorical columns (Note: Adjust this if you have categorical columns)
categorical_features = df_clean.select_dtypes(include=['object']).columns.tolist()

if categorical_features:
    # Apply the encoder to the categorical columns
    encoded_features = encoder.fit_transform(df_clean[categorical_features])
    # The encoded frame must reuse df_clean's index. With a fresh 0..n-1 index the
    # axis=1 concat below outer-joins on the row labels, which re-creates previously
    # dropped rows as all-NaN and upcasts integer columns such as 'class' to float.
    encoded_vars_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(categorical_features),
        index=df_clean.index,
    )
else:
    # Nothing to encode: use an empty, index-aligned frame so the concat is a no-op
    # and the encoder is never fitted on zero columns.
    encoded_vars_df = pd.DataFrame(index=df_clean.index)
    encoded_features = encoded_vars_df.to_numpy()

# Concatenate the DataFrame with the original one and drop the original categorical columns
df_clean = pd.concat([df_clean.drop(categorical_features, axis=1), encoded_vars_df], axis=1)

# Display the cleaned DataFrame
print(df_clean.head())

   variance  skewness  curtosis   entropy  class
0  1.103540  1.186418 -1.013448  0.328655    0.0
1  1.434725  1.097028 -0.925150 -0.179100    0.0
2  1.191111 -0.839847  0.184664  0.605483    0.0
3  1.044419  1.339977 -1.318300 -1.245668    0.0
4 -0.076142 -1.165518  0.855089  0.057643    0.0




# Re-clean: handle any remaining missing values

In [16]:
# Drop every row that still contains a missing value.
# Rebinding the name (instead of inplace=True) keeps the step explicit and chainable.
df_clean = df_clean.dropna()


In [18]:
from sklearn.impute import SimpleImputer

# Fill any missing predictor values with the column mean (strategy='median' is the alternative)
imputer = SimpleImputer(strategy='mean')
mean_imputed_values = imputer.fit_transform(df_clean[numerical_features])
df_clean[numerical_features] = mean_imputed_values


In [19]:
from sklearn.impute import KNNImputer

# Impute any missing predictor values using the 5 nearest neighbours
# (this rebinds `imputer`, replacing whatever imputer was assigned before)
imputer = KNNImputer(n_neighbors=5)
knn_imputed_values = imputer.fit_transform(df_clean[numerical_features])
df_clean[numerical_features] = knn_imputed_values


In [21]:
# Report whether any column of the cleaned frame still has at least one NaN
if df_clean.isnull().sum().any():
    print("Missing values still present.")
else:
    print("All missing values handled successfully.")


All missing values handled successfully.


# Part c: Train/test split

In [26]:
from sklearn.model_selection import train_test_split

# Separate the predictors (every column except 'class') from the target column
y = df_clean['class']               # Dependent variable (target)
X = df_clean.drop(columns='class')  # Independent variables

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the dimensions of each subset as a sanity check
print(f"Training Features Shape: {X_train.shape}")
print(f"Training Target Shape: {y_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")
print(f"Testing Target Shape: {y_test.shape}")



Training Features Shape: (1068, 4)
Training Target Shape: (1068,)
Testing Features Shape: (268, 4)
Testing Target Shape: (268,)


# Task 2: Random Forest classifier

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a Random Forest with default hyperparameters (seeded for repeatability)
# and fit it on the training subset; fit() returns the fitted estimator itself
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out test rows
y_pred = rf_classifier.predict(X_test)

# Baseline evaluation: overall accuracy followed by the per-class report
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print("Accuracy of the basic model:", baseline_accuracy)
print(baseline_report)


Accuracy of the basic model: 0.9776119402985075
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       157
           1       0.98      0.96      0.97       111

    accuracy                           0.98       268
   macro avg       0.98      0.98      0.98       268
weighted avg       0.98      0.98      0.98       268



In [30]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter (3 x 3 x 2 x 2 = 36 combinations)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive search over the grid with 5-fold cross-validation on all available cores
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

# Run the search on the training subset and keep the best refitted model
# (fit() returns the fitted search object)
rf_optimized = grid_search.fit(X_train, y_train).best_estimator_

# Show which combination won
print("Best parameters found:", grid_search.best_params_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [31]:
# Predict the held-out test rows with the tuned model
y_pred_optimized = rf_optimized.predict(X_test)

# Evaluate the tuned model the same way as the baseline: accuracy plus per-class report
optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
optimized_report = classification_report(y_test, y_pred_optimized)
print("Optimized Random Forest Classifier Performance:")
print("Accuracy:", optimized_accuracy)
print(optimized_report)


Optimized Random Forest Classifier Performance:
Accuracy: 0.9776119402985075
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       157
           1       0.98      0.96      0.97       111

    accuracy                           0.98       268
   macro avg       0.98      0.98      0.98       268
weighted avg       0.98      0.98      0.98       268



In [32]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the tuned model on the test set
# (rows are the true classes, columns the predicted classes)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_optimized)

print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[155   2]
 [  4 107]]


In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the tuned model's test-set predictions with four standard metrics:
#   accuracy  - share of all predictions that are correct
#   precision - share of predicted positives that are truly positive
#   recall    - share of actual positives that were predicted positive
#   f1        - harmonic mean of precision and recall
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred_optimized)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.9776
Precision: 0.9817
Recall: 0.9640
F1 Score: 0.9727


In [34]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the tuned Random Forest over the full X / y
# NOTE(review): X and y include the rows that were held out as the test set - confirm
# this is the intended evaluation population.
cv_scores = cross_val_score(
    estimator=rf_optimized,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean()}")


Cross-Validation Accuracy Scores: [0.99253731 0.99625468 0.98876404 0.99625468 0.99625468]
Mean CV Accuracy: 0.9940130806640953
