<a href="https://colab.research.google.com/github/kavya-gee/Stats-Assignments/blob/main/ML_Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
from google.colab import files
uploaded = files.upload()

Saving diabetes_data_upload.csv to diabetes_data_upload (4).csv


In [42]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

df = pd.read_csv('diabetes_data_upload.csv')

# Handling Missing Values
# Identifying the missing values
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values[missing_values > 0]) # Displaying only columns with missing values
print("------------------------------------------------------------")

# Splitting into numerical and categorical features for imputation
numeric_features = df.select_dtypes(include=[np.number]).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Imputing numerical features with the mean (appropriate for most distributions)
numeric_imputer = SimpleImputer(strategy='mean')
df[numeric_features] = numeric_imputer.fit_transform(df[numeric_features])

# Imputing categorical features with the most frequent value
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_features] = categorical_imputer.fit_transform(df[categorical_features])

# Verifying if all missing values have been imputed
print("\nRemaining missing values:")
print(df.isnull().sum().sum())
print("------------------------------------------------------------")

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Scaling Data
# Standardization (Z-score)
scaler_z = StandardScaler()
df_scaled_z = pd.DataFrame(scaler_z.fit_transform(df[numeric_features]), columns=numeric_features)

# Normalization (Min-Max scaling)
scaler_mm = MinMaxScaler()
df_scaled_mm = pd.DataFrame(scaler_mm.fit_transform(df[numeric_features]), columns=numeric_features)

print("Standardized data (first 5 rows):")
print(df_scaled_z.head())
print("\nNormalized data (first 5 rows):")
print(df_scaled_mm.head())
print("------------------------------------------------------------")

# Handling Noise
# Choosing a numerical feature to inject noise into
feature_for_noise = numeric_features[0]

# Injecting random noise
df['noisy_feature'] = df[feature_for_noise] + np.random.normal(0, df[feature_for_noise].std() * 0.1, len(df))

# Smoothing the noisy feature using a rolling mean
df['smoothed_feature'] = df['noisy_feature'].rolling(window=5, center=True).mean()

print("Original, noisy, and smoothed feature (first 10 rows):")
print(df[[feature_for_noise, 'noisy_feature', 'smoothed_feature']].head(10))
print("------------------------------------------------------------")

from scipy import stats

# Handling Outliers
# Detecting outliers using Z-score
z_scores = np.abs(stats.zscore(df[numeric_features]))
outliers = (z_scores > 3).any(axis=1)

print(f"Number of outliers detected: {outliers.sum()}")
print("------------------------------------------------------------")

# Handling outliers by removing them
df_no_outliers = df[~outliers]

print(f"Shape of dataset before removing outliers: {df.shape}")
print(f"Shape of dataset after removing outliers: {df_no_outliers.shape}")
print("------------------------------------------------------------")

# Feature Selection

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('diabetes_data_upload.csv')

# Converting 'class' column to numerical using Label Encoding
label_encoder = LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])

# Preparing the dataset
X = df.drop(columns=['class'])
# Converting categorical features to numerical using one-hot encoding
X = pd.get_dummies(X, columns=X.select_dtypes(include=['object']).columns)
y = df['class']  # Targets variable ('class' column)


# Spliting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Filter Method: Mutual Information
mi_scores = mutual_info_classif(X_train, y_train) # Calculating mutual information scores
mi_importance = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
print("Mutual Information Feature Importance:")
print(mi_importance)
print("------------------------------------------------------------")

# 2. Wrapper Method: Recursive Feature Elimination
rfe = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=5) # Selecting top 5 features
rfe.fit(X_train, y_train)
rfe_importance = pd.Series(rfe.support_, index=X.columns) # Identify selected features
print("\nRFE Selected Features:")
print(rfe_importance[rfe_importance].index.tolist())  # Printing names of selected features
print("------------------------------------------------------------")

# 3. Embedded Method: Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_importance = pd.Series(np.abs(lasso.coef_), index=X.columns).sort_values(ascending=False)
print("\nLasso Feature Importance:")
print(lasso_importance)

Missing values per column:
Series([], dtype: int64)
------------------------------------------------------------

Remaining missing values:
0
------------------------------------------------------------
Standardized data (first 5 rows):
        Age
0 -0.661367
1  0.821362
2 -0.578993
3 -0.249498
4  0.986110

Normalized data (first 5 rows):
        Age
0  0.324324
1  0.567568
2  0.337838
3  0.391892
4  0.594595
------------------------------------------------------------
Original, noisy, and smoothed feature (first 10 rows):
    Age  noisy_feature  smoothed_feature
0  40.0      42.298508               NaN
1  58.0      58.654686               NaN
2  41.0      41.607472         50.135302
3  45.0      44.650761         52.668096
4  60.0      63.465082         52.156752
5  55.0      54.962476         56.821771
6  57.0      56.097965         61.113633
7  66.0      64.932569         62.202130
8  67.0      66.110072         59.996101
9  70.0      68.907569         56.612443
-------------------