In [26]:
import pandas as pd
# Read the training split into a DataFrame and preview its first five rows.
df = pd.read_csv("./System-Threat-Forecaster/train.csv")
df.head(5)

Unnamed: 0,MachineID,ProductName,EngineVersion,AppVersion,SignatureVersion,IsBetaUser,RealTimeProtectionState,IsPassiveModeEnabled,AntivirusConfigID,NumAntivirusProductsInstalled,...,IsSecureBootEnabled,IsVirtualDevice,IsTouchEnabled,IsPenCapable,IsAlwaysOnAlwaysConnectedCapable,IsGamer,RegionIdentifier,DateAS,DateOS,target
0,f541bae429089117c4aac39c90dd3416,win8defender,1.1.15200.1,4.18.1807.18075,1.275.1003.0,0,7.0,0,53447.0,1.0,...,0,0.0,1,0,1.0,0.0,6.0,2018-09-10 10:11:00,2018-04-17,0
1,dc2b14d9ce3a0ce4050bb640190f2ca5,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1465.0,0,7.0,0,53447.0,1.0,...,1,0.0,0,0,0.0,0.0,10.0,2018-08-16 00:01:00,2018-08-14,1
2,fd20c5f010e9c5f91ad1c6b3e0da68a0,win8defender,1.1.15200.1,4.18.1807.18075,1.275.1546.0,0,7.0,0,53447.0,1.0,...,0,0.0,0,0,0.0,1.0,6.0,2018-09-20 23:20:00,2018-09-11,1
3,38711eae85eb77a72ec5dfdf27eb2a76,win8defender,1.1.15200.1,4.12.17007.18011,1.275.1141.0,0,7.0,0,46413.0,2.0,...,1,0.0,0,0,0.0,0.0,12.0,2018-09-14 00:32:00,2018-01-03,1
4,32607c9a543a9214e2c7e45800ed4849,win8defender,1.1.15200.1,4.13.17134.228,1.275.1283.0,0,7.0,0,40466.0,2.0,...,0,0.0,0,0,0.0,1.0,7.0,2018-09-15 19:34:00,2018-09-11,0


Preprocessing Pipeline
The preprocessing steps below are to be used for all the questions in this milestone.

    Impute the categorical columns with the mode and the numeric columns with the mean.

    Perform label encoding on all the categorical columns. Use a standard scaler to scale all the numeric columns, then perform a train-test split with a test size of 0.2 and a random state of 42.

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Split the columns by dtype. The label column `target` is kept out of the
# numeric list on purpose: mean-imputing it would invent fractional class
# labels for any row whose label is missing, and writing the imputer's float
# output back would change the label's dtype. This mirrors the scaling step,
# which also excludes `target`.
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['int32', 'int64', 'float64']).columns.drop('target', errors='ignore')

# One imputer per column family: mean for numeric, mode for categorical.
mean_imp = SimpleImputer(strategy='mean')
mode_imp = SimpleImputer(strategy='most_frequent')

# Fill missing values in place, column family by column family.
df[num_cols] = mean_imp.fit_transform(df[num_cols])
df[cat_cols] = mode_imp.fit_transform(df[cat_cols])

# Replace every categorical column with integer codes. The single encoder is
# re-fitted on each column, so after the loop it only remembers the classes of
# the last column processed.
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

In [28]:
# Standardise every int/float column except the label.
# NOTE(review): the categorical columns were replaced by label-encoder codes
# in the previous cell, so they are presumably integer-typed by now and get
# scaled here as well - confirm this is intended.
scaler = StandardScaler()
num_cols = df.select_dtypes(include=['int32', 'int64', 'float64']).columns.drop('target', errors='ignore')
df[num_cols] = scaler.fit_transform(df[num_cols])

In [29]:
from sklearn.model_selection import train_test_split
# Separate the label from the predictors, then hold out 20% of the rows
# as a test set (seeded so the split is reproducible).
y = df['target']
X = df.drop(columns=['target'])  # everything except the label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Principal Component Analysis
Perform PCA on the dataset with default parameters and answer the following questions

In [30]:
from sklearn.decomposition import PCA
# Full PCA: n_components=None (the default) is spelled out explicitly.
# NOTE(review): the frame passed in is `df`, which still contains the
# `target` column - confirm that including the label is intended.
pca = PCA(n_components=None)
X_pca = pca.fit_transform(df)


Q1. What is the number of components required to explain 70% of the variance? *


In [31]:
import numpy as np
# Running total of the per-component explained-variance ratios.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# argmax on the boolean mask gives the index of the first component at which
# the running total reaches 0.70; +1 converts that index into a count.
reaches_70 = cumulative_variance >= 0.70
n_components_70 = np.argmax(reaches_70) + 1

print(f"Number of components required to explain 70% variance: {n_components_70}")


Number of components required to explain 70% variance: 23


Q2. Reconstruct the data using the number of components obtained in the previous question and enter the Mean Squared Error value obtained.

In [32]:
from sklearn.metrics import mean_squared_error
# Project `df` onto the first `n_components_70` principal components...
pca_70 = PCA(n_components=n_components_70)
X_reduced = pca_70.fit_transform(df)

# ...map the projection back into the original feature space...
X_reconstructed = pca_70.inverse_transform(X_reduced)

# ...and measure how far the reconstruction is from the input frame.
mse = mean_squared_error(df, X_reconstructed)

print(f"Mean Squared Error (MSE): {mse}")


Mean Squared Error (MSE): 0.27543198862915846


Q3. How much variance is explained by 40 components? *

In [33]:
# Running total of explained-variance ratios from the full PCA fit.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# The total after 40 components sits at zero-based position 40 - 1.
explained_variance_40 = cumulative_variance[40 - 1]

print(f"Variance explained by first 40 components: {explained_variance_40:.4f}")


Variance explained by first 40 components: 0.9117


Feature Selection
Select top 15 features from the train dataset using f_classif as the score function

In [34]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold

# Label vs. predictors, followed by the same seeded 80/20 split.
y = df["target"]
X = df.drop(columns=["target"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 1: drop zero-variance columns from the training predictors
# (presumably because a constant column cannot be scored meaningfully).
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(X_train)
X_train_filtered = constant_filter.transform(X_train)

# Names of the columns that survived the variance filter.
filtered_features = X_train.columns[constant_filter.get_support()]

# Step 2: keep the 15 surviving columns with the highest ANOVA F-score.
selector = SelectKBest(score_func=f_classif, k=15)
selector.fit(X_train_filtered, y_train)
X_train_selected = selector.transform(X_train_filtered)

# Map the selector's boolean mask back onto the surviving column names.
selected_features = filtered_features[selector.get_support()]

print("Top 15 selected features:")
print(selected_features)


Top 15 selected features:
Index(['EngineVersion', 'SignatureVersion', 'RealTimeProtectionState',
       'AntivirusConfigID', 'NumAntivirusProductsInstalled', 'Processor',
       'IsSystemProtected', 'ProcessorCoreCount', 'PrimaryDiskCapacityMB',
       'TotalPhysicalRAMMB', 'PowerPlatformRole', 'OSArchitecture',
       'IsTouchEnabled', 'IsAlwaysOnAlwaysConnectedCapable', 'IsGamer'],
      dtype='object')


What is the score of the best feature as obtained?

In [35]:
# F-score of every feature that was passed to the selector.
f_scores = selector.scores_

# Highest F-score. np.nanmax is used instead of the builtin max(): the builtin
# compares elements one by one in Python and gives an order-dependent answer
# if any score is NaN, whereas nanmax is vectorised and ignores NaN entries.
best_feature_score = np.nanmax(f_scores)

print(f"Score of the best feature: {best_feature_score:.4f}")

Score of the best feature: 1851.6051


Make use of different feature selection methods and describe your process and findings. 
Fit Lasso and Ridge models and provide your insights on the values of the coefficients obtained

In [36]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Base estimator whose coefficients RFE uses to rank the features.
model = LinearRegression()

# Recursive feature elimination down to 15 features, fitted on the
# training split only and then applied to both splits.
rfe = RFE(estimator=model, n_features_to_select=15)
rfe.fit(X_train, y_train)
X_train_selected = rfe.transform(X_train)
X_test_selected = rfe.transform(X_test)

# Column names of the features RFE kept.
selected_features = X_train.columns[rfe.get_support()]


Fit Lasso & Ridge on Selected Features

In [37]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Fit an L1-regularised linear model on the RFE-selected training features.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_selected, y_train)

# Score it on the held-out split.
y_pred_lasso = lasso.predict(X_test_selected)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print(f"Lasso MSE: {mse_lasso:.4f}")

# Label the coefficients with their feature names and show the ones that the
# L1 penalty did not shrink to exactly zero, matching the Ridge cell, which
# prints its coefficients in the same descending order.
lasso_coeffs = pd.Series(lasso.coef_, index=selected_features)
nonzero_lasso_coeffs = lasso_coeffs[lasso_coeffs != 0]
print(f"Lasso Coefficients (non-zero: {len(nonzero_lasso_coeffs)} of {len(lasso_coeffs)}):")
print(nonzero_lasso_coeffs.sort_values(ascending=False))


Lasso MSE: 0.2500


Ridge Regression (L2)

In [38]:
from sklearn.linear_model import Ridge

# Fit an L2-regularised linear model on the RFE-selected training features.
ridge = Ridge(alpha=0.1).fit(X_train_selected, y_train)

# Score it on the held-out split.
y_pred_ridge = ridge.predict(X_test_selected)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f"Ridge MSE: {mse_ridge:.4f}")

# Label each coefficient with its feature name and list them from the
# largest to the smallest value.
ridge_coeffs = pd.Series(data=ridge.coef_, index=selected_features)
print("Ridge Coefficients:")
print(ridge_coeffs.sort_values(ascending=False))


Ridge MSE: 0.2392
Ridge Coefficients:
OSEdition                             0.082527
OSProductSuite                        0.050425
PrimaryDiskCapacityMB                 0.036127
IsSystemProtected                     0.033359
OSInstallLanguageID                   0.030409
AntivirusConfigID                     0.026930
IsGamer                               0.025368
PrimaryDisplayResolutionHorizontal    0.007679
IsVirtualDevice                      -0.022262
SystemVolumeCapacityMB               -0.023591
Processor                            -0.026386
IsAlwaysOnAlwaysConnectedCapable     -0.026581
OSUILocaleID                         -0.031280
OSSkuFriendlyName                    -0.031910
NumAntivirusProductsInstalled        -0.063432
dtype: float64
