In [56]:
import pandas as pd
df = pd.read_csv("./System-Threat-Forecaster/train.csv")
df.head()

Unnamed: 0,MachineID,ProductName,EngineVersion,AppVersion,SignatureVersion,IsBetaUser,RealTimeProtectionState,IsPassiveModeEnabled,AntivirusConfigID,NumAntivirusProductsInstalled,...,IsSecureBootEnabled,IsVirtualDevice,IsTouchEnabled,IsPenCapable,IsAlwaysOnAlwaysConnectedCapable,IsGamer,RegionIdentifier,DateAS,DateOS,target
0,f541bae429089117c4aac39c90dd3416,win8defender,1.1.15200.1,4.18.1807.18075,1.275.1003.0,0,7.0,0,53447.0,1.0,...,0,0.0,1,0,1.0,0.0,6.0,2018-09-10 10:11:00,2018-04-17,0
1,dc2b14d9ce3a0ce4050bb640190f2ca5,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1465.0,0,7.0,0,53447.0,1.0,...,1,0.0,0,0,0.0,0.0,10.0,2018-08-16 00:01:00,2018-08-14,1
2,fd20c5f010e9c5f91ad1c6b3e0da68a0,win8defender,1.1.15200.1,4.18.1807.18075,1.275.1546.0,0,7.0,0,53447.0,1.0,...,0,0.0,0,0,0.0,1.0,6.0,2018-09-20 23:20:00,2018-09-11,1
3,38711eae85eb77a72ec5dfdf27eb2a76,win8defender,1.1.15200.1,4.12.17007.18011,1.275.1141.0,0,7.0,0,46413.0,2.0,...,1,0.0,0,0,0.0,0.0,12.0,2018-09-14 00:32:00,2018-01-03,1
4,32607c9a543a9214e2c7e45800ed4849,win8defender,1.1.15200.1,4.13.17134.228,1.275.1283.0,0,7.0,0,40466.0,2.0,...,0,0.0,0,0,0.0,1.0,7.0,2018-09-15 19:34:00,2018-09-11,0


Preprocessing Pipeline
The preprocessing steps mentioned below are to be used for all the questions that are a part of this milestone

    Impute the categoric columns with the mode and the numeric columns with the mean.

    Perform Label encoding on all the categoric columns. Use a standard scaler to scale all the numeric columns, perform a train test split with a test size 0.2 and random state 42. 

In [57]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
le = LabelEncoder()
# Selecting categorical and numerical columns
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['int32', 'int64', 'float64']).columns

# Creating imputers
mean_imp = SimpleImputer(strategy='mean')  # For numerical columns
mode_imp = SimpleImputer(strategy='most_frequent')  # For categorical columns

# Applying imputers
df[num_cols] = mean_imp.fit_transform(df[num_cols]) 
df[cat_cols] = mode_imp.fit_transform(df[cat_cols]) 

for col in cat_cols:
    df[col] = le.fit_transform(df[col])  # Encode categorical values

In [58]:
num_cols = df.select_dtypes(include=['int32', 'int64', 'float64']).columns.drop('target', errors='ignore')
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [59]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['target'])  # Features
y = df[['target']] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Fit a Decision tree model (random state 42) on the training set and perform hyper parameter tuning using grid search with 3 folds and use scoring as accuracy, using the following values:

    max_depth: [20, 30]
    min_samples_split: [2, 5]
    min_samples_leaf: [1, 2]

In [60]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the Decision Tree model
dt = DecisionTreeClassifier(random_state=42)

# Define hyperparameter grid
param_grid = {
    'max_depth': [20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Perform Grid Search with 3-fold cross-validation
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Use accuracy as the metric
    n_jobs=-1,  # Use all available CPU cores
)

grid_search.fit(X_train, y_train)



Q1.What is the best value of max_depth?
Q2.What is the best value of min_samples_split? *
Q3.What is the best value of min_samples_leaf? *


In [61]:
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5}


Q4.Use the best estimator obtained from the previous question and compute the accuracy score on the validation set (as obtained in the train test split done earlier). Enter the score correct up to 2 decimals.

In [62]:
from sklearn.metrics import accuracy_score
best_dt = DecisionTreeClassifier(
    max_depth=20, 
    min_samples_leaf=2, 
    min_samples_split=5,
    random_state=42
)
best_dt.fit(X_train, y_train)
y_pred = best_dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.5662


Model 2

Fit an AdaBoostClassifier model (random state 42) on the training set and perform hyper parameter tuning using grid search with 3 folds and use scoring as accuracy, using the following values :

a. n_estimators: [10, 20, 30]

b. learning_rate: [5, 10]

c. algorithm: ['SAMME']

In [63]:
from sklearn.ensemble import AdaBoostClassifier
adaboost = AdaBoostClassifier( random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [10, 20, 30],    # Number of weak learners
    'learning_rate': [5, 10],        # Learning rate
    'algorithm': ['SAMME']           # Algorithm type
}

# Perform Grid Search with 3-fold cross-validation
grid_search = GridSearchCV(
    estimator=adaboost,
    param_grid=param_grid,
    scoring='accuracy',  
    cv=3,  # 3-fold cross-validation
    n_jobs=-1  # Use all available CPU cores
)

# Fit GridSearch to the training data
grid_search.fit(X_train, y_train)

print("Best Accuracy:", grid_search.best_score_)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best Accuracy: 0.4296750202907771


Q5.What is the best number of estimators from your grid search? *
Q6.What is the best value for learning rate? *

In [64]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'algorithm': 'SAMME', 'learning_rate': 5, 'n_estimators': 10}


Mention the accuracy score using the best parameters on the validation set (as obtained in the train test split done earlier). Enter the value correct up to 2 decimal points.

In [65]:
best_adaboost = AdaBoostClassifier(
    algorithm='SAMME',
    learning_rate=5,
    n_estimators=10,
    random_state=42
)
best_adaboost.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)


In [66]:
from sklearn.metrics import accuracy_score

y_pred = best_adaboost.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.4266
