In [11]:
import pandas as pd
# Read the three competition CSVs (training rows, rows to score, submission template)
# in one pass; the paths are relative to the notebook's working directory.
df, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./System-Threat-Forecaster/train.csv",
        "./System-Threat-Forecaster/test.csv",
        './System-Threat-Forecaster/sample_submission.csv',
    )
)

# Preview the first training rows as the cell output.
df.head()


Unnamed: 0,MachineID,ProductName,EngineVersion,AppVersion,SignatureVersion,IsBetaUser,RealTimeProtectionState,IsPassiveModeEnabled,AntivirusConfigID,NumAntivirusProductsInstalled,...,IsSecureBootEnabled,IsVirtualDevice,IsTouchEnabled,IsPenCapable,IsAlwaysOnAlwaysConnectedCapable,IsGamer,RegionIdentifier,DateAS,DateOS,target
0,f541bae429089117c4aac39c90dd3416,win8defender,1.1.15200.1,4.18.1807.18075,1.275.1003.0,0,7.0,0,53447.0,1.0,...,0,0.0,1,0,1.0,0.0,6.0,2018-09-10 10:11:00,2018-04-17,0
1,dc2b14d9ce3a0ce4050bb640190f2ca5,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1465.0,0,7.0,0,53447.0,1.0,...,1,0.0,0,0,0.0,0.0,10.0,2018-08-16 00:01:00,2018-08-14,1
2,fd20c5f010e9c5f91ad1c6b3e0da68a0,win8defender,1.1.15200.1,4.18.1807.18075,1.275.1546.0,0,7.0,0,53447.0,1.0,...,0,0.0,0,0,0.0,1.0,6.0,2018-09-20 23:20:00,2018-09-11,1
3,38711eae85eb77a72ec5dfdf27eb2a76,win8defender,1.1.15200.1,4.12.17007.18011,1.275.1141.0,0,7.0,0,46413.0,2.0,...,1,0.0,0,0,0.0,0.0,12.0,2018-09-14 00:32:00,2018-01-03,1
4,32607c9a543a9214e2c7e45800ed4849,win8defender,1.1.15200.1,4.13.17134.228,1.275.1283.0,0,7.0,0,40466.0,2.0,...,0,0.0,0,0,0.0,1.0,7.0,2018-09-15 19:34:00,2018-09-11,0


In [12]:
# Keep only the rows that carry a label: a row is dropped when its 'target' is missing.
df = df.dropna(axis='index', how='any', subset=['target'])

# Print the column / non-null / dtype summary of the remaining training frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 76 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   MachineID                           100000 non-null  object 
 1   ProductName                         100000 non-null  object 
 2   EngineVersion                       100000 non-null  object 
 3   AppVersion                          100000 non-null  object 
 4   SignatureVersion                    100000 non-null  object 
 5   IsBetaUser                          100000 non-null  int64  
 6   RealTimeProtectionState             99934 non-null   float64
 7   IsPassiveModeEnabled                100000 non-null  int64  
 8   AntivirusConfigID                   99924 non-null   float64
 9   NumAntivirusProductsInstalled       99924 non-null   float64
 10  NumAntivirusProductsEnabled         99924 non-null   float64
 11  HasTpm                     

In [13]:
from sklearn.impute import SimpleImputer

# Split the training columns by dtype. 'target' is removed from the numeric list
# (if present) so the label itself is never imputed.
num_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('target', errors='ignore')
)
cat_cols = df.select_dtypes(include=['object']).columns

# Numeric gaps: learn per-column means on the training frame only,
# then apply those same means to both frames.
imputer_num = SimpleImputer(strategy='mean').fit(df[num_cols])
df[num_cols] = imputer_num.transform(df[num_cols])
test_data[num_cols] = imputer_num.transform(test_data[num_cols])

# Object-typed gaps: learn the most frequent value per column on the training
# frame only, then apply it to both frames.
imputer_cat = SimpleImputer(strategy='most_frequent').fit(df[cat_cols])
df[cat_cols] = imputer_cat.transform(df[cat_cols])
test_data[cat_cols] = imputer_cat.transform(test_data[cat_cols])

print("Missing values handled successfully!")


Missing values handled successfully!


In [14]:
# Replace the two date columns with their calendar month (1-12).
# The same conversion is applied to the training frame AND the frame to be scored:
# converting only `df` would leave the test dates as raw strings, so the model would
# be trained on month numbers but asked to predict from a differently-encoded feature.
for frame in (df, test_data):
    for date_col in ('DateAS', 'DateOS'):
        frame[date_col] = pd.to_datetime(frame[date_col]).dt.month

## Use Label Encoding for All Categorical Columns

In [15]:
from sklearn.preprocessing import LabelEncoder

# Automatically identify categorical (object-typed) columns in each frame
categorical_cols = df.select_dtypes(include=['object']).columns
test_categorical_cols = test_data.select_dtypes(include=['object']).columns

# Apply Label Encoding.
# The encoder is fitted on the TRAINING column only and the learned label -> code
# mapping is reused for the test column. Fitting a second encoder on the test data
# would assign codes from the test set's own sorted labels, so the same category
# could receive different integers in the two frames and predictions would be
# made on mismatched features.
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    if col in test_categorical_cols:
        code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
        # Categories never seen during training have no code; mark them with -1.
        test_data[col] = test_data[col].map(code_by_label).fillna(-1).astype('int64')

# Object columns that exist only in the test frame have no training encoder to
# reuse; encode them on their own so the frame still ends up fully numeric.
for col in test_categorical_cols.difference(categorical_cols):
    test_data[col] = LabelEncoder().fit_transform(test_data[col])

In [16]:
# Keep only rows whose target lies within 1.5 * IQR of the quartiles (bounds inclusive).
# NOTE(review): the previewed rows show target values of 0 and 1 only; if the target
# is binary this filter most likely removes nothing — confirm whether it is needed.
Q1, Q3 = df['target'].quantile([0.25, 0.75])
IQR = Q3 - Q1
df = df[df['target'].between(Q1 - 1.5*IQR, Q3 + 1.5*IQR)]


In [17]:
from sklearn.model_selection import train_test_split
# Separate the label from the feature matrix.
y = df['target']
X = df.drop('target', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [18]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Gradient-boosted regression trees with Huber loss.
# NOTE(review): the target looks binary in the data preview and the predictions are
# thresholded at 0.5 later on — confirm a regressor is intended here.
model = GradientBoostingRegressor(
    n_estimators=550,
    learning_rate=0.05,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    subsample=0.8,
    max_features=0.8,
    loss='huber',
    alpha=0.75,
    # subsample < 1 and max_features < 1 both draw random subsets while fitting;
    # without a fixed seed every run produces a different model and different
    # metrics. Uses the same seed as the train/test split.
    random_state=42,
)
model.fit(X_train, y_train)

# Predict on both partitions so the fit can be compared against the held-out rows.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Train performance
train_r2 = r2_score(y_train, y_pred_train)
train_mae = mean_absolute_error(y_train, y_pred_train)
train_mse = mean_squared_error(y_train, y_pred_train)

# Test performance
test_r2 = r2_score(y_test, y_pred_test)
test_mae = mean_absolute_error(y_test, y_pred_test)
test_mse = mean_squared_error(y_test, y_pred_test)

print(f"Train R²: {train_r2}")
print(f"Test R²: {test_r2}")
print(f"Train MAE: {train_mae}")
print(f"Test MAE: {test_mae}")
print(f"Train MSE: {train_mse}")
print(f"Test MSE: {test_mse}")


Train R²: 0.15349755044352908
Test R²: 0.059956944557101544
Train MAE: 0.4098400227057514
Test MAE: 0.43481371687790343
Train MSE: 0.21160413119906613
Test MSE: 0.23497578485863155


In [19]:
# Score the competition test frame with the fitted model.
test_predictions = model.predict(test_data)

# Start from the provided template so the non-target columns of the submission
# are carried over unchanged.
submission = sample_submission.copy()

# Turn the continuous scores into the string labels '1' / '0':
# strictly greater than 0.5 becomes '1', everything else becomes '0'.
submission['target'] = ['1' if score > 0.5 else '0' for score in test_predictions]

# Write the file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

# Output file ready for submission
print("Submission file created.")


Submission file created.
