In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [4]:
# Load dataset
DATASET_PATH = "latest.csv"  # Replace with your actual dataset path
df = pd.read_csv(DATASET_PATH)
df.head(3)

Unnamed: 0,sha256,sha1,md5,dex_date,apk_size,pkg_name,vercode,Vulnerability_Present,vt_scan_date,dex_size,markets
0,0000003B455A6C7AF837EF90F2EAFFD856E3B5CF49F5E2...,9C14D537A7ADB4CFC43D291352F73E05E0CCDD4A,3EDFC78AB53521942798AD551027D04F,05-04-2016 17:58,10386469,com.zte.bamachaye,121.0,0.0,15-06-2016 15:26,4765888,anzhi
1,0000014A634DB98F85038B833A8DFC50D5FB13A464E0B2...,C3EBEC52C9388BF67479FF1385A56C59B3E39E81,0A146750FB447CF3859C9CB659AB04F1,20-08-2014 13:58,3537486,com.tanersenel.onlinetvizle,16.0,0.0,20-11-2016 10:26,3170096,PlayDrone
2,000001A94F46A0C3DDA514E1F24E675648835BBA5EF3C3...,C0444D784685EFE5F6D9F28683B24B5873E509CB,EC82771AE018B93AD784A1FD2B625216,01-01-1980 00:00,52469861,com.firstchoice.myfirstchoice,1206145.0,0.0,17-03-2021 08:02,9201656,play.google.com


In [14]:
df['dex_date'] = pd.to_datetime(df['dex_date'], errors='coerce')
df['vt_scan_date'] = pd.to_datetime(df['vt_scan_date'], errors='coerce')
df['days_since_scan'] = (df['vt_scan_date'] - df['dex_date']).dt.days
df['days_since_scan'].fillna(0, inplace=True)  # Replace NaN with 0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['days_since_scan'].fillna(0, inplace=True)  # Replace NaN with 0


In [15]:
# Handle missing values in the target variable
df = df.dropna(subset=['Vulnerability_Present'])  # Drop rows with NaN in the target column

In [16]:
# Define features and target variable
features = ['apk_size', 'dex_size', 'vercode', 'days_since_scan']
target = 'Vulnerability_Present'

In [17]:
# Check columns
print(df.columns)

Index(['sha256', 'sha1', 'md5', 'dex_date', 'apk_size', 'pkg_name', 'vercode',
       'Vulnerability_Present', 'vt_scan_date', 'dex_size', 'markets',
       'days_since_scan'],
      dtype='object')


In [18]:
# Splitting data into training and testing sets
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [20]:
# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.85

Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.93    176239
         1.0       0.48      0.12      0.19     12460
         2.0       0.35      0.09      0.14      3095
         3.0       0.24      0.06      0.10      1686
         4.0       0.22      0.08      0.11      1249
         5.0       0.21      0.07      0.10      1328
         6.0       0.18      0.05      0.08      1151
         7.0       0.18      0.05      0.07      1104
         8.0       0.18      0.05      0.08       954
         9.0       0.11      0.03      0.04       758
        10.0       0.10      0.03      0.04       587
        11.0       0.09      0.02      0.04       553
        12.0       0.11      0.03      0.05       487
        13.0       0.10      0.03      0.05       524
        14.0       0.14      0.05      0.07       545
        15.0       0.16      0.06      0.09       467
        16.0       0.09      0.03   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
# Save the trained model to a .pkl file
joblib.dump(model, "model.pkl")
print("Model saved as 'model.pkl'")

Model saved as 'model.pkl'
