In [1]:
import pandas as pd
import numpy as np
import random
import hashlib
import string

In [2]:
np.random.seed(42)
random.seed(42)

N = 50000  # total rows

In [20]:
def random_sha256():
    return hashlib.sha256(''.join(random.choices(string.ascii_letters + string.digits, k=20)).encode()).hexdigest()

def random_permissions_list():
    perms = ["INTERNET", "READ_SMS", "WRITE_EXTERNAL_STORAGE", "ACCESS_FINE_LOCATION",
             "CALL_PHONE", "READ_CONTACTS", "CAMERA", "RECORD_AUDIO", "SEND_SMS"]
    return ','.join(random.sample(perms, random.randint(1,5)))

def random_app_label():
    return ''.join(random.choices(string.ascii_letters + string.digits, k=random.randint(5,12)))

def random_icon_hash():
    return hashlib.md5(''.join(random.choices(string.ascii_letters + string.digits, k=10)).encode()).hexdigest()

def random_cert_fingerprint():
    return hashlib.sha1(''.join(random.choices(string.ascii_letters + string.digits, k=15)).encode()).hexdigest()

def random_version_code():
    return random.randint(1, 200)

In [21]:
num_real = N // 2
num_fake = N - num_real
labels = [0]*num_real + [1]*num_fake
random.shuffle(labels)

In [35]:
rows = []

for label in labels:
    # Continuous features with overlap
    permissions_score = np.random.normal(10 + label*2, 5)  # overlap
    entropy = np.random.normal(4.5 + label*0.8, 1.5)       # increased std for overlap
    
    # Correlated suspicious strings
    base_suspicious = 4 + int(entropy-4) + np.random.randint(0,4) if label==0 else 6 + int(entropy-4) + np.random.randint(0,6)
    suspicious_strings = np.random.poisson(base_suspicious)
    
    # Discrete features
    cert_mismatch = np.random.choice([0,1], p=[0.85 - 0.3*label, 0.15 + 0.3*label])
    vt_ratio = np.random.randint(0 + label*5, 5 + label*10)
    icon_similarity = np.random.normal(0.85 - label*0.25, 0.15)
    ip_count = np.random.poisson(1 + label*2)
    url_count = np.random.poisson(1 + label*2)
    
    # Dangerous permissions probability
    probs_dp = np.array([0.7-0.3*label, 0.2, 0.1+0.2*label])
    probs_dp = probs_dp / probs_dp.sum()
    dangerous_permissions = np.random.choice([0,1,2], p=probs_dp)
    
    # Cert trusted match probability
    probs_ctm = np.array([0.9-0.5*label, 0.1+0.5*label])
    probs_ctm = probs_ctm / probs_ctm.sum()
    cert_trusted_match = np.random.choice([0,1], p=probs_ctm)
    
    probability_fake = np.random.uniform(0 + label*0.4, 0.6 + label*0.4)
    
    # Clip / realistic limits
    permissions_score = int(np.clip(permissions_score, 0, 30))
    entropy = round(np.clip(entropy, 2.5, 8.0),2)
    icon_similarity = round(np.clip(icon_similarity, 0.0, 1.0),2)
    ip_count = int(np.clip(ip_count, 0, 10))
    url_count = int(np.clip(url_count, 0, 10))
    suspicious_strings = int(np.clip(suspicious_strings, 0, 50))
    
    row = {
        "permissions_score": permissions_score,
        "entropy": entropy,
        "cert_mismatch": int(cert_mismatch),
        "suspicious_strings": suspicious_strings,
        "vt_ratio": vt_ratio,
        "icon_similarity": icon_similarity,
        "ip_count": ip_count,
        "url_count": url_count,
        "dangerous_permissions": int(dangerous_permissions),
        "permissions_list": random_permissions_list(),
        "cert_fingerprint": random_cert_fingerprint(),
        "cert_trusted_match": int(cert_trusted_match),
        "icon_hash": random_icon_hash(),
        "app_label": random_app_label(),
        "version_code": random_version_code(),
        "sha256": random_sha256(),
        "probability_fake": round(probability_fake,2),
        "label": label
    }
    rows.append(row)

In [49]:
df = pd.DataFrame(rows)

# ------------------------
# Inject 10-20% noise in key discrete features
# ------------------------
noise_cols = ['cert_mismatch','dangerous_permissions','cert_trusted_match']
for col in noise_cols:
    mask = np.random.rand(len(df)) < 0.15
    if df[col].dtype in [int,float]:
        df.loc[mask, col] = np.random.randint(df[col].min(), df[col].max()+1, mask.sum())
    else:
        df.loc[mask, col] = df[col].sample(mask.sum(), replace=True).values

# ------------------------
# Save CSV
# ------------------------
df.to_csv("finalmodel1.csv", index=False)
print("finalmodel.csv")

finalmodel.csv


In [5]:
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import shap
import os

In [6]:
data = pd.read_csv("finalmodel1.csv")

In [7]:
drop_cols = ["vt_ratio", "probability_fake", "sha256", "cert_fingerprint", "icon_hash", "app_label"]
data = data.drop(columns=[c for c in drop_cols if c in data.columns])

In [8]:
def process_permissions(perm_str):
    if not isinstance(perm_str,str) or perm_str.strip()=="":
        return [],0,0,0
    perms = [p.strip() for p in perm_str.split(",") if p.strip()]
    dangerous_keywords = ["READ_SMS","SEND_SMS","RECORD_AUDIO","READ_CONTACTS",
                          "WRITE_CONTACTS","READ_CALL_LOG","WRITE_CALL_LOG",
                          "ACCESS_FINE_LOCATION","READ_PHONE_STATE"]
    dangerous_count = sum(1 for p in perms if any(k in p for k in dangerous_keywords))
    normal_count = len(perms) - dangerous_count
    custom_count = sum(1 for p in perms if p.startswith("com.") or p.startswith("org."))
    return perms, dangerous_count, normal_count, custom_count

perm_info = data['permissions_list'].fillna("").apply(process_permissions)
data['perm_dangerous_count'] = perm_info.apply(lambda x:x[1])
data['perm_normal_count'] = perm_info.apply(lambda x:x[2])
data['perm_custom_count'] = perm_info.apply(lambda x:x[3])

# ------------------------
# Binning / Bucketing
# ------------------------
data['perm_dangerous_count'] = data['perm_dangerous_count'].apply(lambda x: 0 if x==0 else (1 if x==1 else 2))
data['perm_normal_count'] = data['perm_normal_count'].apply(lambda x: 0 if x<=2 else (1 if x<=5 else 2))
data['perm_custom_count'] = data['perm_custom_count'].apply(lambda x: 0 if x==0 else 1)
data['dangerous_permissions'] = data['dangerous_permissions'].apply(lambda x: 0 if x==0 else (1 if x==1 else 2))

# ------------------------
# Noise Injection (15% flip for high-leakage features)
# ------------------------
np.random.seed(42)
for col, max_val in zip(['perm_dangerous_count','perm_custom_count','cert_trusted_match','dangerous_permissions'], [2,1,1,2]):
    mask = np.random.rand(len(data)) < 0.15  # 15% randomly flip
    data.loc[mask, col] = np.random.randint(0, max_val+1, mask.sum())

# ------------------------
# Numeric Cleanup
# ------------------------
num_cols = ['permissions_score','entropy','cert_mismatch','suspicious_strings',
            'icon_similarity','ip_count','url_count',
            'dangerous_permissions','perm_dangerous_count','perm_normal_count','perm_custom_count']
for col in num_cols:
    data[col] = pd.to_numeric(data[col], errors="coerce").fillna(0)

In [9]:
features = ['permissions_score','entropy','cert_mismatch','suspicious_strings',
            'icon_similarity','ip_count','url_count',
            'dangerous_permissions','cert_trusted_match',
            'perm_dangerous_count','perm_normal_count','perm_custom_count']

X = data[features]
y = data['label']

In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ------------------------
# Train/Test Split
# ------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [11]:
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    min_samples_split=20,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [12]:
print("📊 Classification Report (Hold-out Test):")
print(classification_report(y_test, y_pred))

📊 Classification Report (Hold-out Test):
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      5000
           1       0.93      0.94      0.94      5000

    accuracy                           0.94     10000
   macro avg       0.94      0.94      0.94     10000
weighted avg       0.94      0.94      0.94     10000



In [13]:
cv_scores = cross_val_score(clf, X_scaled, y, cv=5, scoring='accuracy')
print(f"\n✅ CV Accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")


✅ CV Accuracy: 0.934 ± 0.004


In [14]:
# Count unique labels
print("Label distribution:")
print(data['label'].value_counts())

Label distribution:
label
1    25000
0    25000
Name: count, dtype: int64


In [15]:
import numpy as np
from sklearn.metrics import accuracy_score

print("🔎 Checking each feature individually...\n")

for col in features:
    # Simple threshold-based classifier
    preds = (data[col] > data[col].median()).astype(int)
    acc = accuracy_score(y, preds)
    unique_vals = data[col].nunique()
    print(f"{col:<30} | Unique: {unique_vals:<5} | Single-feature Accuracy: {acc:.3f}")

    if acc > 0.95:
        print(f"⚠️  Potential leakage: {col} alone gives {acc:.3f} accuracy!\n")


🔎 Checking each feature individually...

permissions_score              | Unique: 31    | Single-feature Accuracy: 0.576
entropy                        | Unique: 551   | Single-feature Accuracy: 0.608
cert_mismatch                  | Unique: 2     | Single-feature Accuracy: 0.629
suspicious_strings             | Unique: 28    | Single-feature Accuracy: 0.706
icon_similarity                | Unique: 99    | Single-feature Accuracy: 0.204
ip_count                       | Unique: 11    | Single-feature Accuracy: 0.751
url_count                      | Unique: 11    | Single-feature Accuracy: 0.748
dangerous_permissions          | Unique: 3     | Single-feature Accuracy: 0.594
cert_trusted_match             | Unique: 2     | Single-feature Accuracy: 0.679
perm_dangerous_count           | Unique: 3     | Single-feature Accuracy: 0.500
perm_normal_count              | Unique: 2     | Single-feature Accuracy: 0.498
perm_custom_count              | Unique: 2     | Single-feature Accuracy: 0.497

In [16]:
import shap
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_scaled)

In [28]:
MODEL_FILE = "datamodel.pkl"

joblib.dump({
    'model': clf,
    'scaler': scaler,
    'features': features
}, MODEL_FILE)

['datamodel.pkl']

In [29]:
MODEL_FILE = "datamodel.pkl"
ml_bundle = joblib.load(MODEL_FILE)

In [32]:
clf = ml_bundle['model']
scaler = ml_bundle['scaler']
features = ml_bundle['features']

In [36]:
dummy_data = pd.DataFrame({
    'permissions_score': [5, 12, 0],
    'entropy': [3.5, 7.2, 1.1],
    'cert_mismatch': [0, 1, 0],
    'suspicious_strings': [2, 8, 0],
    'icon_similarity': [0.9, 0.1, 0.5],
    'vt_ratio': [0.0, 0.8, 0.2],
    'perm_dangerous_count': [0, 2, 1],
    'perm_normal_count': [1, 2, 0],
    'ip_count':[0,1,0],
    'url_count':[0,2,0],
    'dangerous_permissions':[0,1,0],
    'cert_trusted_match':[1,0,1],
    'perm_custom_count':[0,1,0]
})


In [37]:
X_dummy_scaled = scaler.transform(dummy_data[features])

In [38]:
preds = clf.predict(X_dummy_scaled)
probs = clf.predict_proba(X_dummy_scaled)

print("Predictions:", preds)
print("Prediction probabilities:\n", probs)

Predictions: [0 1 0]
Prediction probabilities:
 [[0.99473925 0.00526075]
 [0.11660188 0.88339812]
 [0.66971512 0.33028488]]


In [47]:
dummy_apks = [
    {'permissions_score': 3, 'entropy': 2.1, 'cert_mismatch': 0, 'suspicious_strings': 1,
     'icon_similarity': 0.8, 'vt_ratio': 0.0, 'perm_dangerous_count': 0, 'perm_normal_count': 0,
     'ip_count':0, 'url_count':0, 'dangerous_permissions':0, 'cert_trusted_match':1, 'perm_custom_count':0},

    {'permissions_score': 15, 'entropy': 8.3, 'cert_mismatch': 1, 'suspicious_strings': 12,
     'icon_similarity': 0.2, 'vt_ratio': 0.9, 'perm_dangerous_count': 3, 'perm_normal_count': 2,
     'ip_count':2, 'url_count':3, 'dangerous_permissions':1, 'cert_trusted_match':0, 'perm_custom_count':1},

    {'permissions_score': 0, 'entropy': 1.0, 'cert_mismatch': 0, 'suspicious_strings': 0,
     'icon_similarity': 0.5, 'vt_ratio': 0.1, 'perm_dangerous_count': 0, 'perm_normal_count': 0,
     'ip_count':0, 'url_count':0, 'dangerous_permissions':0, 'cert_trusted_match':1, 'perm_custom_count':0},

    {'permissions_score': 8, 'entropy': 4.5, 'cert_mismatch': 0, 'suspicious_strings': 5,
     'icon_similarity': 0.6, 'vt_ratio': 0.3, 'perm_dangerous_count': 1, 'perm_normal_count': 1,
     'ip_count':1, 'url_count':1, 'dangerous_permissions':0, 'cert_trusted_match':1, 'perm_custom_count':0},

    {'permissions_score': 20, 'entropy': 9.0, 'cert_mismatch': 1, 'suspicious_strings': 15,
     'icon_similarity': 0.1, 'vt_ratio': 1.0, 'perm_dangerous_count': 4, 'perm_normal_count': 3,
     'ip_count':3, 'url_count':4, 'dangerous_permissions':1, 'cert_trusted_match':0, 'perm_custom_count':2},

    {'permissions_score': 2, 'entropy': 2.5, 'cert_mismatch': 0, 'suspicious_strings': 0,
     'icon_similarity': 0.7, 'vt_ratio': 0.0, 'perm_dangerous_count': 0, 'perm_normal_count': 1,
     'ip_count':0, 'url_count':0, 'dangerous_permissions':0, 'cert_trusted_match':1, 'perm_custom_count':0},

    {'permissions_score': 10, 'entropy': 6.5, 'cert_mismatch': 1, 'suspicious_strings': 10,
     'icon_similarity': 0.3, 'vt_ratio': 0.7, 'perm_dangerous_count': 2, 'perm_normal_count': 2,
     'ip_count':2, 'url_count':2, 'dangerous_permissions':1, 'cert_trusted_match':0, 'perm_custom_count':1},

    {'permissions_score': 1, 'entropy': 1.5, 'cert_mismatch': 0, 'suspicious_strings': 0,
     'icon_similarity': 0.9, 'vt_ratio': 0.1, 'perm_dangerous_count': 0, 'perm_normal_count': 0,
     'ip_count':0, 'url_count':0, 'dangerous_permissions':0, 'cert_trusted_match':1, 'perm_custom_count':0},

    {'permissions_score': 18, 'entropy': 8.0, 'cert_mismatch': 1, 'suspicious_strings': 14,
     'icon_similarity': 0.2, 'vt_ratio': 0.9, 'perm_dangerous_count': 3, 'perm_normal_count': 2,
     'ip_count':3, 'url_count':3, 'dangerous_permissions':1, 'cert_trusted_match':0, 'perm_custom_count':2},

    {'permissions_score': 5, 'entropy': 3.0, 'cert_mismatch': 0, 'suspicious_strings': 1,
     'icon_similarity': 0.8, 'vt_ratio': 0.2, 'perm_dangerous_count': 1, 'perm_normal_count': 1,
     'ip_count':1, 'url_count':1, 'dangerous_permissions':0, 'cert_trusted_match':1, 'perm_custom_count':0}
]


In [48]:
explainer = shap.TreeExplainer(clf)

In [49]:

for i, apk in enumerate(dummy_apks):
    # Keep feature names
    X_input = pd.DataFrame([apk], columns=features)
    X_scaled_input = scaler.transform(X_input)

    prob_fake = clf.predict_proba(X_scaled_input)[0][1] * 100
    shap_vals = explainer.shap_values(X_scaled_input)

    # Handle binary-class shap output
    shap_for_class1 = shap_vals[1] if isinstance(shap_vals, list) and len(shap_vals)==2 else shap_vals

    shap_values_sample = shap_for_class1[0].flatten()  # 1D array for features

    explanations = []
    for j, f in enumerate(features):
        if shap_values_sample[j] > 0:
            explanations.append(f"High {f} contributes to fake prediction")

    recommendation = "Uninstall immediately. Do not enter credentials." if prob_fake > 50 else "Likely safe, but exercise caution."

    print(f"\n--- Dummy APK {i+1} ---")
    print(f"Probability Fake: {prob_fake:.2f}%")
    print("Explanations:", explanations)
    print("Recommendation:", recommendation)



--- Dummy APK 1 ---
Probability Fake: 1.61%
Explanations: ['High permissions_score contributes to fake prediction', 'High cert_mismatch contributes to fake prediction', 'High icon_similarity contributes to fake prediction', 'High url_count contributes to fake prediction', 'High cert_trusted_match contributes to fake prediction', 'High perm_normal_count contributes to fake prediction']
Recommendation: Likely safe, but exercise caution.

--- Dummy APK 2 ---
Probability Fake: 98.37%
Explanations: ['High entropy contributes to fake prediction', 'High suspicious_strings contributes to fake prediction', 'High ip_count contributes to fake prediction', 'High dangerous_permissions contributes to fake prediction', 'High perm_dangerous_count contributes to fake prediction', 'High perm_custom_count contributes to fake prediction']
Recommendation: Uninstall immediately. Do not enter credentials.

--- Dummy APK 3 ---
Probability Fake: 32.47%
Explanations: ['High permissions_score contributes to fak