In [9]:
import csv
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np

In [10]:
# Generate simulated dataset
def generate_data(num_samples=200000):
    data = []
    for _ in range(num_samples):
        connection_duration = random.uniform(1, 60)  # in minutes
        signal_strength = random.uniform(-100, -30)  # in dBm
        data_transfer_rate = random.uniform(1, 100)  # in Mbps
        time_of_day = random.uniform(0, 24)  # in hours

        # Simulate intrusions (10% chance)
        is_intrusion = random.random() < 0.3

        # Adjust parameters for intrusions to make them more distinct
        if is_intrusion:
            connection_duration *= 1.5
            signal_strength *= 0.8
            data_transfer_rate *= 1.2

        data.append([connection_duration, signal_strength, data_transfer_rate, time_of_day, int(is_intrusion)])

    with open('bluetooth_new_data.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['connection_duration', 'signal_strength', 'data_transfer_rate', 'time_of_day', 'is_intrusion'])
        writer.writerows(data)
        
generate_data(num_samples=200000)

In [11]:
def preprocess_data(file_path):
    try:
        data = pd.read_csv(file_path)
        print(f"NaN values in dataset:\n{data.isna().sum()}")
        
        # Remove rows with NaN values
        data_cleaned = data.dropna()
        
        # Split features and target
        X = data_cleaned.iloc[:, :-1].values
        y = data_cleaned.iloc[:, -1].values
        
        return X, y
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found. Generating new data...")
        generate_data()
        return preprocess_data(file_path)

X, y = preprocess_data('bluetooth_new_data.csv')

NaN values in dataset:
connection_duration    0
signal_strength        0
data_transfer_rate     0
time_of_day            0
is_intrusion           0
dtype: int64


In [12]:
def train_model():
    X, y = preprocess_data('bluetooth_data.csv')
    
    if len(X) == 0 or len(y) == 0:
        raise ValueError("All data rows contained NaN values and were removed. Please review your data collection process.")

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

    # Try both Logistic Regression and Random Forest
    models = {
        'Logistic Regression': LogisticRegression(class_weight='balanced'),
        'Random Forest': RandomForestClassifier(class_weight='balanced', n_estimators=100)
    }

    best_model = None
    best_accuracy = 0

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        
        print(f"\n{name} Results:")
        print(classification_report(y_test, y_pred))
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Precision: {precision:.2f}")

        # Perform cross-validation
        cv_scores = cross_val_score(model, X_scaled, y, cv=5)
        print(f"Cross-validation scores: {cv_scores}")
        print(f"Mean CV score: {cv_scores.mean():.2f}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model

    return best_model, scaler

train_model()

NaN values in dataset:
connection_duration    0
signal_strength        0
data_transfer_rate     0
time_of_day            0
label                  0
dtype: int64

Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.91      0.54      0.68      2711
           1       0.10      0.50      0.17       289

    accuracy                           0.53      3000
   macro avg       0.51      0.52      0.42      3000
weighted avg       0.83      0.53      0.63      3000

Accuracy: 0.53
Precision: 0.10
Cross-validation scores: [0.539  0.529  0.5005 0.523  0.507 ]
Mean CV score: 0.52

Random Forest Results:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      2711
           1       0.00      0.00      0.00       289

    accuracy                           0.90      3000
   macro avg       0.45      0.50      0.47      3000
weighted avg       0.82      0.90      0.86      3000

Accuracy: 0.90
Pre

(RandomForestClassifier(class_weight='balanced'), StandardScaler())

In [6]:
# Classify a connection
def classify_connection(model, scaler, sample):
    sample_scaled = scaler.transform([sample])
    return model.predict(sample_scaled)[0]

# Block an intrusion (simulated)
def block_intrusion(sample):
    print(f"Blocking intrusion: {sample}\n")

In [7]:
# Visualization functions
def plot_connections(data):
    plt.figure(figsize=(10, 6))
    plt.scatter(data[:, 0], data[:, 2], c=data[:, -1], cmap='bwr')
    plt.xlabel('Connection Duration')
    plt.ylabel('Data Transfer Rate')
    plt.title('Bluetooth Connections')
    plt.colorbar(label='Intrusion (1) / Normal (0)')
    plt.show()

def plot_feature_importance(model, feature_names):
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]

        plt.figure(figsize=(10, 6))
        plt.title("Feature Importances")
        plt.bar(range(len(importances)), importances[indices])
        plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=45)
        plt.tight_layout()
        plt.show()
    else:
        print("This model doesn't support feature importance visualization.")