This notebook is for exploring, visualizing, and analyzing the processed data.

***


In [3]:
# This script handles data cleaning and preparation.
import pandas as pd
import numpy as np
import re

def load_data(file_path):
    """
    Load the raw cybercrime dataset.
    """
    return pd.read_csv(file_path)

def clean_data(df):
    """
    Clean and preprocess the dataset.
    """
    # Normalize column names
    df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]

    # Explicitly cast numeric columns before filling
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].astype(float).fillna(0)  # Fill numeric NaNs with 0

    # Fill non-numeric columns
    for col in df.select_dtypes(exclude=[np.number]).columns:
        df[col] = df[col].fillna("Unknown")  # Fill non-numeric NaNs with "Unknown"

    # Extract year from a date column (example)
    if 'date' in df.columns:
        df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year

    # Example: Extract IP addresses using regex
    if 'description' in df.columns:
        df['ip_address'] = df['description'].str.extract(r'(\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b)')

    return df


def save_processed_data(df, output_path):
    """
    Save the cleaned dataset to a new CSV file.
    """
    df.to_csv(output_path, index=False)

if __name__ == "__main__":
    input_file = "/repos/PFDA-project/data/cyber_data.csv"
    output_file = "/repos/PFDA-project/data/processed_data.csv"

    # Load, clean, and save the data
    print("Loading data...")
    raw_data = load_data(input_file)
    print("Cleaning data...")
    cleaned_data = clean_data(raw_data)
    print(f"Saving processed data to {output_file}...")
    save_processed_data(cleaned_data, output_file)
    print("Preprocessing complete!")



Loading data...
Cleaning data...
Saving processed data to /repos/PFDA-project/data/processed_data.csv...
Preprocessing complete!


In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the processed data
data_path = "/repos/PFDA-project/data/processed_data.csv"
df = pd.read_csv(data_path)

# Display the first few rows of the dataset
df.head()

# --- Data Exploration ---
# Overview of the dataset
df.info()

# Summary statistics
df.describe()

# Distribution of cybercrime categories (example)
if 'category' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x='category', order=df['category'].value_counts().index, palette='viridis')
    plt.title('Distribution of Cybercrime Categories')
    plt.xticks(rotation=45)
    plt.show()

# --- Feature Engineering ---
# Create feature columns (example for machine learning)
if 'year' in df.columns and 'category' in df.columns:
    features = df[['year']]
    target = df['category']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# --- Machine Learning Model ---
# Random Forest Classifier (example)
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")

# --- Neural Network with TensorFlow ---
# Build a simple neural network
nn_model = Sequential([
    Dense(16, activation='relu', input_dim=X_train.shape[1]),
    Dense(8, activation='relu'),
    Dense(len(y_train.unique()), activation='softmax')  # Adjust for the number of categories
])

nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train, y_train, epochs=10, batch_size=32)

# --- Advanced Visualizations ---
# Correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

# Trends over time (example for 'year' and 'category')
if 'year' in df.columns and 'category' in df.columns:
    crime_trends = df.groupby('year')['category'].count()
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=crime_trends, marker="o")
    plt.title("Cybercrime Trends Over Time")
    plt.xlabel("Year")
    plt.ylabel("Number of Crimes")
    plt.show()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77623 entries, 0 to 77622
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   attackdate            77623 non-null  object 
 1   country               77623 non-null  object 
 2   spam                  77623 non-null  float64
 3   ransomware            77623 non-null  float64
 4   local_infection       77623 non-null  float64
 5   exploit               77623 non-null  float64
 6   malicious_mail        77623 non-null  float64
 7   network_attack        77623 non-null  float64
 8   on_demand_scan        77623 non-null  float64
 9   web_threat            77623 non-null  float64
 10  rank_spam             77623 non-null  float64
 11  rank_ransomware       77623 non-null  float64
 12  rank_local_infection  77623 non-null  float64
 13  rank_exploit          77623 non-null  float64
 14  rank_malicious_mail   77623 non-null  float64
 15  rank_network_attack

NameError: name 'X_train' is not defined