This notebook is for exploring, visualizing, and analyzing the processed data.

***


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the processed data
data_path = "data/processed_data.csv"
df = pd.read_csv(data_path)

# Preview the first few rows of the dataset.
# Printed explicitly: a bare expression is only rendered when it is the last
# statement of a notebook cell, and never when the code runs as a script.
print(df.head())

# --- Data Exploration ---
# Overview of the dataset (info() writes its report to stdout itself)
df.info()

# Summary statistics (describe() returns a DataFrame, so it must be printed)
print(df.describe())

# Bar chart of how many records fall into each cybercrime category (example).
# Skipped entirely when the dataset has no 'category' column.
if 'category' in df.columns:
    # Most frequent category first, so the bars are drawn in descending order.
    category_order = df['category'].value_counts().index

    plt.figure(figsize=(10, 6))
    sns.countplot(
        data=df,
        x='category',
        order=category_order,
        palette='viridis',
    )
    plt.xticks(rotation=45)  # tilt the category names on the x axis
    plt.title('Distribution of Cybercrime Categories')
    plt.show()

# --- Feature Engineering ---
# Create feature columns (example for machine learning).
# The split and both models all depend on the 'year' and 'category' columns,
# so the whole pipeline sits under one guard; otherwise a missing column would
# surface later as a NameError on X_train instead of a clear message.
if 'year' in df.columns and 'category' in df.columns:
    # Rows lacking a year or a category cannot be used for supervised training.
    model_df = df[['year', 'category']].dropna()
    features = model_df[['year']]
    target = model_df['category']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # --- Machine Learning Model ---
    # Random Forest Classifier (example). Seeded like the split above so the
    # reported accuracy is reproducible from run to run.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model on the held-out test set
    accuracy = model.score(X_test, y_test)
    print(f"Model Accuracy: {accuracy:.2f}")

    # --- Neural Network with TensorFlow ---
    # sparse_categorical_crossentropy expects integer class indices in
    # [0, n_classes), so the raw category values are mapped to codes first.
    # class_labels[i] is the original category for code i.
    y_train_codes, class_labels = pd.factorize(y_train)

    # Build a simple neural network: two hidden layers and one softmax output
    # unit per category seen in the training set.
    nn_model = Sequential([
        Dense(16, activation='relu', input_dim=X_train.shape[1]),
        Dense(8, activation='relu'),
        Dense(len(class_labels), activation='softmax'),
    ])

    nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # Feed a plain float array rather than a DataFrame.
    # NOTE(review): assumes 'year' is numeric - confirm against the processed data.
    nn_model.fit(
        X_train.to_numpy(dtype='float32'),
        y_train_codes,
        epochs=10,
        batch_size=32,
    )
else:
    print("Skipping model training: 'year' and/or 'category' column not found.")

# --- Advanced Visualizations ---
# Correlation heatmap.
# Correlations are only defined for numeric data; calling corr() on a frame
# that still contains text columns raises ValueError on pandas >= 2.0, so the
# frame is restricted to its numeric columns first.
numeric_df = df.select_dtypes(include='number')
if numeric_df.shape[1] > 0:
    plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()
else:
    # An empty correlation matrix cannot be drawn as a heatmap.
    print("Skipping correlation heatmap: no numeric columns found.")

# Line chart of recorded crimes per year (example for 'year' and 'category').
# Drawn only when both columns are present in the dataset.
if {'year', 'category'}.issubset(df.columns):
    # Number of non-null 'category' entries for each year.
    crime_trends = df.groupby('year')['category'].count()

    plt.figure(figsize=(12, 6))
    sns.lineplot(data=crime_trends, marker="o")
    plt.xlabel("Year")
    plt.ylabel("Number of Crimes")
    plt.title("Cybercrime Trends Over Time")
    plt.show()
