In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import pickle

In [None]:
# Load the raw application log from the working directory. Later cells split this
# frame per application using its "application_name" column.
dataset=pd.read_csv("application_logs.csv")

In [None]:
# Cardinality overview: for each column print a header line, the number of
# distinct values it holds, and a blank separator line.
for column in dataset.columns:
    distinct_count = dataset[column].nunique()
    print(f"Unique values in column '{column}':", distinct_count, "", sep="\n")

In [None]:
def name_clean(app):
    """Return `app` rewritten so it can be used as a file or directory name.

    Spaces are turned into underscores; parentheses, dots, forward slashes and
    backslashes are removed. Every other character is left as it is.
    """
    # Single-pass character table: a string value means "replace with",
    # None means "delete".
    char_table = str.maketrans({
        " ": "_",
        "(": None,
        ")": None,
        ".": None,
        "/": None,
        "\\": None,
    })
    return app.translate(char_table)

In [None]:
def data_clean(data):
    """Turn raw log rows into one row per connected port.

    The "timestamp" and "application_name" columns are dropped when present.
    The "domains_accessed" column (a comma-separated list of ``host:port``
    entries) is replaced by an integer "connected_ports" column, and the row is
    repeated once per entry.

    Rules:
      * A row containing a missing value in *any* column is emitted exactly
        once with ``connected_ports = 0``.
      * An entry with no port, or with a non-numeric port, yields port ``0``
        (the same "no port" sentinel), so the column stays purely numeric.

    Parameters
    ----------
    data : pandas.DataFrame
        Log rows; must contain a "domains_accessed" column.

    Returns
    -------
    pandas.DataFrame
        The expanded rows. Each output row keeps the index label of the input
        row it came from, so labels repeat when a row lists several entries.
        An empty input gives an empty DataFrame.

    Raises
    ------
    KeyError
        If a row has no "domains_accessed" entry to remove.
    """
    columns_to_drop = ["timestamp", "application_name"]
    df = data.drop(columns=columns_to_drop, errors="ignore")

    def parse_port(entry):
        """Return the integer port of a 'host:port' entry, or 0 if it has none."""
        # rpartition splits on the LAST ':' so hosts that themselves contain
        # colons (e.g. bracketed IPv6 literals) still yield the trailing port.
        _, separator, port_text = entry.rpartition(":")
        port_text = port_text.strip()
        if separator and port_text.isdecimal():
            return int(port_text)
        return 0

    def extract_ports(domains_accessed):
        """Return one integer port per comma-separated entry."""
        return [parse_port(entry) for entry in domains_accessed.split(",")]

    new_rows = []
    for _, row in df.iterrows():
        # Everything except the raw domain list is carried over unchanged.
        base_row = row.drop("domains_accessed")

        if row.isna().any():
            # Incomplete record: keep it once, flagged with the sentinel port.
            ports = [0]
        else:
            ports = extract_ports(row["domains_accessed"])

        for port in ports:
            new_row = base_row.copy()
            new_row["connected_ports"] = port
            new_rows.append(new_row)

    return pd.DataFrame(new_rows)

In [None]:
# Split the raw log into one cleaned CSV per application:
#   <cwd>\dataset\<app>\<app>.csv
CURRENT_PATH = os.getcwd()
# NOTE: the path is assembled with a literal "\\" on purpose. The training cell
# rebuilds exactly this string to read the files back, so the two must stay in
# sync; switch both to os.path.join together if portability is needed.
DATASET_PATH = CURRENT_PATH + "\\dataset"
os.makedirs(DATASET_PATH, exist_ok=True)

# List of every distinct application name found in the log (may include NaN).
app_list = dataset["application_name"].unique().tolist()

for app in app_list:

    if pd.isna(app):  # Skip rows that have no application name
        continue

    app_data = dataset[dataset["application_name"] == app]

    # From here on `app` is the sanitized name used for folders and files.
    app = name_clean(app)
    app_data = data_clean(app_data)

    app_dir = DATASET_PATH + f"\\{app}"
    os.makedirs(app_dir, exist_ok=True)
    # index=False: the cleaned frame's index is just the (repeated) original
    # row labels. Writing it would add an unnamed first column that read_csv
    # loads back as "Unnamed: 0", which would then be used as a model feature.
    app_data.to_csv(app_dir + f"\\{app}.csv", index=False)
    

In [None]:

# Output locations for the training stage, rooted at the working directory.
CURRENT_PATH = os.getcwd()
# Built with os.path.join (not a literal "\\") so the folder is created inside
# the working directory on every operating system.
MODEL_PATH = os.path.join(CURRENT_PATH, "model")
DETECTED_OUTPUT_PATH = os.path.join(CURRENT_PATH, "detected_output")
# Only the model folder is created here.
os.makedirs(MODEL_PATH, exist_ok=True)

# Every distinct application name found in the log (may include NaN).
app_list = dataset["application_name"].unique().tolist()

# Plot per-sample anomaly scores, with anomalies drawn separately from normal points
def plot_anomalies(data_scaled, anomaly_scores, app_name):
    """Scatter-plot every sample's anomaly score, split into normal and anomalous.

    Samples whose score is below 0 are drawn in red as "Anomaly"; all others
    are drawn in blue as "Normal". A dashed line marks the 0 threshold. The
    y-axis already carries the score, so no colorbar is drawn.

    Parameters
    ----------
    data_scaled : sized
        The scored samples; only ``len(data_scaled)`` is used.
    anomaly_scores : iterable of float
        One score per sample, in sample order. The 0 threshold matches the
        sign convention of ``IsolationForest.decision_function`` (negative
        means outlier), which is what the caller in this notebook passes.
    app_name : str
        Application name shown in the title.

    Raises
    ------
    ValueError
        If the number of scores differs from the number of samples.
    """
    scores = list(anomaly_scores)
    if len(scores) != len(data_scaled):
        raise ValueError("anomaly_scores must contain one score per row of data_scaled")

    # `not s < 0` (rather than `s >= 0`) keeps NaN scores visible as "Normal"
    # instead of silently dropping them from the plot.
    anomaly_idx = [i for i, s in enumerate(scores) if s < 0]
    normal_idx = [i for i, s in enumerate(scores) if not s < 0]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(normal_idx, [scores[i] for i in normal_idx],
               c='blue', label='Normal', alpha=0.7)
    ax.scatter(anomaly_idx, [scores[i] for i in anomaly_idx],
               c='red', label='Anomaly', alpha=0.7)
    ax.axhline(0, color='gray', linestyle='--', linewidth=1, label='Threshold (score = 0)')
    ax.set_title(f"Anomalies detected for {app_name}")
    ax.set_xlabel("Index")
    ax.set_ylabel("Anomaly Score")
    ax.legend()
    plt.show()
    # Release the figure; this function is called once per application in a loop.
    plt.close(fig)

# Train, evaluate and persist one IsolationForest per application.
for app in app_list:
    if pd.isna(app):  # no application name -> no per-app dataset exists
        continue
    app = name_clean(app)
    app_data = pd.read_csv(DATASET_PATH + f"\\{app}" + f"\\{app}.csv")
    # If the per-app CSV was written together with its index, read_csv exposes
    # that index as an "Unnamed: 0" column. Row labels are not a feature, so
    # drop the column when it is there (no-op otherwise).
    app_data = app_data.drop(columns=["Unnamed: 0"], errors="ignore")

    # Standardize the features; the fitted scaler is saved below because the
    # model only understands data in this scaled space.
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(app_data)

    # Train an anomaly detection model.
    # contamination=0.01: the expected share of outliers is set to 1 %.
    model = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
    model.fit(data_scaled)

    # Predict anomalies: predict() returns 1 / -1, mapped to readable labels.
    app_data["Anomaly"] = model.predict(data_scaled)
    app_data["Anomaly"] = app_data["Anomaly"].map({1: "Normal", -1: "Anomaly"})

    # Evaluate the model: get the per-sample anomaly scores
    anomaly_scores = model.decision_function(data_scaled)

    # Plot anomalies
    plot_anomalies(data_scaled, anomaly_scores, app)

    # Save the cleaned data with predictions
    output = os.path.join(DETECTED_OUTPUT_PATH, f"{app}")
    os.makedirs(output, exist_ok=True)
    app_data.to_csv(os.path.join(output, f"cleaned_data_with_predictions_{app}.csv"), index=False)

    # Save the trained model together with the scaler it was fitted with.
    # Without the scaler, new data cannot be brought into the feature space
    # the model was trained on.
    MODEL_APP_PATH = os.path.join(MODEL_PATH, f"{app}")
    os.makedirs(MODEL_APP_PATH, exist_ok=True)
    model_filename = os.path.join(MODEL_APP_PATH, f"{app}_model.pkl")
    scaler_filename = os.path.join(MODEL_APP_PATH, f"{app}_scaler.pkl")

    with open(model_filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    with open(scaler_filename, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

    # Display a summary for the current app
    print(f"Summary of anomalies detected for {app}:")
    print(app_data["Anomaly"].value_counts())

    # Display the anomaly score statistics (mean and standard deviation)
    print(f"Anomaly scores for {app}:")
    print(f"Mean anomaly score: {anomaly_scores.mean()}")
    print(f"Std dev of anomaly score: {anomaly_scores.std()}")