<a href="https://colab.research.google.com/github/harshanand63/CodeAlpha_Titanic_Classification/blob/main/Copy_of_Final_Major_Project_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from google.colab import files

# Download NLTK resources: the 'punkt' tokenizer package and the 'stopwords'
# corpus. The second call is the cell's final expression, so its return value
# is what the cell displays.
# NOTE(review): no nltk call other than these downloads is visible in the
# cells shown here -- confirm the resources are still needed.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Upload and Select a File

In [None]:
# Ask the user to upload files through the Colab dialog; the result is keyed
# by file name, and those names feed the dropdown below.
uploaded_files = files.upload()
file_names = [name for name in uploaded_files.keys()]

# Dropdown whose current value is read by later cells as the file to analyse.
file_selector = widgets.Dropdown(
    description='Select File:',
    options=file_names,
    disabled=False,
)
display(file_selector)

Saving BOOTLOG.TXT to BOOTLOG.TXT
Saving DETLOG.TXT to DETLOG.TXT
Saving extracted_data.txt to extracted_data.txt
Saving FRUNLOG.TXT to FRUNLOG.TXT
Saving installed_software.txt to installed_software.txt
Saving Logs data.txt to Logs data.txt
Saving NETLOG.TXT to NETLOG.TXT
Saving SETUPLOG.TXT to SETUPLOG.TXT
Saving startup_programs.txt to startup_programs.txt
Saving suspicious_files.txt to suspicious_files.txt
Saving temp_debug_files.txt to temp_debug_files.txt
Saving text data.txt to text data.txt


Dropdown(description='Select File:', options=('BOOTLOG.TXT', 'DETLOG.TXT', 'extracted_data.txt', 'FRUNLOG.TXT'…

Parse the Uploaded File

In [None]:
def parse_output_txt(file_path):
    """Parse a whitespace-delimited text log into a two-column DataFrame.

    Every non-blank line is split on whitespace. The first token becomes
    ``type``; the remaining tokens, re-joined with single spaces, become
    ``entry`` (an empty string when the line holds a single token).

    Args:
        file_path: Path of the text file to read.

    Returns:
        pandas.DataFrame with columns ``type`` and ``entry``, one row per
        non-blank line. Both columns exist even when the file contains no
        usable lines, so callers can index them unconditionally.
    """
    records = []
    # Decode explicitly and leniently: a single undecodable byte in a log file
    # should not abort the whole parse (it is replaced with U+FFFD instead).
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            tokens = line.split()
            if not tokens:  # blank / whitespace-only line
                continue
            records.append({'type': tokens[0], 'entry': ' '.join(tokens[1:])})
    # Passing the column names keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=['type', 'entry'])


Generate Synthetic Data Based on File Type

In [None]:
def generate_synthetic_data(file_label, n_normal=1000, n_anomalous=50, random_state=None):
    """Build a labelled synthetic data set themed on the given file name.

    A base artefact name is looked up from ``file_label`` (falling back to
    ``"default_file.txt"``). ``n_normal`` rows named ``normal_<base>`` with
    label ``1`` are generated first, followed by ``n_anomalous`` rows named
    ``anomalous_<base>`` with label ``-1``. Each row gets a random ``id`` in
    [1000, 10000) and a random ``level`` in [1, 5).

    Args:
        file_label: File name used to pick the base artefact name.
        n_normal: Number of normal rows to generate (default 1000).
        n_anomalous: Number of anomalous rows to generate (default 50).
        random_state: Optional integer seed. When given, the random ``id`` /
            ``level`` values are reproducible; when ``None`` the global
            ``np.random`` state is used, as before.

    Returns:
        pandas.DataFrame with columns ``id``, ``level``, ``type``, ``name``,
        ``label`` and ``processed_name`` (the lower-cased part of ``name``
        before the first ``'.'``).
    """
    file_specific_entries = {
        "BOOTLOG.TXT": "boot_device.sys",
        "DETLOG.TXT": "detection_event.log",
        "extracted_data.txt": "extracted_record.dat",
        "FRUNLOG.TXT": "first_run.log",
        "installed_software.txt": "software_list.inf",
        "Logs data.txt": "log_entry.dat",
        "NETLOG.TXT": "network_event.log",
        "SETUPLOG.TXT": "setup_process.log",
        "startup_programs.txt": "startup_task.exe",
        "suspicious_files.txt": "malware_detected.tmp",
        "temp_debug_files.txt": "debug_log.tmp",
        "text data.txt": "text_log.txt"
    }

    base_name = file_specific_entries.get(file_label, "default_file.txt")

    # Both objects expose the same randint(low, high) API; using the module
    # itself when unseeded keeps the original dependence on global state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    synthetic_data = []
    labels = []
    # Normal rows first, then anomalous ones -- same ordering as before.
    for prefix, label, count in (('normal', 1, n_normal), ('anomalous', -1, n_anomalous)):
        for _ in range(count):
            synthetic_data.append({
                'id': rng.randint(1000, 10000),
                'level': rng.randint(1, 5),
                'type': 'file',
                'name': f'{prefix}_{base_name}',
            })
            labels.append(label)

    # Explicit columns keep the schema intact even if both counts are zero.
    df = pd.DataFrame(synthetic_data, columns=['id', 'level', 'type', 'name'])
    df['label'] = labels
    df['processed_name'] = df['name'].apply(lambda x: x.split('.')[0].lower())
    return df

Process the Selected File for Anomaly Detection

In [None]:
def process_selected_file(file_name):
    """Run TF-IDF + IsolationForest anomaly scoring on one uploaded file.

    The file is parsed with ``parse_output_txt``; a synthetic companion data
    set is produced by ``generate_synthetic_data``. One TF-IDF vocabulary is
    fitted on the real and synthetic text together. The IsolationForest is
    fitted on the real rows only and then also applied to the synthetic rows.

    Args:
        file_name: Path/name of the uploaded file to analyse.

    Returns:
        ``(df, synthetic_df)``. Both frames gain the columns ``impact_score``
        (largest TF-IDF weight in the row), ``behavior_deviation_score``
        (IsolationForest ``decision_function`` value) and ``anomaly_label``
        (IsolationForest prediction). ``df`` additionally gains ``label``
        (constant 1) and ``processed_entry``.

    Raises:
        ValueError: If the file contains no parsable entries.
    """
    df = parse_output_txt(file_name)
    if df.empty:
        # Fail early with a clear message instead of an opaque KeyError or
        # estimator error further down.
        raise ValueError(f"No parsable entries found in {file_name!r}; nothing to analyse.")

    df['label'] = 1  # Assume normal, anomaly detection will update it
    # Keep only the text after the last ':' in each entry, normalised.
    df['processed_entry'] = df['entry'].apply(lambda x: x.split(':')[-1].lower().strip())

    synthetic_df = generate_synthetic_data(file_name)
    combined_entries = pd.concat([df['processed_entry'], synthetic_df['processed_name']])

    # Shared vocabulary so real and synthetic rows live in the same feature space.
    vectorizer = TfidfVectorizer()
    X_tfidf_combined = vectorizer.fit_transform(combined_entries)
    n_real = len(df)

    # Densify each half exactly once and reuse it for fitting and scoring.
    dense_real = X_tfidf_combined[:n_real].toarray()
    dense_synthetic = X_tfidf_combined[n_real:].toarray()

    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    y_pred_real = iso_forest.fit_predict(dense_real)
    y_pred_synthetic = iso_forest.predict(dense_synthetic)

    df['impact_score'] = dense_real.max(axis=1)
    df['behavior_deviation_score'] = iso_forest.decision_function(dense_real)
    df['anomaly_label'] = y_pred_real

    synthetic_df['impact_score'] = dense_synthetic.max(axis=1)
    synthetic_df['behavior_deviation_score'] = iso_forest.decision_function(dense_synthetic)
    synthetic_df['anomaly_label'] = y_pred_synthetic

    return df, synthetic_df

# Analyse whichever upload is currently chosen in the dropdown widget.
(df, synthetic_df) = process_selected_file(file_name=file_selector.value)

Display Synthetic Data

In [None]:
# Plain-text preview: heading line followed by the first ten synthetic rows.
print("Synthetic Data:", synthetic_df.head(10).to_string(), sep="\n")

Synthetic Data:
     id  level  type                  name  label    processed_name  impact_score  behavior_deviation_score  anomaly_label
0  5205      3  file  normal_log_entry.dat      1  normal_log_entry           1.0                  0.030774              1
1  5007      4  file  normal_log_entry.dat      1  normal_log_entry           1.0                  0.030774              1
2  6097      2  file  normal_log_entry.dat      1  normal_log_entry           1.0                  0.030774              1
3  9786      4  file  normal_log_entry.dat      1  normal_log_entry           1.0                  0.030774              1
4  4580      2  file  normal_log_entry.dat      1  normal_log_entry           1.0                  0.030774              1
5  4589      1  file  normal_log_entry.dat      1  normal_log_entry           1.0                  0.030774              1
6  5513      3  file  normal_log_entry.dat      1  normal_log_entry           1.0                  0.030774              1


Evaluate Model Accuracy

In [None]:
# Evaluate the model
# NOTE: df['label'] is a constant placeholder (every row is set to 1), not
# ground truth. The "accuracy" below is therefore just the share of rows the
# IsolationForest did NOT flag as -1; it is not a measure of correctness.
accuracy_real = accuracy_score(df['label'], df['anomaly_label'])
# Pin the label order so the matrix is always 2x2 with rows/columns [-1, 1],
# even when no row happens to be flagged.
cm_real = confusion_matrix(df['label'], df['anomaly_label'], labels=[-1, 1])
print(f"Real Data Accuracy for {file_selector.value}: {accuracy_real * 100:.2f}%")
print("Real Data Confusion Matrix:", cm_real)

Real Data Accuracy for Logs data.txt: 94.74%
Real Data Confusion Matrix: [[  0   0]
 [  9 162]]


Display Anomaly Detection Results

In [None]:
# Pass labels explicitly so each name matches its class: 1 -> 'Normal',
# -1 -> 'Anomaly'. Without `labels`, scikit-learn sorts the classes as
# [-1, 1] and the two names end up swapped; it also raises when only one
# class is present. zero_division=0 silences the undefined-metric warnings
# for a class that has no true samples.
print(classification_report(
    df['label'],
    df['anomaly_label'],
    labels=[1, -1],
    target_names=['Normal', 'Anomaly'],
    zero_division=0,
))

# Display real data anomaly detection results
print("Anomaly Detection Results:")
print(df[['type', 'entry', 'impact_score', 'behavior_deviation_score', 'anomaly_label']].head(10).to_string())

              precision    recall  f1-score   support

      Normal       0.00      0.00      0.00         0
     Anomaly       1.00      0.95      0.97       171

    accuracy                           0.95       171
   macro avg       0.50      0.47      0.49       171
weighted avg       1.00      0.95      0.97       171

Anomaly Detection Results:
  type                         entry  impact_score  behavior_deviation_score  anomaly_label
0  Log                  files found:      0.000000                  0.030774              1
1    -         345-128-4: NTUSER.DAT      0.854940                  0.012730              1
2    -         9787-128-4: index.dat      0.821334                  0.013684              1
3    -          454-128-3: index.dat      0.821334                  0.013684              1
4    -         9789-128-4: index.dat      0.821334                  0.013684              1
5    -        11567-128-3: index.dat      0.821334                  0.013684              1
6 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Visualize Anomaly Detection

For Synthetic Data

In [None]:
# Scatter of the deviation score per synthetic row, coloured by the
# predicted anomaly label.
fig_synthetic = px.scatter(
    data_frame=synthetic_df,
    x=synthetic_df.index,
    y='behavior_deviation_score',
    color='anomaly_label',
    labels={'behavior_deviation_score': 'Deviation Score'},
    title=f'Anomaly Detection for Synthetic Data ({file_selector.value})',
)
fig_synthetic.show()

For Real Data

In [None]:
# Visualization: deviation score per parsed entry of the selected file,
# coloured by the predicted anomaly label.
fig = px.scatter(
    data_frame=df,
    x=df.index,
    y='behavior_deviation_score',
    color='anomaly_label',
    labels={'behavior_deviation_score': 'Deviation Score', 'index': 'Data Entry'},
    title=f'Anomaly Detection for {file_selector.value}',
)
fig.show()