# WHAT NOT TO DO

In [53]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Defining Autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size, input_size),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


# Preprocessing
def preprocess_data(df):
    selected_features = ['user.id', 'client.ip', 'o365.audit.ApplicationDisplayName', 'o365.audit.AuthenticationType']
    X = df[selected_features].copy()

    label_encoder_user_id = LabelEncoder()
    X['user.id'] = label_encoder_user_id.fit_transform(X['user.id'])

    X['client.ip'] = X['client.ip'].apply(lambda x: int(''.join(x.split('.'))))

    categorical_features = ['o365.audit.ApplicationDisplayName', 'o365.audit.AuthenticationType']
    X = pd.get_dummies(X, columns=categorical_features)

    scaler = StandardScaler()
    numerical_features = ['client.ip']
    X[numerical_features] = scaler.fit_transform(X[numerical_features])

    return X

df = pd.read_csv('FileAccessX.csv')
df_preprocessed = preprocess_data(df)

# Convert dataframe to PyTorch tensors
data = torch.tensor(df_preprocessed.values, dtype=torch.float64)

# Set hyperparameters
input_size = len(df_preprocessed.columns)
hidden_size = 8
learning_rate = 0.001
num_epochs = 20
batch_size = 32

model = Autoencoder(input_size, hidden_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

data_loader = DataLoader(data, batch_size=batch_size, shuffle=True)

# Training
for epoch in range(num_epochs):
    for batch in data_loader:
        # Forward pass
        output = model(batch)
        loss = criterion(output, batch)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate
with torch.no_grad():
    reconstructions = model(data)
    mse_loss = nn.MSELoss(reduction='none')(reconstructions, data)
    anomalies = torch.mean(mse_loss, dim=1).numpy()

threshold = anomalies.mean() + 3 * anomalies.std()

anomaly_indices = anomalies > threshold
anomaly_samples = df_preprocessed[anomaly_indices]
anomaly_samples_original = df.iloc[anomaly_indices]

anomaly_samples_original

Epoch [1/20], Loss: 17213.8125
Epoch [2/20], Loss: 14354.4951
Epoch [3/20], Loss: 9977.1201
Epoch [4/20], Loss: 15992.7539
Epoch [5/20], Loss: 19430.0781
Epoch [6/20], Loss: 15079.1543
Epoch [7/20], Loss: 13777.4141
Epoch [8/20], Loss: 19416.2012
Epoch [9/20], Loss: 10373.2568
Epoch [10/20], Loss: 15364.5850
Epoch [11/20], Loss: 16404.5996
Epoch [12/20], Loss: 15176.6631
Epoch [13/20], Loss: 11686.9072
Epoch [14/20], Loss: 16997.4121
Epoch [15/20], Loss: 17770.0605
Epoch [16/20], Loss: 14433.0322
Epoch [17/20], Loss: 20658.5430
Epoch [18/20], Loss: 11955.1025
Epoch [19/20], Loss: 12001.1719
Epoch [20/20], Loss: 14619.4453


Unnamed: 0,timestamp,client.ip,file.directory,file.extension,file.name,network.type,o365.audit.ApplicationDisplayName,o365.audit.AuthenticationType,o365.audit.IsManagedDevice,o365.audit.Platform,url.original,user.id,user_agent.original,count


In [58]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Define the autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size, input_size),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

def preprocess_data(df):
    selected_features = ['client.ip', 'o365.audit.ApplicationDisplayName', 'o365.audit.AuthenticationType']
    df = df[selected_features]

    # One-hot encoding for everything
    df = pd.get_dummies(df, columns=['client.ip', 'o365.audit.ApplicationDisplayName', 'o365.audit.AuthenticationType'])
    return df


df = pd.read_csv('FileAccessX.csv')
df_preprocessed = preprocess_data(df)

data = torch.tensor(df_preprocessed.values, dtype=torch.float)

# Set hyperparameters
input_size = len(df_preprocessed.columns)
hidden_size = 8
learning_rate = 0.001
num_epochs = 20
batch_size = 32

model = Autoencoder(input_size, hidden_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

data_loader = DataLoader(data, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    for batch in data_loader:
        # Forward pass
        output = model(batch)
        loss = criterion(output, batch)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

with torch.no_grad():
    reconstructions = model(data)
    mse_loss = nn.MSELoss(reduction='none')(reconstructions, data)
    anomalies = torch.mean(mse_loss, dim=1).numpy()

threshold = anomalies.mean() + 3 * anomalies.std()

anomaly_indices = anomalies > threshold
anomaly_samples = df_preprocessed[anomaly_indices]

anomaly_samples_original = df.iloc[anomaly_indices]

anomaly_samples_original[['client.ip', 'o365.audit.ApplicationDisplayName', 'o365.audit.AuthenticationType', 'count']]

Epoch [1/20], Loss: 0.0035
Epoch [2/20], Loss: 0.0022
Epoch [3/20], Loss: 0.0021
Epoch [4/20], Loss: 0.0017
Epoch [5/20], Loss: 0.0011
Epoch [6/20], Loss: 0.0011
Epoch [7/20], Loss: 0.0007
Epoch [8/20], Loss: 0.0005
Epoch [9/20], Loss: 0.0005
Epoch [10/20], Loss: 0.0008
Epoch [11/20], Loss: 0.0007
Epoch [12/20], Loss: 0.0006
Epoch [13/20], Loss: 0.0004
Epoch [14/20], Loss: 0.0005
Epoch [15/20], Loss: 0.0005
Epoch [16/20], Loss: 0.0005
Epoch [17/20], Loss: 0.0002
Epoch [18/20], Loss: 0.0003
Epoch [19/20], Loss: 0.0004
Epoch [20/20], Loss: 0.0001


Unnamed: 0,client.ip,o365.audit.ApplicationDisplayName,o365.audit.AuthenticationType,count
35,4.229.225.166,MSOCSWord,OAuth,4789
54,173.178.11.167,SharePoint Online Client Extensibility,OAuth,705
55,173.178.11.167,SharePoint Online Client Extensibility,OAuth,1410
368,4.229.225.166,MSOCSWord,OAuth,3005
454,142.126.11.168,OneNoteLegacyClient,FormsCookieAuth,645
...,...,...,...,...
29695,216.144.124.109,sharepoint,,35
29708,216.144.124.109,sharepoint,FormsCookieAuth,35
29717,216.144.124.109,sharepoint for ios,FormsCookieAuth,69
29718,216.144.124.109,SharePoint,OAuth,34


In [45]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

ip_address = '10.10.20.31'

# Split IP address into octets
octets = ip_address.split('.')

# Encode octets using one-hot encoding
onehot_encoder = OneHotEncoder(sparse_output=False, categories='auto')
encoded_octets = onehot_encoder.fit_transform([[octet] for octet in octets])

# Flatten
encoded_ip = ''.join([str(int(bit)) for octet in encoded_octets for bit in octet])
encoded_ip

'100100010001'