In [43]:
# Generowanie danych

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

num_trams = 200  # Number of trams to simulate
frequency = '1h'  # One reading per hour
start_date = '2024-01-01'  # Timestamp of the first reading

def generate_tram_data(days, n_trams=None, start=None, freq=None):
    """Generate synthetic sensor readings for a fleet of trams.

    Every tram gets its own random baseline temperature, vibration and speed,
    modulated by a daily and a weekly sine wave plus Gaussian noise. The first
    5% of trams (by id) additionally suffer a progressive failure that starts
    at one random time step per tram and never recovers.

    Parameters
    ----------
    days : int
        Length of the series in days; ``days * 24`` readings are produced
        per tram.
    n_trams : int, optional
        Number of trams. Defaults to the notebook-level ``num_trams``.
    start : str or datetime-like, optional
        First timestamp. Defaults to the notebook-level ``start_date``.
    freq : str, optional
        Pandas frequency string for the reading interval. Defaults to the
        notebook-level ``frequency``.

    Returns
    -------
    pandas.DataFrame
        One row per (tram, timestamp) with columns ``timestamp``, ``tram_id``,
        ``temperature``, ``vibration``, ``speed`` and ``anomaly``
        (0 = normal, 1 = anomaly).

    Notes
    -----
    Draws from the global NumPy random state; call ``np.random.seed`` first
    for reproducible output.
    """
    # Fall back to the notebook-level configuration when no override is given.
    n_trams = num_trams if n_trams is None else n_trams
    start = start_date if start is None else start
    freq = frequency if freq is None else freq

    dates = pd.date_range(start=start, periods=days*24, freq=freq)
    n_steps = len(dates)
    data = []

    # Seasonal patterns are identical for every tram, so compute them once.
    steps = np.arange(n_steps)
    time_of_day = np.sin(2 * np.pi * steps / 24)
    day_of_week = np.sin(2 * np.pi * steps / (24*7))

    for tram_id in range(n_trams):
        base_temp = np.random.normal(70, 5)
        base_vibration = np.random.normal(4.5, 1)
        base_speed = np.random.normal(35, 10)

        # Failure simulation: 5% of trams experience a failure. The start
        # point is drawn ONCE per tram so the failure actually progresses over
        # consecutive readings instead of restarting at a new random point on
        # every time step.
        will_fail = tram_id < n_trams * 0.05
        failure_start = None
        if will_fail:
            # max(1, ...) keeps randint valid for series of 10 steps or fewer.
            failure_start = np.random.randint(0, max(1, n_steps - 10))

        for i, date in enumerate(dates):
            # Normal fluctuations around the tram's baseline.
            temp = base_temp + \
                   time_of_day[i] * 8 + \
                   day_of_week[i] * 3 + \
                   np.random.normal(0, 2)

            vibration = base_vibration + \
                       time_of_day[i] * 1.2 + \
                       np.random.normal(0, 0.3)

            speed = np.clip(base_speed + \
                           time_of_day[i] * 15 + \
                           np.random.normal(0, 5), 0, 80)

            # Anomaly label (0 = normal, 1 = anomaly).
            anomaly = 0

            if will_fail and i >= failure_start:
                # Progress grows by 0.1 per reading after the failure starts.
                # It is not capped, so the "full failure" offsets keep growing
                # until the end of the series.
                progress = (i - failure_start) / 10

                if progress < 0.3:  # Early phase: mild drift, not yet labelled
                    temp += progress * 20
                    vibration += progress * 3
                elif progress < 0.7:  # Critical phase
                    temp += 15 + progress * 30
                    vibration += 2 + progress * 8
                    speed *= (1 - progress*0.5)
                    anomaly = 1
                else:  # Full failure: the tram stops
                    temp += 40 + progress * 50
                    vibration += 10 + progress * 15
                    speed = 0
                    anomaly = 1

            data.append({
                'timestamp': date,
                'tram_id': f'tram_{tram_id:03d}',
                'temperature': max(temp, 20),
                'vibration': max(vibration, 0),
                'speed': max(speed, 0),
                'anomaly': anomaly
            })

    return pd.DataFrame(data)




In [44]:
def generate_sensor_data(n_normal=180, n_anomaly=20, random_state=42):
    """Generate 3-feature sensor data containing labelled anomalies.

    Normal points are drawn from three Gaussian clusters (std 0.5) and
    anomalies from two tighter clusters (std 0.3) far away from them.
    ``n_normal`` is split as evenly as possible across the three normal
    clusters and ``n_anomaly`` across the two anomaly clusters, so ``X`` and
    ``y_true`` always have matching lengths.

    Parameters
    ----------
    n_normal : int
        Total number of normal samples.
    n_anomaly : int
        Total number of anomalous samples.
    random_state : int
        Seed passed to ``np.random.seed``; note this reseeds NumPy's global
        random state.

    Returns
    -------
    X : numpy.ndarray of shape (n_normal + n_anomaly, 3)
        Normal samples first, then anomalies.
    y_true : numpy.ndarray of shape (n_normal + n_anomaly,)
        0 for normal samples, 1 for anomalies.
    """
    np.random.seed(random_state)

    def _split(total, parts):
        # Distribute `total` over `parts` groups; earlier groups absorb the remainder.
        base, extra = divmod(total, parts)
        return [base + (1 if k < extra else 0) for k in range(parts)]

    # Normal trams - 3 operating clusters
    normal_centers = [[50, 10, 30], [55, 12, 35], [48, 8, 28]]
    # Anomalies - atypical behaviour: overheating, then engine failure
    anomaly_centers = [[80, 25, 15], [30, 30, 5]]

    normal = np.vstack([
        np.random.randn(size, 3) * 0.5 + center
        for size, center in zip(_split(n_normal, len(normal_centers)), normal_centers)
    ])
    anomalies = np.vstack([
        np.random.randn(size, 3) * 0.3 + center
        for size, center in zip(_split(n_anomaly, len(anomaly_centers)), anomaly_centers)
    ])

    X = np.vstack([normal, anomalies])
    y_true = np.array([0] * n_normal + [1] * n_anomaly)
    return X, y_true

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Build the dataset: 5 days of readings for every simulated tram.
df = generate_tram_data(5)
y = df['anomaly']
X = df[['speed', 'vibration', 'temperature']]

# Alternative dataset (small, clustered):
# X, y = generate_sensor_data()

# Standardise the features so that each one has zero mean and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# train_test_split returns (X_train, X_test, y_train, y_test), in that order.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, shuffle=True, random_state=42)

def kmeans_anomaly_detection(X, y, percentile=95):
    """Flag samples far from their nearest K-Means centroid and print a report.

    For each candidate number of clusters, K-Means is fitted on ``X``, every
    sample's distance to its nearest centroid is computed, and samples whose
    distance exceeds the given percentile of those distances are marked as
    anomalies. The candidate with the best macro-F1 against ``y`` is reported.

    The number of clusters is selected by scoring the anomaly predictions
    themselves: ``KMeans.predict`` yields cluster indices, which cannot be
    scored meaningfully against 0/1 anomaly labels. Selection uses ``y`` on
    the same data it is evaluated on, so the printed scores are optimistic.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Ground-truth labels (0 = normal, 1 = anomaly).
    percentile : float, default 95
        Percentile of nearest-centroid distances used as the anomaly threshold.

    Raises
    ------
    ValueError
        If ``X`` has fewer samples than the smallest candidate cluster count.
    """
    # Local import: f1_score is not among the notebook's top-level imports.
    from sklearn.metrics import f1_score

    candidate_n_clusters = [1, 2, 5, 10, 20, 30, 50, 100]
    n_samples = len(X)

    best_score, n_clusters, y_anomalies = -1.0, None, None
    for k in candidate_n_clusters:
        if k > n_samples:
            # K-Means cannot form more clusters than there are samples.
            continue

        model = KMeans(n_clusters=k, n_init=10).fit(X)
        # transform() returns each sample's distance to every centroid; the
        # row minimum is the distance to the nearest one.
        min_distances = model.transform(X).min(axis=1)
        threshold = np.percentile(min_distances, percentile)
        predicted = (min_distances > threshold).astype(int)

        score = f1_score(y, predicted, average='macro')
        if score > best_score:
            best_score, n_clusters, y_anomalies = score, k, predicted

    if n_clusters is None:
        raise ValueError("X must contain at least one sample")

    report = classification_report(y, y_anomalies)
    print(f"kmeans anomaly detection scores for n_clusters={n_clusters} \n{report}")

# Run the K-Means detector on the standardised features and print its report.
kmeans_anomaly_detection(X_scaled, y)

kmeans anomaly detection scores for n_clusters=1 
               precision    recall  f1-score   support

           0       1.00      0.97      0.99     23371
           1       0.48      0.92      0.64       629

    accuracy                           0.97     24000
   macro avg       0.74      0.95      0.81     24000
weighted avg       0.98      0.97      0.98     24000



In [58]:
from sklearn.cluster import DBSCAN

def dbscan_anomaly_prediction(X, y, eps, min_samples):
    """Treat DBSCAN noise points as anomalies and print a classification report.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    y : array-like of shape (n_samples,)
        Ground-truth labels (0 = normal, 1 = anomaly).
    eps : float
        Passed to ``DBSCAN`` as ``eps``.
    min_samples : int
        Passed to ``DBSCAN`` as ``min_samples``.
    """
    db = DBSCAN(eps=eps, min_samples=min_samples)
    db.fit(X)

    # Samples labelled -1 belong to no cluster; count them as anomalies.
    y_anomalies = (db.labels_ == -1).astype(int)

    # Build the report separately so that no source indentation leaks into
    # the printed text and shifts the report's header row.
    report = classification_report(y, y_anomalies)
    print(f"dbscan anomaly detection scores for eps={eps}, minPts={min_samples} \n{report}")
    
# Run DBSCAN with eps=2.0 and min_samples=10.
# NOTE(review): this passes the raw feature frame X, not X_scaled, so eps is
# expressed in unscaled feature units - confirm this is intended.
dbscan_anomaly_prediction(X, y, 2.0, 10)



dbscan anomaly detection scores for eps=2.0, minPts=10 
                         precision    recall  f1-score   support

           0       1.00      0.99      0.99     23371
           1       0.66      0.97      0.79       629

    accuracy                           0.99     24000
   macro avg       0.83      0.98      0.89     24000
weighted avg       0.99      0.99      0.99     24000



In [47]:
from sklearn.neighbors import LocalOutlierFactor

def lof_anomalies_predict(X, y, n_neighbors):
    """Flag Local Outlier Factor outliers as anomalies and print a report.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Ground-truth labels (0 = normal, 1 = anomaly).
    n_neighbors : int
        Passed to ``LocalOutlierFactor`` as ``n_neighbors``.
    """
    lof = LocalOutlierFactor(n_neighbors=n_neighbors)

    # fit_predict marks outliers with -1; map those to the anomaly label 1.
    y_anomalies = (lof.fit_predict(X) == -1).astype(int)

    # Build the report separately so that no source indentation leaks into
    # the printed text and shifts the report's header row.
    report = classification_report(y, y_anomalies)
    print(f"lof anomaly detection scores for n_neighbors={n_neighbors} \n{report}")
    
# Run LOF with 1200 neighbours.
# NOTE(review): this passes the raw feature frame X, not X_scaled - confirm
# this is intended.
lof_anomalies_predict(X, y, 1200)

lof anomaly detection scores for n_neighbors=1200 
                         precision    recall  f1-score   support

           0       1.00      0.98      0.99     23371
           1       0.62      1.00      0.76       629

    accuracy                           0.98     24000
   macro avg       0.81      0.99      0.88     24000
weighted avg       0.99      0.98      0.99     24000

