# AIX System Metrics Analysis & Model Training

This notebook connects to the PostgreSQL database to retrieve system metrics data collected from AIX servers. We'll use this data to:
1. Explore and visualize the metrics,
2. Preprocess the data for machine learning,
3. Train anomaly detection models,
4. Evaluate and save the models for use in the Django application

# System Metrics Analysis : Anomaly Dtetection and Classification
# ================================================================

# Import and Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import OneClassSVM
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

#Set style for plots
plt.style.use('ggplot')
sns.set(font_scale=1.2)

# 2. Load and Prepare data from postgres
# Import necessary library for postgres

In [2]:
# import psycopg2
# from psycopg2 import sql
# from sqlalchemy import create_engine
# import json
# import datetime

# def load_system_metrics(db_config=None, use_sample_data=False, sample_size=10000, data_source="db"):
#     """
#     Load system metrics from PostgreSQL database in Docker or from a JSON file
    
#     Parameters:
#     -----------
#     db_config : dict
#         Dictionary with database connection parameters (host, port, dbname, user, password)
#         If None, default values will be used
#     use_sample_data : bool
#         If True, generates sample data instead of connecting to the database
#     sample_size : int
#         Number of rows to sample from the database (to avoid memory issues)
#     data_source : str
#         Source of data: "db" for database, "json" for JSON files
        
#     Returns:
#     --------
#     DataFrame : Pandas DataFrame with system metrics data
#     """
#     if use_sample_data:
#         return generate_sample_data()
    
#     if data_source == "json":
#         return load_from_json()
        
#     # Default database configuration
#     if db_config is None:
#         db_config = {
#             'host': 'postgres',  # Docker host - may need to be changed if Docker is on a different host
#             'port': '5432',       # Default PostgreSQL port - change if mapped differently
#             'dbname': 'aix_monitor',  # Your database name
#             'user': 'postgres',   # Your database username
#             'password': '@dmin**@@2025'  # Your database password
#         }
    
#     try:
#         # Create connection string
#         conn_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['dbname']}"
        
#         # Connect using SQLAlchemy (easier for pandas integration)
#         engine = create_engine(conn_string)
        
#         print("Connected to PostgreSQL. Loading data...")
        
#         # Strategy for handling large datasets:
#         # 1. Sample a subset of the data (based on sample_size parameter)
#         # 2. Use SQL to aggregate or filter data when possible
        
#         # Use more efficient SQL query with sampling to avoid memory issues
#         # This example assumes specific table structures - adjust based on your schema
        
#         # For vmstat metrics
#         vmstat_query = f"""
#         SELECT * FROM vmstat_metrics
#         TABLESAMPLE BERNOULLI ({sample_size * 100.0 / 80000})  -- Adjust percentage based on your total rows
#         ORDER BY RANDOM()
#         LIMIT {sample_size}
#         """
        
#         # For iostat metrics (assuming similar volume)
#         iostat_query = f"""
#         SELECT * FROM iostat_metrics
#         TABLESAMPLE BERNOULLI ({sample_size * 100.0 / 80000})
#         ORDER BY RANDOM()
#         LIMIT {sample_size}
#         """
        
#         # For netstat metrics
#         netstat_query = f"""
#         SELECT * FROM netstat_metrics
#         TABLESAMPLE BERNOULLI ({sample_size * 100.0 / 80000})
#         ORDER BY RANDOM()
#         LIMIT {sample_size}
#         """
        
#         # For process metrics
#         process_query = f"""
#         SELECT * FROM process_metrics
#         TABLESAMPLE BERNOULLI ({sample_size * 100.0 / 80000})
#         ORDER BY RANDOM()
#         LIMIT {sample_size}
#         """
        
#         # Load data from each table
#         try:
#             vmstat_df = pd.read_sql(vmstat_query, engine)
#             print(f"Loaded {len(vmstat_df)} vmstat records")
#         except Exception as e:
#             print(f"Error loading vmstat data: {e}")
#             vmstat_df = pd.DataFrame()
            
#         try:
#             iostat_df = pd.read_sql(iostat_query, engine)
#             print(f"Loaded {len(iostat_df)} iostat records")
#         except Exception as e:
#             print(f"Error loading iostat data: {e}")
#             iostat_df = pd.DataFrame()
            
#         try:
#             netstat_df = pd.read_sql(netstat_query, engine)
#             print(f"Loaded {len(netstat_df)} netstat records")
#         except Exception as e:
#             print(f"Error loading netstat data: {e}")
#             netstat_df = pd.DataFrame()
            
#         try:
#             process_df = pd.read_sql(process_query, engine)
#             print(f"Loaded {len(process_df)} process records")
#         except Exception as e:
#             print(f"Error loading process data: {e}")
#             process_df = pd.DataFrame()
        
#         # Now we need to merge these datasets on timestamp or other appropriate keys
#         # This is a simplified example - adjust the merge logic based on your data structure
#         if not vmstat_df.empty:
#             # Start with vmstat as the base
#             merged_df = vmstat_df
            
#             # Prepare to add additional features from other tables
#             # This assumes a timestamp field exists in all tables
#             # You may need to adjust the merge logic based on your schema
            
#             # Function to merge data frames approximately on timestamp
#             def merge_on_approximate_timestamp(base_df, add_df, suffix):
#                 if add_df.empty:
#                     return base_df
                
#                 # Make sure timestamp columns are datetime type
#                 for df in [base_df, add_df]:
#                     if 'timestamp' in df.columns and not pd.api.types.is_datetime64_dtype(df['timestamp']):
#                         df['timestamp'] = pd.to_datetime(df['timestamp'])
                
#                 # Find closest timestamp pairs
#                 # This is a simple approach - for production use a more efficient method
#                 merged = pd.merge_asof(
#                     base_df.sort_values('timestamp'),
#                     add_df.sort_values('timestamp'),
#                     on='timestamp',
#                     direction='nearest',
#                     tolerance=pd.Timedelta('5m'),  # Allow 5 minute tolerance
#                     suffixes=('', suffix)
#                 )
#                 return merged
            
#             # Merge with iostat data
#             if not iostat_df.empty:
#                 try:
#                     merged_df = merge_on_approximate_timestamp(merged_df, iostat_df, '_iostat')
#                     print("Merged iostat data")
#                 except Exception as e:
#                     print(f"Error merging iostat data: {e}")
            
#             # Merge with netstat data
#             if not netstat_df.empty:
#                 try:
#                     merged_df = merge_on_approximate_timestamp(merged_df, netstat_df, '_netstat')
#                     print("Merged netstat data")
#                 except Exception as e:
#                     print(f"Error merging netstat data: {e}")
            
#             # For process data, we might want to aggregate by timestamp first
#             if not process_df.empty:
#                 try:
#                     # Group process data by timestamp and compute averages
#                     process_agg = process_df.groupby('timestamp').agg({
#                         'cpu': 'mean',
#                         'mem': 'mean',
#                         'pid': 'count'
#                     }).rename(columns={'pid': 'process_count'}).reset_index()
                    
#                     merged_df = merge_on_approximate_timestamp(merged_df, process_agg, '_process')
#                     print("Merged process data (aggregated)")
#                 except Exception as e:
#                     print(f"Error merging process data: {e}")
            
#             print(f"Final merged dataset shape: {merged_df.shape}")
            
#             # Add classification labels (state) based on system metrics
#             # These thresholds need to be adjusted based on your specific environment
#             merged_df['state'] = 0  # normal by default
            
#             # High load when CPU usage > 60% or runnable processes > 5
#             if 'idle' in merged_df.columns:
#                 high_load = ((1 - merged_df['idle']) > 0.6) | (merged_df['r'] > 5)
#                 merged_df.loc[high_load, 'state'] = 1
            
#                 # Critical when CPU usage > 85% or free memory < 2000
#                 critical = ((1 - merged_df['idle']) > 0.85) | (merged_df['fre'] < 2000)
#                 merged_df.loc[critical, 'state'] = 2
            
#             return merged_df
#         else:
#             print("No vmstat data available. Cannot create merged dataset.")
#             return generate_sample_data()
            
#     except Exception as e:
#         print(f"Error connecting to database: {e}")
#         print("Falling back to sample data...")
#         return generate_sample_data()

In [3]:
# def load_from_json(vmstat_path="D:\\projet\\migration_data\\vmstat_metrics.json", 
#                   iostat_path="D:\\projet\\migration_data\\iostat_metrics.json",
#                   netstat_path="D:\\projet\\migration_data\\netstat_metrics.json",
#                   process_path="D:\\projet\\migration_data\\process_metrics.json",
#                   sample_size=5000):
#     """
#     Load system metrics from JSON files
#     This is an alternative to loading from the database
#     """
#     try:
#         # Load vmstat data
#         with open(vmstat_path, 'r') as f:
#             vmstat_data = json.load(f)
#         # Convert to DataFrame and sample
#         vmstat_df = pd.DataFrame(vmstat_data)
#         if len(vmstat_df) > sample_size:
#             vmstat_df = vmstat_df.sample(sample_size, random_state=42)
#         print(f"Loaded {len(vmstat_df)} vmstat records from JSON")
        
#         # Load iostat data
#         try:
#             with open(iostat_path, 'r') as f:
#                 iostat_data = json.load(f)
#             iostat_df = pd.DataFrame(iostat_data)
#             if len(iostat_df) > sample_size:
#                 iostat_df = iostat_df.sample(sample_size, random_state=42)
#             print(f"Loaded {len(iostat_df)} iostat records from JSON")
#         except Exception as e:
#             print(f"Error loading iostat data: {e}")
#             iostat_df = pd.DataFrame()
        
#         # Load netstat data
#         try:
#             with open(netstat_path, 'r') as f:
#                 netstat_data = json.load(f)
#             netstat_df = pd.DataFrame(netstat_data)
#             if len(netstat_df) > sample_size:
#                 netstat_df = netstat_df.sample(sample_size, random_state=42)
#             print(f"Loaded {len(netstat_df)} netstat records from JSON")
#         except Exception as e:
#             print(f"Error loading netstat data: {e}")
#             netstat_df = pd.DataFrame()
        
#         # Load process data
#         try:
#             with open(process_path, 'r') as f:
#                 process_data = json.load(f)
#             process_df = pd.DataFrame(process_data)
#             if len(process_df) > sample_size:
#                 process_df = process_df.sample(sample_size, random_state=42)
#             print(f"Loaded {len(process_df)} process records from JSON")
#         except Exception as e:
#             print(f"Error loading process data: {e}")
#             process_df = pd.DataFrame()
        
#         # Merge datasets (similar to database loading method)
#         # Start with vmstat as the base
#         merged_df = vmstat_df
        
#         # Function to merge data frames approximately on timestamp
#         def merge_on_approximate_timestamp(base_df, add_df, suffix):
#             if add_df.empty:
#                 return base_df
            
#             # Make sure timestamp columns are datetime type
#             for df in [base_df, add_df]:
#                 if 'timestamp' in df.columns and not pd.api.types.is_datetime64_dtype(df['timestamp']):
#                     df['timestamp'] = pd.to_datetime(df['timestamp'])
            
#             # Find closest timestamp pairs
#             merged = pd.merge_asof(
#                 base_df.sort_values('timestamp'),
#                 add_df.sort_values('timestamp'),
#                 on='timestamp',
#                 direction='nearest',
#                 tolerance=pd.Timedelta('5m'),  # Allow 5 minute tolerance
#                 suffixes=('', suffix)
#             )
#             return merged
        
#         # Try to merge with each dataset
#         if not iostat_df.empty:
#             try:
#                 merged_df = merge_on_approximate_timestamp(merged_df, iostat_df, '_iostat')
#             except Exception as e:
#                 print(f"Error merging iostat data: {e}")
        
#         if not netstat_df.empty:
#             try:
#                 merged_df = merge_on_approximate_timestamp(merged_df, netstat_df, '_netstat')
#             except Exception as e:
#                 print(f"Error merging netstat data: {e}")
        
#         if not process_df.empty:
#             try:
#                 # Group process data by timestamp and compute averages
#                 process_agg = process_df.groupby('timestamp').agg({
#                     'cpu': 'mean',
#                     'mem': 'mean',
#                     'pid': 'count'
#                 }).rename(columns={'pid': 'process_count'}).reset_index()
                
#                 merged_df = merge_on_approximate_timestamp(merged_df, process_agg, '_process')
#             except Exception as e:
#                 print(f"Error merging process data: {e}")
        
#         # Add classification labels
#         merged_df['state'] = 0  # normal by default
        
#         # High load when CPU usage > 60% or runnable processes > 5
#         if 'idle' in merged_df.columns:
#             high_load = ((1 - merged_df['idle']) > 0.6) | (merged_df['r'] > 5)
#             merged_df.loc[high_load, 'state'] = 1
        
#             # Critical when CPU usage > 85% or free memory < 2000
#             critical = ((1 - merged_df['idle']) > 0.85) | (merged_df['fre'] < 2000)
#             merged_df.loc[critical, 'state'] = 2
        
#         return merged_df
        
#     except Exception as e:
#         print(f"Error loading data from JSON: {e}")
#         print("Falling back to sample data...")
#         return generate_sample_data()


In [4]:
# def generate_sample_data(n_samples=1000):
#     """
#     Generate sample system metrics data for testing
#     """
#     print("Generating sample data for demonstration...")
    
#     np.random.seed(42)
    
#     # Create timestamp index
#     timestamps = pd.date_range(start='2024-01-01', periods=n_samples, freq='5min')
    
#     # VM stats
#     r = np.random.poisson(2, n_samples)  # runnable processes
#     b = np.random.poisson(1, n_samples)  # processes in uninterruptible sleep
#     avm = np.random.normal(8000, 1000, n_samples)  # active virtual memory
#     fre = np.random.normal(4000, 800, n_samples)  # free memory
    
#     # CPU stats
#     us = np.clip(np.random.beta(2, 5, n_samples), 0, 1)  # user CPU time
#     sy = np.clip(np.random.beta(1.5, 6, n_samples), 0, 1)  # system CPU time
#     idle = 1 - (us + sy)  # idle CPU time
#     idle = np.clip(idle, 0, 1)
    
#     # Disk stats - create patterns for 2 disks
#     tps1 = np.random.gamma(2, 10, n_samples)
#     kb_read1 = np.random.gamma(3, 500, n_samples)
#     kb_wrtn1 = np.random.gamma(2, 300, n_samples)
    
#     tps2 = np.random.gamma(1.5, 8, n_samples)
#     kb_read2 = np.random.gamma(2, 400, n_samples)
#     kb_wrtn2 = np.random.gamma(1.5, 250, n_samples)
    
#     # Network stats - 2 interfaces
#     ipkts1 = np.random.gamma(5, 200, n_samples)
#     opkts1 = np.random.gamma(4, 180, n_samples)
    
#     ipkts2 = np.random.gamma(3, 150, n_samples)
#     opkts2 = np.random.gamma(2.5, 130, n_samples)
    
#     # Create the DataFrame
#     df = pd.DataFrame({
#         'timestamp': timestamps,
#         'r': r,
#         'b': b,
#         'avm': avm,
#         'fre': fre,
#         'us': us,
#         'sy': sy,
#         'idle': idle,
#         'disk1_tps': tps1,
#         'disk1_kb_read': kb_read1,
#         'disk1_kb_wrtn': kb_wrtn1,
#         'disk2_tps': tps2,
#         'disk2_kb_read': kb_read2,
#         'disk2_kb_wrtn': kb_wrtn2,
#         'net1_ipkts': ipkts1,
#         'net1_opkts': opkts1,
#         'net2_ipkts': ipkts2,
#         'net2_opkts': opkts2
#     })
    
#     # Add some anomalies 
#     # High load periods
#     anomaly_indices = np.random.choice(range(n_samples), size=50, replace=False)
#     for idx in anomaly_indices:
#         df.loc[idx, 'us'] = np.clip(df.loc[idx, 'us'] * 2.5, 0, 1)
#         df.loc[idx, 'sy'] = np.clip(df.loc[idx, 'sy'] * 2, 0, 1)
#         df.loc[idx, 'idle'] = np.clip(1 - (df.loc[idx, 'us'] + df.loc[idx, 'sy']), 0, 1)
#         df.loc[idx, 'r'] = df.loc[idx, 'r'] * 3
#         df.loc[idx, 'avm'] = df.loc[idx, 'avm'] * 1.5
#         df.loc[idx, 'fre'] = df.loc[idx, 'fre'] * 0.6
    
#     # Add classification labels
#     # 0: normal, 1: high load, 2: critical
#     df['state'] = 0  # normal by default
    
#     # High load when CPU usage > 60% or runnable processes > 5
#     high_load = ((1 - df['idle']) > 0.6) | (df['r'] > 5)
#     df.loc[high_load, 'state'] = 1
    
#     # Critical when CPU usage > 85% or free memory < 2000
#     critical = ((1 - df['idle']) > 0.85) | (df['fre'] < 2000)
#     df.loc[critical, 'state'] = 2
    
#     # return df

# Load the data
You can choose from three approaches:
 1. Direct DB connection (may be slow for large datasets)
 2. JSON files (more efficient for large datasets)
 3. Sample data (for testing)

In [5]:
# Approach 1: Direct DB connection with sampling
# Configure your database connection here
# db_config = {
#     'host': 'postgres',
#     'port': '5432',
#     'dbname': 'aix_monitor',
#     'user': 'postgres',
#     'password': '%40dmin**%40%402025'  # URL-encoded password
# }

# Option 1: Load directly from database with sampling
# Adjust sample_size based on your machine's memory capacity
# data = load_system_metrics(db_config=db_config, sample_size=2000, data_source="db")

# Option 2: Load from JSON files (if you've exported your data to JSON)
# data = load_from_json()

# Option 3: Use sample data (for testing)
# data = load_system_metrics(use_sample_data=True)


In [6]:
vmpath = "D:\\projet\\migration_data\\vmstat_metrics.json"
iopath = "D:\\projet\\migration_data\\iostat_metrics.json"
netpath = "D:\\projet\\migration_data\\netstat_metrics.json"
propath = "D:\\projet\\migration_data\\process_metrics.json"

In [7]:
# Read JSON first, then rename columns
vmstat_df = pd.read_json(vmpath)
iostat_df = pd.read_json(iopath)
netstat_df = pd.read_json(netpath)
process_df = pd.read_json(propath)
vmstat_df.columns = [
    'timestamp', 
    'r', 
    'b', 
    'avm', 
    'fre', 
    'pi',
    'po',
    'fr',
    'in',
    'cs',
    'us', 
    'sy', 
    'idle'
]
iostat_df.columns = [
    'timestamp', 
    'disk', 
    'tps', 
    'kB_read', 
    'kB_wrtn', 
    'service_time'
]
netstat_df.columns = [
    'timestamp', 
    'interface', 
    'ipkts', 
    'ierrs',
    'opkts',
    'oerrs', 
    'coll'
]
process_df.columns = [
    'timestamp', 
    'user',
    'pid', 
    'cpu', 
    'mem', 
    'command',
    'do notknow',
    'donotknow2'
]

In [8]:
iostat_df.head()

Unnamed: 0,timestamp,disk,tps,kB_read,kB_wrtn,service_time
0,2025-03-21T13:30:05.498549,hdisk3,0.0,0,0,0
1,2025-03-21T13:30:05.498549,hdisk2,0.0,0,0,0
2,2025-03-21T13:30:05.498549,hdisk1,0.0,0,0,0
3,2025-03-21T13:30:05.498549,hdisk0,3.0,144,36,0
4,2025-03-21T13:30:11.566351,hdisk3,0.0,0,0,0


In [9]:
vmstat_df.head()

Unnamed: 0,timestamp,r,b,avm,fre,pi,po,fr,in,cs,us,sy,idle
0,2025-03-21T13:30:05.498549,3,0,0,0,0,0,0,0,66,52546.0,27.0,46.0
1,2025-03-21T13:30:11.566351,0,0,0,0,0,0,0,0,43,61322.0,22.0,44.0
2,2025-03-21T13:30:18.075893,2,0,0,0,0,0,0,0,36,65857.0,21.0,43.0
3,2025-03-21T13:30:23.467769,0,0,0,0,0,0,0,0,37,59241.0,21.0,43.0
4,2025-03-21T13:30:28.041217,1,0,0,0,0,0,0,0,47,59518.0,21.0,43.0


In [10]:
netstat_df.head()

Unnamed: 0,timestamp,interface,ipkts,ierrs,opkts,oerrs,coll
0,2025-03-24T09:48:02.535188,en0,384380616,0,586361020,0,0
1,2025-03-24T09:48:02.535188,en0,384380616,0,586361020,0,0
2,2025-03-24T09:48:02.535188,lo0,4302493,0,4302493,0,0
3,2025-03-24T09:48:07.828480,en0,384380685,0,586361096,0,0
4,2025-03-24T09:48:07.828480,en0,384380685,0,586361096,0,0


In [11]:
process_df.head(5)

Unnamed: 0,timestamp,user,pid,cpu,mem,command,do notknow,donotknow2
0,2025-04-14T12:10:12.359560,11338124,root,5.2,0,59002:32 sudo vi /etc/g,,
1,2025-04-14T12:10:12.359560,1048872,root,2.3,0,28819:44 wait,,
2,2025-04-14T12:10:12.359560,983334,root,2.3,0,28803:30 wait,,
3,2025-04-14T12:10:12.359560,917796,root,1.8,0,21866:59 wait,,
4,2025-04-14T12:10:12.359560,7274790,root,1.7,0,18881:29 vi /etc/group,,


In [12]:
def perform_eda(df):
    """
    Perform exploratory data analysis on system metrics
    """
    print("Data shape:", df.shape)
    print("\nData types:")
    print(df.dtypes)
    
    print("\nBasic statistics:")
    display(df.describe())
    
    # Missing values check
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print("\nMissing values:")
        print(missing[missing > 0])
    else:
        print("\nNo missing values found")
    
    # # Distribution of states (classification target)
    # plt.figure(figsize=(10, 6))
    # sns.countplot(x='state', data=df)
    # plt.title('Distribution of System States')
    # plt.xlabel('State (0: Normal, 1: High Load, 2: Critical)')
    # plt.ylabel('Count')
    # plt.show()
    
    # Time series plot of key metrics
    plt.figure(figsize=(16, 12))
    
    # CPU metrics
    plt.subplot(3, 1, 1)
    plt.plot(df['timestamp'], df['us'], label='User CPU')
    plt.plot(df['timestamp'], df['sy'], label='System CPU')
    plt.plot(df['timestamp'], df['idle'], label='Idle CPU')
    plt.title('CPU Metrics Over Time')
    plt.ylabel('CPU Usage (0-1)')
    plt.legend()
    
    # Memory metrics
    plt.subplot(3, 1, 2)
    plt.plot(df['timestamp'], df['avm'], label='Active Virtual Memory')
    plt.plot(df['timestamp'], df['fre'], label='Free Memory')
    plt.title('Memory Metrics Over Time')
    plt.ylabel('Memory (MB)')
    plt.legend()
    
    # Process metrics
    plt.subplot(3, 1, 3)
    plt.plot(df['timestamp'], df['r'], label='Runnable Processes')
    plt.plot(df['timestamp'], df['b'], label='Blocked Processes')
    plt.title('Process Metrics Over Time')
    plt.ylabel('Count')
    plt.legend()
    
    plt.tight_layout()
    plt.show()
    
    # # Correlation matrix
    # plt.figure(figsize=(14, 12))
    # numeric_cols = df.select_dtypes(include=np.number).columns
    # corr_matrix = df[numeric_cols].corr()
    # sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    # plt.title('Correlation Matrix of System Metrics')
    # plt.xticks(rotation=45, ha='right')
    # plt.tight_layout()
    # plt.show()
    
    # Feature distributions by state
    # key_metrics = ['us', 'sy', 'idle', 'r', 'avm', 'fre']
    # plt.figure(figsize=(16, 12))
    
    # for i, metric in enumerate(key_metrics):
    #     plt.subplot(3, 2, i+1)
    #     for state in sorted(df['state'].unique()):
    #         sns.kdeplot(df[df['state'] == state][metric], label=f'State {state}')
    #     plt.title(f'Distribution of {metric} by State')
    #     plt.legend()
    
    plt.tight_layout()
    plt.show()


In [None]:
perform_eda(vmstat_df)

Data shape: (227778, 13)

Data types:
timestamp     object
r              int64
b              int64
avm            int64
fre            int64
pi             int64
po             int64
fr             int64
in             int64
cs             int64
us           float64
sy           float64
idle         float64
dtype: object

Basic statistics:


Unnamed: 0,r,b,avm,fre,pi,po,fr,in,cs,us,sy,idle
count,227778.0,227778.0,227778.0,227778.0,227778.0,227778.0,227778.0,227778.0,227778.0,227778.0,227778.0,227778.0
mean,1.658128,0.000869,0.0,0.0,0.0,9.479594,16.577497,0.0,55.802751,131.35083,23.787323,42.676689
std,1.01662,0.029619,0.0,0.0,0.0,478.67467,1027.290341,0.0,139.369689,2099.161786,8.872937,3.320829
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.0,14.126,0.02,0.043
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,60.364,21.0,43.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,63.411,21.0,43.0
75%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47.0,65.818,21.0,44.0
max,10.0,2.0,0.0,0.0,0.0,93102.0,161909.0,0.0,14936.0,71396.0,90.0,89.0



No missing values found


KeyboardInterrupt: 

: 