# AWS RDS - CPU
This notebook shows the CPU utilization of an AWS RDS instance.

This early version of the notebook retrieves the data as-is, without any analysis of the information. 
## Cells
- Configuration - open an AWS session to CloudWatch. 
- Get a list of RDS servers
- Show the CPU (min, max, avg) of a given RDS, between two dates. 

## Resources
- https://www.metisdata.io/blog/hold-your-horses-postgres-how-to-debug-high-cpu-usage

## Configuration
- Configure the AWS credentials: access_key_id, secret_access_key, region_name
- Configure the PG connection string

In [17]:
import configparser

# Load the AWS credentials from the notebook configuration file.
# The parser is created before the file is opened so that `config` still
# exists (empty) for the lookups below when the file cannot be read.
config = configparser.ConfigParser()
try:
    # The context manager closes the file handle even if parsing fails.
    with open(r'../ipynb.cfg') as config_file:
        config.read_file(config_file)
except Exception as e:
    print(f"Error opening the configuration file: {e}")

# The region is a constant rather than a config lookup, so it is set
# independently of whether the credentials could be read.
region_name = 'eu-central-1'

try:
    # con_str = config.get('con_str', 'PG_AIRBASES')
    access_key_id = config.get('credentials', 'ACCESS_KEY_ID')
    secret_access_key = config.get('credentials', 'SECRET_ACCESS_KEY')
except Exception as e:
    # Reaching this point means a section/option lookup failed, not the open().
    print(f"Error reading the AWS credentials from the configuration file: {e}")



## Prereq
- pip install boto3
- pip install dash

## Show the CPU 
Uses Boto3 to get the CPU utilization data of the RDS instance from CloudWatch. 

TODO: Calculate the actual CPU, memory and IO from the instance type (such as  db.m6g.large)

In [18]:
import boto3
import pandas as pd
from datetime import datetime, timedelta
import plotly.graph_objects as go


def rds_cpu(rds_instance_id, session, period=60):
    """Fetch the average CPU utilization of one RDS instance for the last 7 days.

    Args:
        rds_instance_id: Value sent as the ``DBInstanceIdentifier`` dimension.
        session: Object whose ``client('cloudwatch')`` returns a CloudWatch
            client (presumably a ``boto3.Session`` -- confirm against caller).
        period: Granularity of the datapoints in seconds, e.g. 300 means
            5 minutes.

    Returns:
        A DataFrame with a ``Timestamp`` and a ``CPUUtilization`` column built
        from the first metric result, or ``None`` when any exception was raised
        (the error is printed instead of propagated).
    """
    metric_name = 'CPUUtilization'

    try:
        cloudwatch = session.client('cloudwatch')

        # The window always ends "now" (naive UTC) and spans seven days.
        window_end = datetime.utcnow()
        window_start = window_end - timedelta(days=7)

        instance_dimension = {
            'Name': 'DBInstanceIdentifier',
            'Value': rds_instance_id,
        }
        average_query = {
            'Id': 'm1',
            'MetricStat': {
                'Metric': {
                    'Namespace': 'AWS/RDS',
                    'MetricName': metric_name,
                    'Dimensions': [instance_dimension],
                },
                'Period': period,
                'Stat': 'Average',
            },
            'ReturnData': True,
        }

        # The time bounds are sent as ISO-8601 strings.
        response = cloudwatch.get_metric_data(
            MetricDataQueries=[average_query],
            StartTime=window_start.isoformat(),
            EndTime=window_end.isoformat(),
        )

        first_result = response['MetricDataResults'][0]
        timestamps = first_result['Timestamps']
        values = first_result['Values']
        return pd.DataFrame({'Timestamp': timestamps, metric_name: values})

    except Exception as e:
        print("An error occurred while retrieving CPU utilization:", str(e))
        return None

# Identifier of the RDS instance to query; replace it with your own instance name.
rds_instance_id = 'database-2'

# Fetch the CPU utilization data using rds_cpu's default period.
# NOTE(review): `session` is not assigned in this cell -- presumably it is created
# by another cell that has to be run first; confirm the intended execution order.
cpu_data = rds_cpu(rds_instance_id, session)

# rds_cpu as defined above returns None on failure, so only print a real result.
if cpu_data is not None:
    print(cpu_data)


                      Timestamp  CPUUtilization
0     2023-11-02 11:21:00+00:00        7.075000
1     2023-11-02 11:20:00+00:00        6.608333
2     2023-11-02 11:19:00+00:00        6.375000
3     2023-11-02 11:18:00+00:00        6.225000
4     2023-11-02 11:17:00+00:00        6.766667
...                         ...             ...
10074 2023-10-26 11:26:00+00:00       45.958333
10075 2023-10-26 11:25:00+00:00       52.008333
10076 2023-10-26 11:24:00+00:00       42.425000
10077 2023-10-26 11:23:00+00:00       48.991667
10078 2023-10-26 11:22:00+00:00       39.583333

[10079 rows x 2 columns]


In [11]:
import boto3
import pandas as pd
from datetime import datetime, timedelta

def create_aws_session(access_key_id, secret_access_key, region_name):
    """Build a boto3 session from explicit credentials.

    Args:
        access_key_id: Passed to ``boto3.Session`` as ``aws_access_key_id``.
        secret_access_key: Passed as ``aws_secret_access_key``.
        region_name: Passed as ``region_name``.

    Returns:
        The new ``boto3.Session``, or ``None`` if constructing it raised
        (the error is printed instead of propagated).
    """
    session_kwargs = {
        'aws_access_key_id': access_key_id,
        'aws_secret_access_key': secret_access_key,
        'region_name': region_name,
    }
    try:
        return boto3.Session(**session_kwargs)
    except Exception as e:
        print("An error occurred while creating the AWS session:", str(e))
    return None

def round_to_nearest_10_minutes(dt):
    """Truncate *dt* to the start of its 10-minute bucket.

    Despite the name, the value is always rounded down: 11:27:45 becomes
    11:20:00, never 11:30:00. Seconds and microseconds are zeroed; every
    other field of *dt* is kept.
    """
    floored_minute = dt.minute - dt.minute % 10
    return dt.replace(minute=floored_minute, second=0, microsecond=0)

def rds_cpu(rds_instance_id, session, period=10):
    """Fetch average, minimum and maximum CPU utilization for the last day.

    The window ends at the current UTC time passed through
    ``round_to_nearest_10_minutes`` and starts one day before that.

    Args:
        rds_instance_id: Value sent as the ``DBInstanceIdentifier`` dimension.
        session: Object whose ``client('cloudwatch')`` returns a CloudWatch
            client (presumably a ``boto3.Session`` -- confirm against caller).
        period: Granularity of the datapoints in seconds.
            NOTE(review): the default of 10 is below one minute; confirm that
            CloudWatch accepts it for a one-day window.

    Returns:
        A DataFrame with ``Timestamp``, ``Average_CPUUtilization``,
        ``Minimum_CPUUtilization`` and ``Maximum_CPUUtilization`` columns, or
        ``None`` when any exception was raised (the error is printed).
    """
    metric_name = 'CPUUtilization'
    # One query per statistic. The results are read back by position
    # (0, 1, 2) in this same order.
    statistics = (('m1', 'Average'), ('m2', 'Minimum'), ('m3', 'Maximum'))

    try:
        cloudwatch = session.client('cloudwatch')

        window_end = round_to_nearest_10_minutes(datetime.utcnow())
        window_start = window_end - timedelta(days=1)

        queries = [
            {
                'Id': query_id,
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/RDS',
                        'MetricName': metric_name,
                        'Dimensions': [
                            {
                                'Name': 'DBInstanceIdentifier',
                                'Value': rds_instance_id,
                            },
                        ],
                    },
                    'Period': period,
                    'Stat': statistic,
                },
                'ReturnData': True,
            }
            for query_id, statistic in statistics
        ]

        response = cloudwatch.get_metric_data(
            MetricDataQueries=queries,
            StartTime=window_start.isoformat(),
            EndTime=window_end.isoformat(),
        )

        results = response['MetricDataResults']
        avg_values, min_values, max_values = (
            results[position]['Values'] for position in range(3)
        )
        # Only the first result's timestamps are used for all three columns.
        timestamps = results[0]['Timestamps']

        return pd.DataFrame({
            'Timestamp': timestamps,
            'Average_CPUUtilization': avg_values,
            'Minimum_CPUUtilization': min_values,
            'Maximum_CPUUtilization': max_values,
        })

    except Exception as e:
        print("An error occurred while retrieving CPU utilization:", str(e))
        return None

# Identifier of the RDS instance to query; replace it with your own instance name.
rds_instance_id = 'database-2'
# Build the session from the credentials and region set in the configuration cell.
session = create_aws_session(access_key_id, secret_access_key, region_name)

# Fetch average/minimum/maximum CPU utilization in 600-second (10-minute) buckets.
cpu_data = rds_cpu(rds_instance_id, session, period=600)  # overrides rds_cpu's default period of 10

# Printing the result is currently disabled.
#if cpu_data is not None:
#     print(cpu_data)


In [12]:
def rds_cpu(rds_instance_id, session, start_time, end_time=None, period=600):
    """Fetch average, minimum and maximum CPU utilization for a time window.

    Both bounds are passed through ``round_to_nearest_10_minutes`` before the
    request is sent.

    Args:
        rds_instance_id: Value sent as the ``DBInstanceIdentifier`` dimension.
        session: Object whose ``client('cloudwatch')`` returns a CloudWatch
            client (presumably a ``boto3.Session`` -- confirm against caller).
        start_time: Start of the window as a ``datetime``.
        end_time: End of the window as a ``datetime``; ``None`` means the
            current UTC time.
        period: Granularity of the datapoints in seconds.

    Returns:
        A DataFrame with ``Timestamp``, ``Average_CPUUtilization``,
        ``Minimum_CPUUtilization`` and ``Maximum_CPUUtilization`` columns, or
        ``None`` when any exception was raised (the error is printed).
    """
    metric_name = 'CPUUtilization'
    # One query per statistic. The results are read back by position
    # (0, 1, 2) in this same order.
    statistics = (('m1', 'Average'), ('m2', 'Minimum'), ('m3', 'Maximum'))

    try:
        cloudwatch = session.client('cloudwatch')

        requested_end = datetime.utcnow() if end_time is None else end_time
        window_start = round_to_nearest_10_minutes(start_time)
        window_end = round_to_nearest_10_minutes(requested_end)

        queries = [
            {
                'Id': query_id,
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/RDS',
                        'MetricName': metric_name,
                        'Dimensions': [
                            {
                                'Name': 'DBInstanceIdentifier',
                                'Value': rds_instance_id,
                            },
                        ],
                    },
                    'Period': period,
                    'Stat': statistic,
                },
                'ReturnData': True,
            }
            for query_id, statistic in statistics
        ]

        response = cloudwatch.get_metric_data(
            MetricDataQueries=queries,
            StartTime=window_start.isoformat(),
            EndTime=window_end.isoformat(),
        )

        results = response['MetricDataResults']
        avg_values, min_values, max_values = (
            results[position]['Values'] for position in range(3)
        )
        # Only the first result's timestamps are used for all three columns.
        timestamps = results[0]['Timestamps']

        return pd.DataFrame({
            'Timestamp': timestamps,
            'Average_CPUUtilization': avg_values,
            'Minimum_CPUUtilization': min_values,
            'Maximum_CPUUtilization': max_values,
        })

    except Exception as e:
        print("An error occurred while retrieving CPU utilization:", str(e))
        return None


rds_instance_id = 'database-2'

# Read the clock once so the requested window is exactly one day long.
end_time = datetime.utcnow()
start_time = end_time - timedelta(days=1)

# Get the CPU utilization data for the specified RDS instance using the existing session.
# NOTE(review): period=10 is below one minute; the CloudWatch GetMetricData docs
# describe period restrictions for start times more than 3 hours old -- confirm
# this value returns data for a one-day window.
cpu_data = rds_cpu(rds_instance_id, session, start_time, end_time, period=10)

if cpu_data is None:
    # rds_cpu prints its own error and returns None on failure; indexing None
    # below would raise a TypeError, so skip the table instead.
    print(f"No CPU utilization data was returned for {rds_instance_id}; skipping the table.")
else:
    # Convert start_time and end_time to formatted strings for the title
    start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S")
    end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S")

    # Concatenate rds_instance_id and formatted start/end times with the table title
    table_title = f"RDS CPU Utilization Data - {rds_instance_id} (From: {start_time_str} To: {end_time_str})"

    # Create a Plotly figure for the table, one column per DataFrame column
    table_fig = go.Figure(data=[go.Table(
        header=dict(values=["Timestamp", "Average CPU Utilization", "Minimum CPU Utilization", "Maximum CPU Utilization"]),
        cells=dict(values=[cpu_data['Timestamp'], cpu_data['Average_CPUUtilization'], cpu_data['Minimum_CPUUtilization'], cpu_data['Maximum_CPUUtilization']])
    )])

    # Update the layout of the table figure
    table_fig.update_layout(
        title=table_title,
        margin=dict(l=0, r=0, t=30, b=0)  # Adjust the margins as needed
    )

    # Display the table figure
    table_fig.show()


In [None]:
import plotly.subplots as sp
import plotly.graph_objs as go
import pandas as pd
from datetime import datetime, timedelta

# This cell plots the `cpu_data` DataFrame left behind by a previously run
# rds_cpu cell; it expects the Average/Minimum/Maximum columns to be present.
if cpu_data is None:
    # rds_cpu returns None when the request failed; indexing None below would
    # raise a TypeError, so skip the chart instead.
    print("No CPU utilization data is available; skipping the chart.")
else:
    # Create a subplot with multiple lines
    fig = sp.make_subplots(rows=1, cols=1)  # You can adjust the number of rows and columns as needed

    # Build one line per statistic, all sharing the Timestamp x-axis
    trace_min = go.Scatter(x=cpu_data['Timestamp'], y=cpu_data['Minimum_CPUUtilization'], mode='lines', name='Minimum CPU Utilization')
    trace_max = go.Scatter(x=cpu_data['Timestamp'], y=cpu_data['Maximum_CPUUtilization'], mode='lines', name='Maximum CPU Utilization')
    trace_avg = go.Scatter(x=cpu_data['Timestamp'], y=cpu_data['Average_CPUUtilization'], mode='lines', name='Average CPU Utilization')

    # Add the traces to the subplot (minimum, maximum, then average)
    for trace in (trace_min, trace_max, trace_avg):
        fig.add_trace(trace)

    # Customize the layout
    fig.update_layout(
        title='RDS CPU Utilization Over Time',
        xaxis=dict(title='Timestamp'),
        yaxis=dict(title='CPU Utilization (%)'),
        legend=dict(title='Metric Type')
    )

    # Show the interactive chart
    fig.show()


In [15]:
import boto3
import pandas as pd
from datetime import datetime, timedelta
import time
import plotly.graph_objects as go

def rds_cpu(rds_instance_id, session, period=10):
    """Repeatedly yield the last hour of average CPU utilization.

    This is a generator: nothing is requested until the first ``next()``.
    Each iteration queries the hour ending at the current UTC time, yields
    the result, and sleeps ``period`` seconds once the consumer asks for the
    next value.

    Args:
        rds_instance_id: Value sent as the ``DBInstanceIdentifier`` dimension.
        session: Object whose ``client('cloudwatch')`` returns a CloudWatch
            client (presumably a ``boto3.Session`` -- confirm against caller).
        period: Used both as the datapoint granularity in seconds and as the
            pause between two fetches.

    Yields:
        A DataFrame with a ``Timestamp`` and a ``CPUUtilization`` column.

    If any exception is raised, it is printed and the generator finishes, so
    the consumer's ``next()`` raises ``StopIteration``.
    """
    metric_name = 'CPUUtilization'
    lookback = timedelta(hours=1)

    try:
        cloudwatch = session.client('cloudwatch')

        while True:
            # Recompute the window on every pass so it always ends "now".
            window_end = datetime.utcnow()
            window_start = window_end - lookback

            average_query = {
                'Id': 'm1',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/RDS',
                        'MetricName': metric_name,
                        'Dimensions': [
                            {
                                'Name': 'DBInstanceIdentifier',
                                'Value': rds_instance_id,
                            },
                        ],
                    },
                    'Period': period,
                    'Stat': 'Average',
                },
                'ReturnData': True,
            }

            response = cloudwatch.get_metric_data(
                MetricDataQueries=[average_query],
                StartTime=window_start.isoformat(),
                EndTime=window_end.isoformat(),
            )

            first_result = response['MetricDataResults'][0]
            timestamps = first_result['Timestamps']
            values = first_result['Values']

            # Hand the frame to the consumer; execution resumes here on the
            # next next() call.
            yield pd.DataFrame({'Timestamp': timestamps, metric_name: values})

            time.sleep(period)

    except Exception as e:
        print("An error occurred while retrieving CPU utilization:", str(e))
        return

# Identifier of the RDS instance to query; replace it with your own instance name.
rds_instance_id = 'database-2'

# Create a new AWS session from the credentials and region set in the configuration cell.
session = boto3.Session(
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    region_name=region_name
)

# Create the generator; rds_cpu above is a generator function, so no request is
# sent until the first next() call.
cpu_data_generator = rds_cpu(rds_instance_id, session, period=1)

# Example: retrieve and print CPU utilization data for one interval
# (raise the range() argument to poll more intervals).
# NOTE(review): when rds_cpu hits an error it prints it and the generator ends,
# so next() raises StopIteration here -- confirm whether that should be handled.
for _ in range(1):
    cpu_data = next(cpu_data_generator)
    print(cpu_data)


                   Timestamp  CPUUtilization
0  2023-10-17 12:41:00+00:00        8.708333
1  2023-10-17 12:40:00+00:00        8.958333
2  2023-10-17 12:39:00+00:00        8.616667
3  2023-10-17 12:38:00+00:00        8.850000
4  2023-10-17 12:37:00+00:00        8.175000
5  2023-10-17 12:36:00+00:00        8.266667
6  2023-10-17 12:35:00+00:00        8.433333
7  2023-10-17 12:34:00+00:00        8.600000
8  2023-10-17 12:33:00+00:00        7.900000
9  2023-10-17 12:32:00+00:00        8.208333
10 2023-10-17 12:31:00+00:00        9.966667
11 2023-10-17 12:30:00+00:00        9.050000
12 2023-10-17 12:29:00+00:00        8.141667
13 2023-10-17 12:28:00+00:00        7.708333
14 2023-10-17 12:27:00+00:00        7.708333
15 2023-10-17 12:26:00+00:00        8.266667
16 2023-10-17 12:25:00+00:00        9.641667
17 2023-10-17 12:24:00+00:00        9.008333
18 2023-10-17 12:23:00+00:00        9.066667
19 2023-10-17 12:22:00+00:00        7.733333
20 2023-10-17 12:21:00+00:00        7.991667
21 2023-10