In [1]:
import subprocess
import pandas as pd

def run_command(cmd):
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    output, error = process.communicate()

    if error:
        raise Exception(f"Error executing command '{cmd}': {error.decode('utf-8')}")

    return output.decode('utf-8')

def get_slurm_jobs(username=None):
    base_cmd = "squeue -o '%.18i %.9P %.8j %.8u %.2t %.10M %.6D %R'"
    if username:
        base_cmd += f" -u {username}"
    output = run_command(base_cmd)

    lines = output.split('\n')[1:]  # Skip the header line
    data = [line.split() for line in lines if line]  # Skip empty lines
    return data

def get_slurm_accounting_data(username=None):
    base_cmd = "sacct -X --format=JobID,Partition,State,Elapsed,User --parsable2"
    if username:
        base_cmd += f" -u {username}"
    output = run_command(base_cmd)

    lines = output.split('\n')[2:]  # Skip the header lines
    data = [line.split('|') for line in lines if line]  # Skip empty lines
    return data

def convert_to_dataframe(data, columns):
    df = pd.DataFrame(data, columns=columns)
    return df

# Example usage
username = 'joeschm'  # Specify the username here
# jobs_data = get_slurm_jobs(username)
# jobs_df = convert_to_dataframe(jobs_data, ['JobID', 'Partition', 'Name', 'User', 'State', 'Time', 'Nodes', 'Nodelist'])
# print(jobs_df)

# For accounting data
acct_data = get_slurm_accounting_data(username)
acct_df = convert_to_dataframe(acct_data, ['JobID', 'Partition', 'State', 'Elapsed', 'User'])
print(acct_df)


       JobID           Partition               State   Elapsed     User
0   22242853  regular_milan_ss11             TIMEOUT  00:30:01  joeschm
1   22243206  regular_milan_ss11             TIMEOUT  00:30:04  joeschm
2   22243472  regular_milan_ss11             TIMEOUT  00:30:05  joeschm
3   22243871  regular_milan_ss11           COMPLETED  00:23:18  joeschm
4   22246009  regular_milan_ss11             TIMEOUT  00:30:26  joeschm
5   22246122  regular_milan_ss11           COMPLETED  00:12:40  joeschm
6   22246517  regular_milan_ss11             TIMEOUT  00:30:18  joeschm
7   22255027  regular_milan_ss11             TIMEOUT  00:30:14  joeschm
8   22255214  regular_milan_ss11             TIMEOUT  00:30:15  joeschm
9   22255293  regular_milan_ss11           COMPLETED  00:25:24  joeschm
10  22257129  regular_milan_ss11             TIMEOUT  00:30:22  joeschm
11  22257165  regular_milan_ss11           COMPLETED  00:00:21  joeschm
12  22261717  regular_milan_ss11           COMPLETED  00:16:12  

In [2]:
from datetime import datetime, timedelta
import subprocess
import pandas as pd

def run_command(cmd):
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    output, error = process.communicate()

    if error:
        raise Exception(f"Error executing command '{cmd}': {error.decode('utf-8')}")

    return output.decode('utf-8')

def get_slurm_accounting_data_chunked(username=None, days_back=180, chunk_size=30):
    all_data = []
    end_date = datetime.now()
    start_date = end_date - timedelta(days=chunk_size)

    while days_back > 0:
        start_time = start_date.strftime("%Y-%m-%d")
        end_time = end_date.strftime("%Y-%m-%d")
        # Update the format here to match the requested columns
        format_str = "JobID,Submit,Start,End,NNodes,QOS,State,Elapsed,User"
        base_cmd = f"sacct -X --format={format_str} --parsable2 --starttime={start_time} --endtime={end_time}"
        if username:
            base_cmd += f" -u {username}"
        output = run_command(base_cmd)

        lines = output.split('\n')[2:]  # Skip the header lines
        data = [line.split('|') for line in lines if line]  # Skip empty lines
        all_data.extend(data)

        # Update dates for the next chunk
        end_date = start_date
        start_date -= timedelta(days=chunk_size)
        days_back -= chunk_size

    return all_data

def convert_to_dataframe(data, columns):
    df = pd.DataFrame(data, columns=columns)
    return df




# def load_existing_data(filename):
#     try:
#         existing_df = pd.read_csv(filename)
#         if not existing_df.empty:
#             # Assuming 'Submit' is in YYYY-MM-DD format; adjust parsing as necessary
#             latest_date = pd.to_datetime(existing_df['Submit']).max()
#             return existing_df, latest_date
#     except FileNotFoundError:
#         pass
#     return pd.DataFrame(), None



# def adjusted_days_back(latest_date):
#     if latest_date is not None:
#         now = datetime.now()
#         delta = now - latest_date
#         return max(0, delta.days - 1)  # Subtract 1 to avoid overlap; adjust as needed
#     return 180  # Default days_back value if no existing data



# Example usage
username = 'joeschm'  # Specify the username here
days_back = 10  # Specify the total number of days back you want to fetch data for
chunk_size = 10  # Specify the chunk size for each query
acct_data = get_slurm_accounting_data_chunked(username, days_back, chunk_size)
acct_df = convert_to_dataframe(acct_data, ['JobID', 'Submit', 'Start', 'End', 'NNodes', 'QOS', 'State', 'Elapsed', 'User'])
print(acct_df)


filename = 'slurm_jobs_data.csv'  # Specify your desired path and filename
acct_df.to_csv(filename, index=False)
print(f"DataFrame saved to {filename}")


# # Load existing data and determine how far back to query
# filename = 'slurm_jobs_data.csv'
# existing_df, latest_date = load_existing_data(filename)
# new_days_back = adjusted_days_back(latest_date)



# # Fetch new data based on adjusted days_back
# if new_days_back > 0:
#     acct_data = get_slurm_accounting_data_chunked(username, new_days_back, chunk_size)
#     new_df = convert_to_dataframe(acct_data, ['JobID', 'Submit', 'Start', 'End', 'NNodes', 'QOS', 'State', 'Elapsed', 'User'])

#     # Combine new data with existing data
#     combined_df = pd.concat([existing_df, new_df], ignore_index=True)
# else:
#     combined_df = existing_df  # No new data to fetch

# # Save combined DataFrame to CSV
# combined_df.to_csv(filename, index=False)
# print(f"Updated DataFrame saved to {filename}")

# print(combined_df)


       JobID               Submit                Start                  End  \
0   22111430  2024-02-23T16:17:10  2024-02-23T16:23:27  2024-02-23T16:53:54   
1   22113855  2024-02-23T18:04:44  2024-02-23T18:05:12  2024-02-23T18:24:58   
2   22114095  2024-02-23T18:11:11  2024-02-23T18:11:18  2024-02-23T18:41:26   
3   22114264  2024-02-23T18:27:12  2024-02-23T18:28:42  2024-02-23T18:33:49   
4   22136583  2024-02-24T16:09:38  2024-02-24T16:09:48  2024-02-24T16:40:17   
5   22138909  2024-02-24T17:33:45  2024-02-24T17:39:15  2024-02-24T17:45:42   
6   22175721  2024-02-25T14:58:27  2024-02-25T14:59:14  2024-02-25T15:04:34   
7   22175776  2024-02-25T15:00:33  2024-02-25T15:20:49  2024-02-25T15:25:57   
8   22176043  2024-02-25T15:24:51                 None  2024-02-25T15:44:08   
9   22176362  2024-02-25T15:30:27  2024-02-25T15:53:39  2024-02-25T16:38:15   
10  22176591  2024-02-25T15:45:38  2024-02-25T15:45:52  2024-02-25T15:46:31   
11  22176686  2024-02-25T15:47:16  2024-02-25T15:47: