# Calculate Compute

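Normalize each job's run time by the compute it used (elapsed time × allocated CPUs × CPU benchmark score), so that jobs run on different nodes can be compared.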

In [15]:
# Packages
import pandas as pd

## Job History Data

In [16]:
# Names of the model-fitting jobs whose run times are compared below.
# The strings must match the JobName column of the job history exactly
# (note the mixed 'Fit_' / 'fit_' capitalisation).
list_of_jobs = [
    # Logistic regression
    'Fit_Logistic_Regression_Classifier_All_Data',
    'Fit_Logistic_Regression_Classifier_All_Features_PCA',
    'Fit_Logistic_Regression_Classifier_Individual_Features_PCA',
    # SVM -- the All_Data variant is deliberately left out of the list
    #'Fit_SVM_Classifier_All_Data',
    'Fit_SVM_Classifier_All_Features_PCA',
    'Fit_SVM_Classifier_Individual_Features_PCA',
    # XGBoost
    'fit_XGBoost_All_data',
    'fit_XGBoost_All_features_PCA',
    'fit_XGBoost_Individual_Features_PCA',
]

In [17]:
# Load the job history export: a '|'-separated text file with one row per
# job or job step (JobID, JobName, State, Start, End, AllocCPUS, NodeList, ...).
job_history = pd.read_csv(
    '../../Data/Compute/job_history.txt',
    sep='|',
)
job_history

Unnamed: 0,JobID,JobName,Partition,State,Start,End,AllocNodes,AllocCPUS,NodeList
0,385319,jupyterhub,standalone,CANCELLED by 3581,2024-03-09T14:21:23,2024-03-09T23:02:28,1,1,arwen
1,385319.batch,batch,,CANCELLED,2024-03-09T14:21:23,2024-03-09T23:03:00,1,1,arwen
2,385319.extern,extern,,COMPLETED,2024-03-09T14:21:23,2024-03-09T23:02:28,1,1,arwen
3,385387,transcript_ner,jsteinhardt,FAILED,2024-03-09T21:07:13,2024-03-09T21:07:19,1,1,saruman
4,385387.batch,batch,,FAILED,2024-03-09T21:07:13,2024-03-09T21:07:19,1,1,saruman
...,...,...,...,...,...,...,...,...,...
730,439672.extern,extern,,COMPLETED,2024-04-09T21:25:33,2024-04-09T21:26:16,1,1,smokyquartz
731,439672.0,bash,,COMPLETED,2024-04-09T21:25:33,2024-04-09T21:26:16,1,1,smokyquartz
732,439673,bash,jsteinhardt,COMPLETED,2024-04-09T21:26:33,2024-04-09T21:27:09,1,1,rainbowquartz
733,439673.extern,extern,,COMPLETED,2024-04-09T21:26:33,2024-04-09T21:27:09,1,1,rainbowquartz


In [18]:
# Selected jobs: keep only rows whose JobName is in list_of_jobs and whose
# State is 'COMPLETED', then keep, for each JobName, the run that ended last.
completed_jobs = job_history[
    job_history['JobName'].isin(list_of_jobs)
    & (job_history['State'] == 'COMPLETED')
]

# Compare End as real timestamps rather than as strings, and pick the latest
# run per job with a grouped idxmax. This avoids groupby.apply with a
# row-returning lambda, which is deprecated when it touches the grouping
# column and pushes every row through an object-dtype Series.
end_times = pd.to_datetime(completed_jobs['End'])
latest_run_index = end_times.groupby(completed_jobs['JobName']).idxmax()

# Rows come back ordered by JobName (groupby sorts its keys), one row per job.
# TimeElapsed is the wall-clock duration of the run: End minus Start.
selected_jobs = (
    completed_jobs.loc[latest_run_index]
    .reset_index(drop=True)
    .assign(TimeElapsed=lambda jobs: pd.to_datetime(jobs['End']) - pd.to_datetime(jobs['Start']))
)
selected_jobs

Unnamed: 0,JobID,JobName,Partition,State,Start,End,AllocNodes,AllocCPUS,NodeList,TimeElapsed
0,434001,Fit_Logistic_Regression_Classifier_All_Data,jsteinhardt,COMPLETED,2024-04-07T15:19:21,2024-04-08T16:23:21,1,16,shadowfax,1 days 01:04:00
1,434003,Fit_Logistic_Regression_Classifier_All_Feature...,jsteinhardt,COMPLETED,2024-04-07T15:19:38,2024-04-07T19:27:29,1,16,smokyquartz,0 days 04:07:51
2,434002,Fit_Logistic_Regression_Classifier_Individual_...,jsteinhardt,COMPLETED,2024-04-07T15:19:30,2024-04-07T22:24:49,1,16,shadowfax,0 days 07:05:19
3,435811,Fit_SVM_Classifier_All_Features_PCA,jsteinhardt,COMPLETED,2024-04-07T23:26:10,2024-04-08T02:21:37,1,16,sunstone,0 days 02:55:27
4,435809,Fit_SVM_Classifier_Individual_Features_PCA,jsteinhardt,COMPLETED,2024-04-07T23:25:50,2024-04-08T07:05:06,1,16,sunstone,0 days 07:39:16
5,434006,fit_XGBoost_All_data,jsteinhardt,COMPLETED,2024-04-07T15:20:30,2024-04-07T19:15:20,1,16,smokyquartz,0 days 03:54:50
6,434005,fit_XGBoost_All_features_PCA,epurdom,COMPLETED,2024-04-07T15:20:20,2024-04-07T15:55:20,1,16,frodo,0 days 00:35:00
7,434004,fit_XGBoost_Individual_Features_PCA,epurdom,COMPLETED,2024-04-07T15:20:11,2024-04-07T16:02:43,1,16,frodo,0 days 00:42:32


## CPU data

In [19]:
# Load the per-node CPU table (partition, node, CPU model, passmark score)
# from the Excel workbook.
scf_cpus = pd.read_excel(
    io='../../Data/Compute/scf_cpus.xlsx',
)
scf_cpus

Unnamed: 0,partition,node,CPU model,passmark score
0,jsteinhardt,shadowfax,Intel Xeon Silver 4214 CPU @ 2.20GHz,16190
1,jsteinhardt,balrog,Intel Xeon Gold 5220R CPU @ 2.20GHz,33370
2,jsteinhardt,saruman,Intel Xeon Gold 5320 CPU @ 2.20GHz,37558
3,jsteinhardt,sunstone,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986
4,epurdom,frodo,AMD EPYC 7543 32-Core Processor,60132
5,jsteinhardt,smokyquartz,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986
6,jsteinhardt,rainbowquartz,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986


## Join on NodeList and node

In [20]:
# Attach CPU details to each selected job by matching the job's NodeList to
# the CPU table's node column.
# validate='many_to_one' makes pandas raise a MergeError if a node is listed
# more than once in scf_cpus, which would otherwise silently duplicate jobs.
job_cpu = pd.merge(
    selected_jobs,
    scf_cpus,
    how='left',
    left_on='NodeList',
    right_on='node',
    validate='many_to_one',
)

# The left join keeps jobs whose node has no entry in scf_cpus, leaving NaN
# in the CPU columns; fail loudly here instead of propagating NaN downstream.
nodes_missing_score = job_cpu.loc[job_cpu['passmark score'].isna(), 'NodeList'].unique().tolist()
assert not nodes_missing_score, f"Missing passmark score for nodes: {nodes_missing_score}"
job_cpu

Unnamed: 0,JobID,JobName,Partition,State,Start,End,AllocNodes,AllocCPUS,NodeList,TimeElapsed,partition,node,CPU model,passmark score
0,434001,Fit_Logistic_Regression_Classifier_All_Data,jsteinhardt,COMPLETED,2024-04-07T15:19:21,2024-04-08T16:23:21,1,16,shadowfax,1 days 01:04:00,jsteinhardt,shadowfax,Intel Xeon Silver 4214 CPU @ 2.20GHz,16190
1,434003,Fit_Logistic_Regression_Classifier_All_Feature...,jsteinhardt,COMPLETED,2024-04-07T15:19:38,2024-04-07T19:27:29,1,16,smokyquartz,0 days 04:07:51,jsteinhardt,smokyquartz,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986
2,434002,Fit_Logistic_Regression_Classifier_Individual_...,jsteinhardt,COMPLETED,2024-04-07T15:19:30,2024-04-07T22:24:49,1,16,shadowfax,0 days 07:05:19,jsteinhardt,shadowfax,Intel Xeon Silver 4214 CPU @ 2.20GHz,16190
3,435811,Fit_SVM_Classifier_All_Features_PCA,jsteinhardt,COMPLETED,2024-04-07T23:26:10,2024-04-08T02:21:37,1,16,sunstone,0 days 02:55:27,jsteinhardt,sunstone,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986
4,435809,Fit_SVM_Classifier_Individual_Features_PCA,jsteinhardt,COMPLETED,2024-04-07T23:25:50,2024-04-08T07:05:06,1,16,sunstone,0 days 07:39:16,jsteinhardt,sunstone,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986
5,434006,fit_XGBoost_All_data,jsteinhardt,COMPLETED,2024-04-07T15:20:30,2024-04-07T19:15:20,1,16,smokyquartz,0 days 03:54:50,jsteinhardt,smokyquartz,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986
6,434005,fit_XGBoost_All_features_PCA,epurdom,COMPLETED,2024-04-07T15:20:20,2024-04-07T15:55:20,1,16,frodo,0 days 00:35:00,epurdom,frodo,AMD EPYC 7543 32-Core Processor,60132
7,434004,fit_XGBoost_Individual_Features_PCA,epurdom,COMPLETED,2024-04-07T15:20:11,2024-04-07T16:02:43,1,16,frodo,0 days 00:42:32,epurdom,frodo,AMD EPYC 7543 32-Core Processor,60132


## Compute units: elapsed time × number of CPUs × PassMark score

In [25]:
# Compute units = elapsed wall-clock seconds * allocated CPUs * passmark score.
# NOTE(review): 'passmark score' appears to be a per-processor benchmark figure;
# multiplying it by AllocCPUS as well may over-weight the allocation -- confirm
# that this is the intended normalisation.
elapsed_seconds = job_cpu['TimeElapsed'].dt.total_seconds()
job_cpu['Units'] = elapsed_seconds * job_cpu['AllocCPUS'] * job_cpu['passmark score']
job_cpu

Unnamed: 0,JobID,JobName,Partition,State,Start,End,AllocNodes,AllocCPUS,NodeList,TimeElapsed,partition,node,CPU model,passmark score,TimePerPassmark,EstimatedLaptopTime,Units
0,434001,Fit_Logistic_Regression_Classifier_All_Data,jsteinhardt,COMPLETED,2024-04-07T15:19:21,2024-04-08T16:23:21,1,16,shadowfax,1 days 01:04:00,jsteinhardt,shadowfax,Intel Xeon Silver 4214 CPU @ 2.20GHz,16190,0 days 00:00:05.573810994,0 days 15:56:11.236561242,23375770000.0
1,434003,Fit_Logistic_Regression_Classifier_All_Feature...,jsteinhardt,COMPLETED,2024-04-07T15:19:38,2024-04-07T19:27:29,1,16,smokyquartz,0 days 04:07:51,jsteinhardt,smokyquartz,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986,0 days 00:00:00.676384972,0 days 01:56:02.030516796,5231261000.0
2,434002,Fit_Logistic_Regression_Classifier_Individual_...,jsteinhardt,COMPLETED,2024-04-07T15:19:30,2024-04-07T22:24:49,1,16,shadowfax,0 days 07:05:19,jsteinhardt,shadowfax,Intel Xeon Silver 4214 CPU @ 2.20GHz,16190,0 days 00:00:01.576219888,0 days 04:30:24.031307184,6610442000.0
3,435811,Fit_SVM_Classifier_All_Features_PCA,jsteinhardt,COMPLETED,2024-04-07T23:26:10,2024-04-08T02:21:37,1,16,sunstone,0 days 02:55:27,jsteinhardt,sunstone,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986,0 days 00:00:00.478804693,0 days 01:22:08.336705049,3703146000.0
4,435809,Fit_SVM_Classifier_Individual_Features_PCA,jsteinhardt,COMPLETED,2024-04-07T23:25:50,2024-04-08T07:05:06,1,16,sunstone,0 days 07:39:16,jsteinhardt,sunstone,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986,0 days 00:00:01.253343036,0 days 03:35:00.659869548,9693539000.0
5,434006,fit_XGBoost_All_data,jsteinhardt,COMPLETED,2024-04-07T15:20:30,2024-04-07T19:15:20,1,16,smokyquartz,0 days 03:54:50,jsteinhardt,smokyquartz,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986,0 days 00:00:00.640862366,0 days 01:49:56.396333238,4956524000.0
6,434005,fit_XGBoost_All_features_PCA,epurdom,COMPLETED,2024-04-07T15:20:20,2024-04-07T15:55:20,1,16,frodo,0 days 00:35:00,epurdom,frodo,AMD EPYC 7543 32-Core Processor,60132,0 days 00:00:00.034923169,0 days 00:05:59.464178517,2020435000.0
7,434004,fit_XGBoost_Individual_Features_PCA,epurdom,COMPLETED,2024-04-07T15:20:11,2024-04-07T16:02:43,1,16,frodo,0 days 00:42:32,epurdom,frodo,AMD EPYC 7543 32-Core Processor,60132,0 days 00:00:00.042439965,0 days 00:07:16.834559745,2455310000.0


## Rough estimate of the compute time if each job were run on a laptop

Reference laptop CPU: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80 GHz

PassMark score: 10,293

In [28]:
# Passmark score of the reference laptop CPU (11th Gen Intel Core i7-1165G7).
LAPTOP_PASSMARK = 10293
SECONDS_PER_MINUTE = 60
MINUTES_PER_HOUR = 60

# Estimated laptop time = Units / laptop passmark score.
# Units is seconds * AllocCPUS * passmark score, so the result is in seconds
# and still carries the AllocCPUS factor (it is not divided by a laptop core count).
laptop_seconds = job_cpu['Units'] / LAPTOP_PASSMARK
laptop_minutes = laptop_seconds / SECONDS_PER_MINUTE

job_cpu['LaptopTime'] = laptop_seconds                               # seconds
job_cpu['LaptopTime minutes'] = laptop_minutes                       # minutes
job_cpu['LaptopTime hours'] = laptop_minutes / MINUTES_PER_HOUR      # hours
job_cpu

Unnamed: 0,JobID,JobName,Partition,State,Start,End,AllocNodes,AllocCPUS,NodeList,TimeElapsed,partition,node,CPU model,passmark score,TimePerPassmark,EstimatedLaptopTime,Units,LaptopTime,LaptopTime minutes,LaptopTime hours
0,434001,Fit_Logistic_Regression_Classifier_All_Data,jsteinhardt,COMPLETED,2024-04-07T15:19:21,2024-04-08T16:23:21,1,16,shadowfax,1 days 01:04:00,jsteinhardt,shadowfax,Intel Xeon Silver 4214 CPU @ 2.20GHz,16190,0 days 00:00:05.573810994,0 days 15:56:11.236561242,23375770000.0,2271036.0,37850.593607,630.843227
1,434003,Fit_Logistic_Regression_Classifier_All_Feature...,jsteinhardt,COMPLETED,2024-04-07T15:19:38,2024-04-07T19:27:29,1,16,smokyquartz,0 days 04:07:51,jsteinhardt,smokyquartz,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986,0 days 00:00:00.676384972,0 days 01:56:02.030516796,5231261000.0,508234.8,8470.580161,141.176336
2,434002,Fit_Logistic_Regression_Classifier_Individual_...,jsteinhardt,COMPLETED,2024-04-07T15:19:30,2024-04-07T22:24:49,1,16,shadowfax,0 days 07:05:19,jsteinhardt,shadowfax,Intel Xeon Silver 4214 CPU @ 2.20GHz,16190,0 days 00:00:01.576219888,0 days 04:30:24.031307184,6610442000.0,642226.9,10703.782117,178.396369
3,435811,Fit_SVM_Classifier_All_Features_PCA,jsteinhardt,COMPLETED,2024-04-07T23:26:10,2024-04-08T02:21:37,1,16,sunstone,0 days 02:55:27,jsteinhardt,sunstone,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986,0 days 00:00:00.478804693,0 days 01:22:08.336705049,3703146000.0,359773.2,5996.220655,99.937011
4,435809,Fit_SVM_Classifier_Individual_Features_PCA,jsteinhardt,COMPLETED,2024-04-07T23:25:50,2024-04-08T07:05:06,1,16,sunstone,0 days 07:39:16,jsteinhardt,sunstone,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986,0 days 00:00:01.253343036,0 days 03:35:00.659869548,9693539000.0,941760.4,15696.006114,261.600102
5,434006,fit_XGBoost_All_data,jsteinhardt,COMPLETED,2024-04-07T15:20:30,2024-04-07T19:15:20,1,16,smokyquartz,0 days 03:54:50,jsteinhardt,smokyquartz,Intel Xeon Gold 5218 CPU @ 2.30GHz,21986,0 days 00:00:00.640862366,0 days 01:49:56.396333238,4956524000.0,481543.2,8025.719486,133.761991
6,434005,fit_XGBoost_All_features_PCA,epurdom,COMPLETED,2024-04-07T15:20:20,2024-04-07T15:55:20,1,16,frodo,0 days 00:35:00,epurdom,frodo,AMD EPYC 7543 32-Core Processor,60132,0 days 00:00:00.034923169,0 days 00:05:59.464178517,2020435000.0,196292.2,3271.535995,54.5256
7,434004,fit_XGBoost_Individual_Features_PCA,epurdom,COMPLETED,2024-04-07T15:20:11,2024-04-07T16:02:43,1,16,frodo,0 days 00:42:32,epurdom,frodo,AMD EPYC 7543 32-Core Processor,60132,0 days 00:00:00.042439965,0 days 00:07:16.834559745,2455310000.0,238541.7,3975.695171,66.261586
