In [1]:
import h5py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import timeit

### Ingest data

In [2]:
df = pd.read_hdf('accounting-2018-10-deid.h5', 'table')

### EDA

In [3]:
df[df == -200].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4468061 entries, 0 to 4580086
Data columns (total 23 columns):
group              object
owner              object
job_number         float64
submission_time    datetime64[ns]
start_time         datetime64[ns]
end_time           datetime64[ns]
failed             float64
exit_status        float64
granted_pe         object
slots              float64
task_number        float64
maxvmem            float64
h_data             float64
h_rt               float64
highp              float64
exclusive          float64
h_vmem             float64
gpu                float64
pe                 object
slot               float64
wait_time          timedelta64[ns]
wtime              timedelta64[ns]
campus             float64
dtypes: datetime64[ns](3), float64(14), object(4), timedelta64[ns](2)
memory usage: 818.1+ MB


In [4]:
df.shape

(4468061, 23)

In [5]:
df.columns

Index(['group', 'owner', 'job_number', 'submission_time', 'start_time',
       'end_time', 'failed', 'exit_status', 'granted_pe', 'slots',
       'task_number', 'maxvmem', 'h_data', 'h_rt', 'highp', 'exclusive',
       'h_vmem', 'gpu', 'pe', 'slot', 'wait_time', 'wtime', 'campus'],
      dtype='object')

In [6]:
df.columns.shape

(23,)

In [7]:
# dataframe size in memory (differs somewhat from csv/h5 file size)
print(df.memory_usage().sum(), " bytes\n", 
      int(round(df.memory_usage().sum() / 1024)), " KB\n",
      int(round(df.memory_usage().sum() / 1024**2)), " MB", sep = '')

857867712 bytes
837761 KB
818 MB


In [8]:
df.head(3)

Unnamed: 0,group,owner,job_number,submission_time,start_time,end_time,failed,exit_status,granted_pe,slots,...,h_rt,highp,exclusive,h_vmem,gpu,pe,slot,wait_time,wtime,campus
0,g1,u1,3912841,2018-09-29 16:03:49,2018-10-01 07:53:27,2018-10-01 08:00:18,0,0,single,1,...,24.0,0,0,4.0,0,single,1,1 days 15:49:38,00:06:51,1
1,g2,u2,3902779,2018-09-27 21:38:06,2018-10-01 07:24:38,2018-10-01 08:00:42,0,0,single,1,...,6.0,0,0,4.0,0,single,1,3 days 09:46:32,00:36:04,0
2,g1,u1,3912841,2018-09-29 16:03:49,2018-10-01 07:58:42,2018-10-01 08:00:56,0,0,single,1,...,24.0,0,0,4.0,0,single,1,1 days 15:54:53,00:02:14,1


^^^ Output is truncated

**Make it show ALL columns**

In [9]:
# This is supposed to temporarily set options and then unset when exit with statement

with pd.option_context('display.max_rows', -1, 'display.max_columns', 5):
    print(df.head(3))

..   ...   ...  ...        ...    ...
   group owner  ...      wtime campus
0     g1    u1  ...   00:06:51      1
1     g2    u2  ...   00:36:04      0
1     g2    u2  ...   00:36:04      0
2     g1    u1  ...   00:02:14      1

[3 rows x 23 columns]


In [10]:
# # but it throws error when max_columns > 7. WHY ???

# # ValueError: max() arg is an empty sequence

# with pd.option_context('display.max_rows', -1, 'display.max_columns', 23):
#     print(df.head(3))

In [11]:
# Since the above did not work, use this:
# Set to print/output all columns. 
# Setting last for life of kernel or until set again

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', True)
pd.set_option('max_colwidth', -1)

In [12]:
# PRINT does so by wrapping

print(df.head(3))

  group owner  job_number     submission_time          start_time  \
0  g1    u1    3912841    2018-09-29 16:03:49 2018-10-01 07:53:27   
1  g2    u2    3902779    2018-09-27 21:38:06 2018-10-01 07:24:38   
2  g1    u1    3912841    2018-09-29 16:03:49 2018-10-01 07:58:42   

             end_time  failed  exit_status granted_pe  slots  task_number  \
0 2018-10-01 08:00:18  0       0            single     1      1338          
1 2018-10-01 08:00:42  0       0            single     1      55696         
2 2018-10-01 08:00:56  0       0            single     1      1368          

        maxvmem  h_data  h_rt  highp  exclusive  h_vmem  gpu      pe  slot  \
0  4.040196e+09  4.0     24.0  0      0          4.0     0    single  1      
1  6.098657e+08  4.0     6.0   0      0          4.0     0    single  1      
2  3.326935e+09  4.0     24.0  0      0          4.0     0    single  1      

        wait_time    wtime  campus  
0 1 days 15:49:38 00:06:51  1       
1 3 days 09:46:32 00:36:04 

In [13]:
# OUTPUT prints HTML scrollable view

df.head(3)

Unnamed: 0,group,owner,job_number,submission_time,start_time,end_time,failed,exit_status,granted_pe,slots,task_number,maxvmem,h_data,h_rt,highp,exclusive,h_vmem,gpu,pe,slot,wait_time,wtime,campus
0,g1,u1,3912841,2018-09-29 16:03:49,2018-10-01 07:53:27,2018-10-01 08:00:18,0,0,single,1,1338,4040196000.0,4.0,24.0,0,0,4.0,0,single,1,1 days 15:49:38,00:06:51,1
1,g2,u2,3902779,2018-09-27 21:38:06,2018-10-01 07:24:38,2018-10-01 08:00:42,0,0,single,1,55696,609865700.0,4.0,6.0,0,0,4.0,0,single,1,3 days 09:46:32,00:36:04,0
2,g1,u1,3912841,2018-09-29 16:03:49,2018-10-01 07:58:42,2018-10-01 08:00:56,0,0,single,1,1368,3326935000.0,4.0,24.0,0,0,4.0,0,single,1,1 days 15:54:53,00:02:14,1


In [14]:
groups =        df['group'].unique()
users =         df['owner'].unique()
job_numbers =   df['job_number'].unique()
task_numbers =  df['task_number'].unique()
start_times =   df['start_time'].unique()
end_times =     df['end_time'].unique()
faileds =       df['failed'].unique()
exit_statuses = df['exit_status'].unique()
granted_pes =   df['granted_pe'].unique()
slots =         df['slots'].unique()
        
print("FEATURE SET         # UNIQ VALUES IN", df.shape[0], "ROWS", 
      "\ngroups:            ", len(groups), 
      "\nusers:             ", len(users),
      "\njob_numbers:       ", len(job_numbers),
      "\ntask_numbers:      ", len(task_numbers),
      "\nstart_times:       ", len(start_times),
      "\nend_times:         ", len(end_times),
      "\nfaileds:           ", len(faileds),
      "\nexit_statuses:     ", len(exit_statuses),
      "\ngranted_pes:       ", len(granted_pes),
      "\nslots:             ", len(slots)
     )

FEATURE SET         # UNIQ VALUES IN 4468061 ROWS 
groups:             203 
users:              699 
job_numbers:        184688 
task_numbers:       156538 
start_times:        469299 
end_times:          1347025 
faileds:            7 
exit_statuses:      28 
granted_pes:        17 
slots:              43


**Delete columns not needed for assignment**

In [15]:
df = df[['group', 'owner', 'job_number', 'task_number', 'slot', 'slots', 'h_rt']]

In [16]:
# smaller dataframe size in memory 
print(int(round(df.memory_usage().sum() / 1000000)), " MB", sep = '')  # 214 MB
df.head(3)

286 MB


Unnamed: 0,group,owner,job_number,task_number,slot,slots,h_rt
0,g1,u1,3912841,1338,1,1,24.0
1,g2,u2,3902779,55696,1,1,6.0
2,g1,u1,3912841,1368,1,1,24.0


In [17]:
# Useful values

groups =       df['group'].unique()
users =        df['owner'].unique()
job_numbers =  df['job_number'].unique()
task_numbers = df['task_number'].unique()
slot =         df['slot'].unique()
slots =        df['slots'].unique()

print("rows:              ", df.shape[0], 
      "\ncolumns:           ", df.shape[1],
      "\ngroups:            ", len(groups), 
      "\nusers:             ", len(users),
      "\njob_numbers:       ", len(job_numbers),
      "\ntask_numbers:      ", len(task_numbers),
      "\nslot:              ", len(slot),
      "\nslots:             ", len(slots)
     )

rows:               4468061 
columns:            7 
groups:             203 
users:              699 
job_numbers:        184688 
task_numbers:       156538 
slot:               43 
slots:              43


**Findings**: 
- `job_number` is NOT unique on each row. Only 184,688 job numbers for 4,468,061 rows. This means that multiple rows can constitute a single job. See rows 0 and 3.
- There are fewer task numbers than job numbers. This means that some task numbers are repeated for some job numbers.
- Are `slot` and `slots` redundant? If so we can delete one of them.

In [18]:
all(df['slot'] == df['slots'])  # False

False

**Findings**: No, they are not ALL the same, though most of them seem to be.  
- Find differences

In [19]:
for i,j in enumerate(df['slot'] == df['slots']):
    if j == False:  # if slot and slots are DIFFERENT
        print('index', i, ':', 
              'slot =',  df['slot'][i],  '(', type(df['slot'][i]),  '):', 
              'slots =', df['slots'][i], '(', type(df['slots'][i]), ')')
        print()    # index 3839451 : slot = 1 : slots = 1

index 3839451 : slot = 1 ( <class 'numpy.int64'> ): slots = 1 ( <class 'numpy.int64'> )



**Findings**: One index returns False, yet each is a numpy.int64 value of 1. What is different about these?

**Next**: Find difference(s).

In [20]:
print(df['slot'][3839451], type(df['slot'][3839451]),    # 1 <class 'numpy.int64'>
      df['slots'][3839451], type(df['slots'][3839451]))  # 1 <class 'numpy.int64'>

df['slot'][3839451] == df['slots'][3839451]  # True

1 <class 'numpy.int64'> 1 <class 'numpy.int64'>


True

**Findings:**: Huh?

**Next**: Delete this row and continue analysis.

In [21]:
# Before deleting a row by index, first make sure the index values are unique 
# (pandas does not require them to be):
all(df.index.unique() == df.index)

True

In [22]:
# ^^^True. The list of uniq indexes is the same as the list of indexes.
#Therefore there are no duplicate indexes. Since all indexes are unique 
# deleting this index will delete only one row.
df.drop(df.index[[3839451]], inplace=True)

In [23]:
df.shape[0]

4468060

### Analysis

### Assignment 1-1

(1) How many jobs are run by each USER? (e.g. user1: 34, user2: 21, …)

In [25]:
# take a subset of owner/user col and job_num col, which will include many duplicates
# drop all but one of each set

df_jobs_by_user = df.drop_duplicates(subset=['owner', 'job_number'], keep='first')

# print(df_jobs_by_user)  #         group owner  job_number  task_number  slot  slots  h_rt
                         # 0        g1    u1     3912841     1338         1     1      24.000000 
                         # 1        g2    u2     3902779     55696        1     1      6.000000  
                         # 3        g3    u3     3907911     1241         1     1      1.500000
                         # 19       g4    u4     3913291     0            24    24     24.000000 
                         # 21       g5    u5     3914733     0            1     1      24.000000 
                         # 22       g5    u5     3914732     0            1     1      24.000000 
                         # 23       g5    u5     3914735     0            1     1      24.000000 
                         # [184687 rows x 7 columns]

In [26]:
# Group by owner and get the number of rows (.size) for each owner.
# There will be one row for each owner for each job number,
# so if a owner has 2 jobs we want to capture '2'.

df_jobs_by_user = df_jobs_by_user.copy()
print(len(df_jobs_by_user))  # 184687

184687


In [27]:
df_jobs_by_user.groupby(['owner'])['owner'].size()  
# This does NOT update df in-place. Instead it creates a slice.
# owner
# u1      83   
# u10     84   
# u100    11 
# ...
# u99     21   
# Name: owner, Length: 699, dtype: int64

owner
u1      83   
u10     84   
u100    11   
u101    42   
u102    96   
u103    788  
u104    31   
u105    90   
u106    38   
u107    198  
u108    5    
u109    44   
u11     157  
u110    241  
u111    41   
u112    920  
u113    30   
u114    394  
u115    54   
u116    33   
u117    69   
u118    2113 
u119    120  
u12     2780 
u120    256  
u121    46   
u122    113  
u123    71   
u124    17   
u125    37   
        ..   
u72     320  
u73     889  
u74     145  
u75     125  
u76     95   
u77     2054 
u78     127  
u79     50   
u8      499  
u80     205  
u81     3    
u82     72   
u83     30   
u84     52   
u85     24   
u86     93   
u87     289  
u88     47   
u89     408  
u9      58   
u90     376  
u91     17   
u92     52   
u93     41   
u94     11366
u95     223  
u96     17   
u97     9    
u98     9    
u99     21   
Name: owner, Length: 699, dtype: int64

In [28]:
df_jobs_by_user = df_jobs_by_user.groupby(['owner'])['owner'].size() 
print(len(df_jobs_by_user))  # 699

699


In [29]:
df_jobs_by_user
# owner
# u1      83   
# u10     84   
# u100    11   
# u101    42 ...

type(df_jobs_by_user)  # pandas.core.series.Series. THIS IS NOT A DF ANYMORE.
df_jobs_by_user = pd.DataFrame(df_jobs_by_user)
type(df_jobs_by_user)  # it's a df again, but only has 1 col with the owner info as index

pandas.core.frame.DataFrame

In [30]:
# print((df_jobs_by_user.index)[:3], '\n')  # Index(['u1', 'u10', 'u100', ..., 'u99'], 
#                                           # dtype='object', name='owner', length=699)

# print((df_jobs_by_user.columns)[:3], '\n')  # Index(['owner'], dtype='object')

# print((df_jobs_by_user['owner']).head(3), '\n')  # Returns same as the pandas Series 
#                                                  # df_jobs_by_user w/o the index
# # # owner
# # u1      83   
# # u10     84   
# # u100    11   
# # u101    42 ...

# print((df_jobs_by_user[['owner']]).head(3), '\n')  # double brackets
# # Returns a fancier printout:
# # 	owner
# # owner	
# # u1	83
# # u10	84 ...

# # Can query the table to find where a VALUE is True:
# print((df_jobs_by_user[['owner']] == 83).head(3))  # conditional statement
# # owner
# # owner	
# # u1	True  <--
# # u10	False ...

# # ^^^ This returns a boolean mask of the entire df.

# OUTPUT OF THE ABOVE:

# Index(['u1', 'u10', 'u100'], dtype='object', name='owner') 

# Index(['owner'], dtype='object') 

# owner
# u1      83
# u10     84
# u100    11
# Name: owner, dtype: int64 

#        owner
# owner       
# u1     83   
# u10    84   
# u100   11    

#        owner
# owner       
# u1     True 
# u10    False
# u100   False

In [31]:
# # To return **one** of the values
# # or assign a value based on the index string or number:

# print("df_jobs_by_user['owner'][0]  -->", df_jobs_by_user['owner'][0])       # 83
# print("df_jobs_by_user['owner']['u1'] -->", df_jobs_by_user['owner']['u1'])  # 83

# print('df_jobs_by_user.index[0]     -->', df_jobs_by_user.index[0])  # 'u1' Now we're cooking with lajiao. 
# # df_jobs_by_user.index == 'u1'  # array([ True, False, False, ...

# # OUTPUT OF THE ABOVE:

# # df_jobs_by_user['owner'][0]  --> 83
# # df_jobs_by_user['owner']['u1'] --> 83
# # df_jobs_by_user.index[0]     --> u1

In [32]:
# Jobs by user answer:
print(df_jobs_by_user)

#         owner
# owner       
# u1     83   
# u10    84   
# u100   11 ...

       owner
owner       
u1     83   
u10    84   
u100   11   
u101   42   
u102   96   
u103   788  
u104   31   
u105   90   
u106   38   
u107   198  
u108   5    
u109   44   
u11    157  
u110   241  
u111   41   
u112   920  
u113   30   
u114   394  
u115   54   
u116   33   
u117   69   
u118   2113 
u119   120  
u12    2780 
u120   256  
u121   46   
u122   113  
u123   71   
u124   17   
u125   37   
...    ..   
u72    320  
u73    889  
u74    145  
u75    125  
u76    95   
u77    2054 
u78    127  
u79    50   
u8     499  
u80    205  
u81    3    
u82    72   
u83    30   
u84    52   
u85    24   
u86    93   
u87    289  
u88    47   
u89    408  
u9     58   
u90    376  
u91    17   
u92    52   
u93    41   
u94    11366
u95    223  
u96    17   
u97    9    
u98    9    
u99    21   

[699 rows x 1 columns]


**(2) How many jobs are run by each GROUP?** (e.g. group1: 324, group2: 24, ..)

In [33]:
df_jobs_by_group = df.drop_duplicates(subset=['group', 'job_number'], keep='first')  # drop dupes
df_jobs_by_group = df_jobs_by_group.copy()  # copy to avoid assigning to a slice
df_jobs_by_group = df_jobs_by_group.groupby(['group'])['group'].size()  # Converts df to a series
df_jobs_by_group = pd.DataFrame(df_jobs_by_group)  # convert back to a Dataframe
df_jobs_by_group.head()
# group
# g1      85  
# g10     295 
# g100    85  
# g101    136 
# g102    6  
# Name: group, Length: 203, dtype: int64

Unnamed: 0_level_0,group
group,Unnamed: 1_level_1
g1,85
g10,295
g100,85
g101,136
g102,6


Can a job number be associated with more than one user or group?

In [36]:
# # look at users by job number instead of job numbers by user
users_by_job_number = df.groupby(['job_number', 'owner'])['job_number', 'owner']     

In [37]:
type(users_by_job_number)

pandas.core.groupby.DataFrameGroupBy

In [40]:
users_by_job_number

<pandas.core.groupby.DataFrameGroupBy object at 0x000002B00AC8B780>

In [39]:
for i,j in enumerate(users_by_job_number):
    if i < 5:
        print(j)

((130207, 'u468'),         group owner  job_number  task_number  slot  slots  h_rt
1135622  g2    u468  130207      0            1     1      4.0 )
((130254, 'u468'),         group owner  job_number  task_number  slot  slots  h_rt
1135684  g2    u468  130254      0            1     1      4.0 )
((130362, 'u468'),         group owner  job_number  task_number  slot  slots  h_rt
1135918  g2    u468  130362      0            1     1      2.0 )
((130454, 'u468'),         group owner  job_number  task_number  slot  slots  h_rt
1135641  g2    u468  130454      0            2     2      2.0 )
((2946857, 'u49'),        group owner  job_number  task_number  slot  slots   h_rt
4696    g7    u49   2946857     938          1     1      336.0
8908    g7    u49   2946857     940          1     1      336.0
9494    g7    u49   2946857     941          1     1      336.0
10372   g7    u49   2946857     933          1     1      336.0
11048   g7    u49   2946857     951          1     1      336.0
12613

In [None]:
# 0          u1  
# 1          u2  
# 2          u1 ...
# 4580078    u409
# Name: owner, Length: 233008, dtype: object

# 233008 is a strange len. It does not approximate any of the values I would expect:
# groups:             203 
# users:              699 
# job_numbers:        184688 
# task_numbers:       156538 

**How get rid of "Name: owner, Length: 233008, dtype: object" text at end of pandas printouts???**

In [44]:
x = pd.Series(users_by_job_number)
x.head()
# 0    ((130207, u468), [u468])                                                                                                                       
# 1    ((130254, u468), [u468])                                                                                                                       
# 2    ((130362, u468), [u468])                                                                                                                       
# 3    ((130454, u468), [u468])                                                                                                                       
# 4    ((2946857, u49), [u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49, u49])
# dtype: object

StopIteration: 

In [43]:
# ^^^ IT'S MISSING ALL THE INDEX VALUES FOR THE LISTS.

x = pd.DataFrame(users_by_job_number)  # ValueError: DataFrame constructor not properly called!

print(x.to_string(header=None, index=None))

ValueError: DataFrame constructor not properly called!

In [None]:
# As last resort, convert to list if can't figure out how object structure
# of SeriesGroupBy
users_by_job_number_SHORT = list(users_by_job_number)  
type(users_by_job_number_SHORT)  # list
users_by_job_number_SHORT[:5]

# users_by_job_number = pd.Series(users_by_job_number_SHORT)   # taking long time
# users_by_job_number = pd.DataFrame(users_by_job_number)  # taking long time

In [65]:
for i,j in enumerate(users_by_job_number):
    if i < 5:   # during development just process first few elements
        print('i :', i, '\nj :', j)  
                    # i : 0 
                    # j : ((130207, 'u468'), 1135622    u468
                    # Name: owner, dtype: object)
                    # i : 1 
                    # j : ((130254, 'u468'), 1135684    u468
                    # Name: owner, dtype: object)
                    # i : 2 
                    # j : ((130362, 'u468'), 1135918    u468
                    # Name: owner, dtype: object)
                    # i : 3 
                    # j : ((130454, 'u468'), 1135641    u468
                    # Name: owner, dtype: object)
                    # i : 4 
                    # j : ((2946857, 'u49'), 4696      u49
                    # 8908      u49
                    # 9494      u49
                    # 10372     u49
                    # 11048     u49
                    # 12613     u49
                    # 22427     u49
                    # 24999     u49
                    # 29856     u49
                    # 29871     u49
                    # 41455     u49
                    # 42068     u49
                    # 46821     u49
                    # 47045     u49
                    # 48097     u49
                    # 50461     u49
                    # 53510     u49
                    # 57404     u49
                    # 59820     u49
                    # 61092     u49
                    # 61248     u49
                    # 75935     u49
                    # 87112     u49
                    # 93150     u49
                    # 110248    u49
                    # Name: owner, dtype: object)
            
#         for k,l in enumerate(j):  # iterate over 1 or more items in j
#             # first item, j[k] when k = 0, is always a tuple with 2 elements
#             if k == 0:
#                 # print
#                 print('job', j[k][0], ': user', j[k][1])
#                 print('k ==', k, ' : "j[k]"', j[k])
#                 print('k ==', k, ' : "l[k]"', l[k])
#                 print('k ==', k, ' : "l":', l)
                
#             # any further items are always in a Series with 2 elements per line
#             for k in range(1, len(j)):
#                 # print each
#                 print('k ==', k, ' : "j[k]"', j[k])
#                 print('k ==', k, ' : "l[k]"', l[k])
#                 print('k ==', k, ' : "l":', l)
#                 print('k ==', k, "l['1135622'].index", l['1135622'].index)
#                 print('k ==', k, 'job', j[k].index[0][0])
#                 print('k ==', k, ': user', j[k].index[0][1])
            

#         print(j[1])
#         print(type(j))     # tuple
#         print(type(j[0]))  # tuple
#         print(type(j[1]))  # pandas.core.series.Series
        
#         print(j[1].index[0])

i : 0 
j : ((130207, 'u468'), 1135622    u468
Name: owner, dtype: object)
i : 1 
j : ((130254, 'u468'), 1135684    u468
Name: owner, dtype: object)
i : 2 
j : ((130362, 'u468'), 1135918    u468
Name: owner, dtype: object)
i : 3 
j : ((130454, 'u468'), 1135641    u468
Name: owner, dtype: object)
i : 4 
j : ((2946857, 'u49'), 4696      u49
8908      u49
9494      u49
10372     u49
11048     u49
12613     u49
22427     u49
24999     u49
29856     u49
29871     u49
41455     u49
42068     u49
46821     u49
47045     u49
48097     u49
50461     u49
53510     u49
57404     u49
59820     u49
61092     u49
61248     u49
75935     u49
87112     u49
93150     u49
110248    u49
Name: owner, dtype: object)


**(3) For each group, list the number of users, and list each groups' users (i.e. the users having the same “group” field). **

For example: group 1 (4 users): user2, user32, user41, user56

In [None]:
# define LIST to hold (group,user) TUPLES, e.g., [(g1,u1),(g1,u49),(g2,u33),...]
group_users = []

# remove unneeded cols in df
df2 = df[['group', 'owner']]

# create LIST of all **unique** group-user TUPLES (weed out duplicates)
for index, row in df2.iterrows():
    # check that tuple doesn't already exist in the list
    if not (row['group'], row['owner']) in group_users:  
        group_users.append((row['group'], row['owner']))

# print(len(group_users), sorted(group_users))

```700 [('g1', 'u1'), ('g1', 'u145'), ('g10', 'u101'), ('g10', 'u11'), ('g10', 'u493'), ('g10', 'u535'), ('g10', 'u692'), ('g100', 'u252'), ('g101', 'u260'), ('g101', 'u547'), ('g102', 'u261'), ('g103', 'u263'), ('g104', 'u265'), ('g104', 'u335'), ('g104', 'u396'), ('g104', 'u438'), ('g104', 'u531'), ('g105', 'u266'), ('g105', 'u484'), ('g106', 'u267'), ('g106', 'u691'), ('g106', 'u696'), ('g107', 'u271'), ('g107', 'u300'), ('g108', 'u283'), ('g109', 'u286'), ('g109', 'u478'), ('g109', 'u491'), ('g11', 'u12'), ('g11', 'u52'), ('g110', 'u287'), ('g110', 'u502'), ('g111', 'u292'), ('g111', 'u327'), ('g112', 'u293'), ('g113', 'u294'), ('g114', 'u295'), . . . ]```

In [None]:
# create dict where k:v is group#:[all_users], e.g.:
# {'g1': ['u1', 'u145'], 'g2': ['u2', 'u77', 'u154', 'u187', 'u210', 'u274', 
# 'u276', 'u285', 'u367', 'u420', 'u468'], 'g3': ['u3',...}

d = {}

for t in group_users:  # t = group-user tuple, e.g., (1, 3)
    if not t[0] in d.keys():       # key is NOT in dict ...
        d.update({t[0]: []})           # (empty list 1st bc otherwise lists chars of str)
        d[t[0]].append(str(t[1]))  # ... so add it with value.
    elif not t[1] in d[t[0]]:      # key IS in dict but t[1] is not in value list ...
        d[t[0]].append(str(t[1]))  # ... so append it to value list.

# print(d)

```{'g1': ['u1', 'u145'], 'g2': ['u2', 'u77', 'u154', 'u187', 'u210', 'u274', 'u276', 'u285', 'u367', 'u420', 'u468', 'u476', 'u580', 'u633', 'u646', 'u659'], 'g3': ['u3', 'u626'], 'g4': ['u4', 'u15', 'u33', 'u53', 'u59', 'u62', 'u69', 'u99', 'u417'], 'g5': ['u5', 'u23', 'u47', 'u149', 'u336', 'u664'], 'g6': ['u6', 'u7', 'u48', 'u84', 'u87', 'u103', 'u164', 'u182', 'u200', 'u217', 'u218', 'u230', 'u243', 'u254', 'u255', 'u257', 'u323', 'u362', 'u409', 'u422', 'u432', 'u436', 'u503', 'u539', 'u559', 'u563', 'u589', 'u590', 'u597', 'u600', 'u614', 'u638', 'u657', 'u666', 'u675', 'u689'], 'g7': ['u8', 'u14', . . . 'g203': ['u698']}```

In [None]:
# Print: example: group 1 (4 users): user2, user32, user41, user56

for k,v in d.items():
    if len(v) == 1:  # use singular "user"
        print('group ', k[1:], ' (', len(v), ' user): ', ', '.join(v), sep = '')
    else:            # plural "users"
        print('group ', k[1:], ' (', len(v), ' users): ', ', '.join(v), sep = '')

```group 1 (2 users): u1, u145
group 2 (16 users): u2, u77, u154, u187, u210, u274, u276, u285, u367, u420, u468, u476, u580, u633, u646, u659
group 3 (2 users): u3, u626
group 4 (9 users): u4, u15, u33, u53, u59, u62, u69, u99, u417
group 5 (6 users): u5, u23, u47, u149, u336, u664
group 6 (36 users): u6, u7, u48, u84, u87, u103, u164, u182, u200, u217, u218, u230, u243, u254, u255, u257, u323, u362, u409, u422, u432, u436, u503, u539, u559, u563, u589, u590, u597, u600, u614, u638, u657, u666, u675, u689
group 7 (70 users): u8, u14, u16, u20, u36, u39, u44, u49, u54, u70, u96, u107, u132, u138, u143, u146, u152, u158, u160, u186, u190, u191, u194, u195, u232, u250, u251, u258, u270, u301, u303, u320, u334, u341, u356, u357, u361, u364, u381, u383, u384, u388, u413, u455, u459, u460, u475, u487, u490, u516, u533, u556, u558, u592, u605, u619, u621, u623, u627, u637, u648, u651, u654, u665, u672, u673, u676, u679, u693, u699
group 8 (5 users): u9, u450, u519, u582, u618
group 9 (17 users): u10, u93, u104, u131, u133, u389, u390, u391, u479, u480, u482, u514, u593, u653, u660, u661, u686
group 10 (5 users): u11, u101, u493, u535, u692
group 11 (2 users): u12, u52
group 12 (11 users): u13, u19, u192, u240, u346, u354, u392, u529, u578, u584, u669
group 13 (3 users): u17, u67, u281
group 14 (5 users): u18, u42, u106, u123, u554
group 15 (2 users): u21, u448
group 16 (2 users): u22, u284
group 17 (12 users): u24, u66, u118, u177, u238, u272, u371, u395, u431, u481, u494, u613
group 18 (8 users): u25, u27, u40, u41, u428, u492, u505, u678
group 19 (3 users): u26, u326, u419
group 20 (4 users): u28, u242, u348, u414
group 21 (7 users): u29, u56, u82, u280, u430, u464, u687
group 22 (9 users): u30, u57, u162, u179, u291, u342, u585, u588, u697
group 23 (2 users): u31, u151
group 24 (46 users): u32, u60, u86, u97, u108, u109, u111, u112, u121, u134, u141, u157, u202, u231, u233, u237, u239, u249, u253, u264, u268, u269, u275, u307, u309, u315, u319, u332, u333, u360, u366, u440, u461, u520, u534, u549, u586, u603, u462, u631, u632, u635, u668, u671, u674, u690
group 25 (8 users): u34, u71, u127, u290, u403, u483, u485, u684
group 26 (3 users): u35, u222, u358
group 27 (2 users): u37, u581
group 28 (3 users): u38, u51, u205
group 29 (5 users): u43, u163, u214, u441, u528
group 30 (2 users): u45, u387
group 31 (9 users): u46, u58, u63, u116, u130, u209, u211, u246, u344
group 32 (5 users): u50, u273, u298, u474, u541
group 33 (2 users): u55, u288
group 34 (13 users): u61, u114, u126, u144, u282, u297, u379, u452, u510, u511, u532, u601, u607
group 35 (7 users): u64, u119, u122, u148, u337, u415, u544
group 36 (2 users): u65, u155
group 37 (6 users): u68, u74, u139, u234, u247, u599
group 38 (1 user): u72
group 39 (3 users): u73, u153, u650
group 40 (3 users): u75, u256, u466
group 41 (1 user): u76
group 42 (3 users): u78, u180, u685
group 43 (4 users): u79, u221, u411, u465
group 44 (5 users): u80, u159, u248, u369, u630
group 45 (2 users): u81, u219
group 46 (12 users): u83, u136, u165, u168, u174, u393, u454, u497, u508, u522, u566, u655
group 47 (1 user): u85
group 48 (2 users): u88, u351
group 49 (1 user): u89
group 50 (5 users): u90, u110, u331, u496, u553
group 51 (4 users): u91, u166, u628, u688
group 52 (9 users): u92, u117, u188, u212, u262, u378, u488, u501, u645
group 53 (5 users): u94, u178, u227, u279, u680
group 54 (1 user): u95
group 55 (3 users): u98, u429, u571
group 56 (3 users): u100, u615, u652
group 57 (9 users): u102, u184, u216, u244, u278, u458, u512, u521, u524
group 58 (1 user): u105
group 59 (2 users): u113, u394
group 60 (4 users): u115, u135, u161, u604
group 61 (2 users): u120, u469
group 62 (1 user): u124
group 63 (6 users): u125, u137, u172, u203, u228, u507
group 64 (4 users): u128, u353, u382, u513
group 65 (2 users): u129, u641
group 66 (3 users): u140, u506, u616
group 67 (1 user): u142
group 68 (4 users): u147, u372, u446, u451
group 69 (3 users): u150, u156, u486
group 70 (1 user): u167
group 71 (3 users): u169, u339, u340
group 72 (6 users): u170, u259, u277, u435, u463, u602
group 73 (1 user): u171
group 74 (1 user): u173
group 75 (3 users): u175, u206, u608
group 76 (3 users): u176, u610, u611
group 77 (8 users): u181, u321, u397, u398, u399, u400, u401, u402
group 78 (4 users): u183, u355, u425, u426
group 79 (3 users): u185, u385, u517
group 80 (1 user): u189
group 81 (3 users): u193, u418, u537
group 82 (1 user): u196
group 83 (2 users): u197, u408
group 84 (4 users): u198, u365, u579, u609
group 85 (3 users): u199, u515, u695
group 86 (1 user): u201
group 87 (2 users): u204, u359
group 88 (4 users): u207, u289, u587, u594
group 89 (6 users): u208, u245, u410, u427, u629, u670
group 90 (1 user): u213
group 91 (1 user): u215
group 92 (2 users): u220, u617
group 93 (1 user): u223
group 94 (3 users): u224, u406, u551
group 95 (1 user): u225
group 96 (1 user): u226
group 97 (3 users): u229, u373, u622
group 98 (2 users): u235, u236
group 99 (5 users): u241, u421, u498, u538, u639
group 100 (1 user): u252
group 101 (2 users): u260, u547
group 102 (1 user): u261
group 103 (1 user): u263
group 104 (5 users): u265, u335, u396, u438, u531
group 105 (2 users): u266, u484
group 106 (3 users): u267, u691, u696
group 107 (2 users): u271, u300
group 108 (1 user): u283
group 109 (3 users): u286, u478, u491
group 110 (2 users): u287, u502
group 111 (2 users): u292, u327
group 112 (1 user): u293
group 113 (1 user): u294
group 114 (1 user): u295
group 115 (1 user): u296
group 116 (1 user): u299
group 117 (2 users): u302, u380
group 118 (2 users): u304, u312
group 119 (4 users): u305, u509, u525, u527
group 120 (6 users): u306, u316, u317, u322, u328, u570
group 121 (8 users): u308, u329, u330, u347, u462, u567, u574, u595
group 122 (1 user): u310
group 123 (1 user): u311
group 124 (1 user): u313
group 125 (1 user): u314
group 126 (2 users): u318, u443
group 127 (1 user): u324
group 128 (1 user): u325
group 129 (2 users): u338, u518
group 130 (1 user): u343
group 131 (2 users): u345, u471
group 132 (2 users): u349, u376
group 133 (1 user): u350
group 134 (3 users): u352, u467, u470
group 135 (1 user): u363
group 136 (4 users): u368, u550, u612, u643
group 137 (1 user): u370
group 138 (2 users): u374, u548
group 139 (1 user): u375
group 140 (1 user): u377
group 141 (1 user): u386
group 142 (1 user): u404
group 143 (1 user): u405
group 144 (1 user): u407
group 145 (1 user): u412
group 146 (1 user): u416
group 147 (2 users): u423, u530
group 148 (1 user): u424
group 149 (1 user): u433
group 150 (1 user): u434
group 151 (1 user): u437
group 152 (1 user): u439
group 153 (1 user): u442
group 154 (1 user): u444
group 155 (4 users): u445, u489, u495, u644
group 156 (2 users): u447, u681
group 157 (1 user): u449
group 158 (1 user): u453
group 159 (2 users): u456, u591
group 160 (1 user): u457
group 161 (1 user): u472
group 162 (1 user): u473
group 163 (1 user): u477
group 164 (3 users): u499, u540, u577
group 165 (4 users): u500, u543, u598, u649
group 166 (1 user): u504
group 167 (1 user): u523
group 168 (2 users): u526, u625
group 169 (1 user): u536
group 170 (1 user): u542
group 171 (1 user): u545
group 172 (1 user): u546
group 173 (1 user): u552
group 174 (1 user): u555
group 175 (1 user): u557
group 176 (1 user): u560
group 177 (1 user): u561
group 178 (1 user): u562
group 179 (1 user): u564
group 180 (1 user): u565
group 181 (1 user): u568
group 182 (2 users): u569, u642
group 183 (1 user): u572
group 184 (1 user): u573
group 185 (2 users): u575, u694
group 186 (1 user): u576
group 187 (1 user): u583
group 188 (1 user): u596
group 189 (1 user): u606
group 190 (1 user): u620
group 191 (1 user): u624
group 192 (1 user): u634
group 193 (1 user): u636
group 194 (1 user): u640
group 195 (2 users): u647, u656
group 196 (1 user): u658
group 197 (1 user): u662
group 198 (1 user): u663
group 199 (1 user): u667
group 200 (1 user): u677
group 201 (1 user): u682
group 202 (1 user): u683
group 203 (1 user): u698```

### Assignment 1-2
Compute the CPU-hours, defined as (h\_rt)*(slots), used by each group by generating a diagram similar to the following.

Sort the results in decreasing order. Your plot should look like the following in which the labels of the vertical axis are the names of the groups and the labels of the horizontal axis are the CPU-hour values:

In [None]:
df['cpu_hours_by_group'] = df.h_rt * df.slots
# df.head(2)

In [None]:
grouped = df.groupby(['group'])['cpu_hours_by_group'].sum()
type(grouped)

In [None]:
grouped  # g1  2.377116e+06, g10 3.301120e+06, . . .

In [None]:
grouped = pd.DataFrame(grouped)
type(grouped)

In [None]:
round(grouped, 0)

In [None]:
# sort grouped df on values in cpu_hours_by_group column 
grouped = grouped.sort_values('cpu_hours_by_group', ascending = False)
# grouped  #g4 4.406948e+08, g9 3.747326e+07, ...

In [None]:
grouped.columns

In [None]:
# # plot "grouped" df

# fig, ax = plt.subplots()
# # fig = plt.figure(figsize=(8,6))
# width, height = 8, 26
# plt.figure(figsize=(width, height), dpi=80)  # it's not plotting big enough

# # %pylab inline
# # pylab.rcParams['figure.figsize'] = (10, 6)  # width x height in inches
# # plt.tight_layout()

# # # Settings for y tick labels
# # fontdict = {'fontsize': rcParams['axes.titlesize'],  # make smaller
# #         'fontweight': rcParams['axes.titleweight'],  # make lighter
# #         'verticalalignment': 'baseline',
# #         'horizontalalignment': loc}
# # # validating functions are defined and associated with rc parameters in
# # # :mod:`matplotlib.rcsetup`

# ax.barh(grouped.index, 
#         grouped.cpu_hours_by_group, 
#         color='green', 
#         ecolor='black')
# # ax.set_yticks(grouped.y_pos)
# ax.set_yticklabels(grouped.index)
# ax.invert_yaxis()  # labels read top-to-bottom
# ax.set_xlabel('CPU-hours')
# ax.set_ylabel('Group label')
# ax.set_title('CPU-hours by group')

# # # plot it on a log scale

# plt.xscale('log')

# # # plt.subplot(222)
# # plt.semilogx(t, np.sin(2*np.pi*t))
# # # plt.title('semilogx')
# # # plt.grid(True)

# plt.show()

In [None]:
# # Create a new figure of size 8x6 points (width x height), using 80 dots per inch
# # Creates a WxH ratio that constrains itself in Jupyter
# plt.figure(figsize=(8,46), dpi=80)  

# # Create a new subplot from a grid of 1x1
# plt.subplot(111)  # subplot grid parameter encoded as a single integer. 
#                   # Alternative form is subplot(1, 1, 1).
#                   # "111" means "1x1 grid, first subplot" and 
#                   # "234" means "2x3 grid, 4th subplot".

# x = grouped.cpu_hours_by_group
# y = grouped.index

# # Plot x,y using blue color with a continuous line of width 1 (pixels)
# plt.plot(x, y, color="blue", linewidth=1.0, linestyle="-")

# # Set x limits
# min = 0
# max = int(round(max(grouped.cpu_hours_by_group) * 1.02))  # TypeError: 'int' object is not callable
# plt.xlim(min, max)

# # Set x ticks
# plt.xticks(np.linspace(min, max, 10, endpoint=True))

# # Set y limits
# min = 0
# max = len(grouped.index)
# plt.ylim(min, max)

# # Set y ticks
# plt.yticks(np.linspace(min, max + 1, max + 1, endpoint=True))

# # Save figure using 72 dots per inch
# # savefig("cpu_hrs_by_grp.png", dpi=72)

# # Show result on screen
# plt.show()

In [None]:
# LATEST

# need to learn pyplot interface

# Create a new figure of size 8x6 points (width x height), using 80 dots per inch
# Creates a WxH ratio that constrains itself in Jupyter
plt.figure(figsize=(8,46), dpi=80)  

# Create a new subplot from a grid of 1x1
plt.subplot(111)  # subplot grid parameter encoded as a single integer. 
                  # Alternative form is subplot(1, 1, 1).
                  # "111" means "1x1 grid, first subplot" and 
                  # "234" means "2x3 grid, 4th subplot".

x = grouped.cpu_hours_by_group
y = grouped.index

plt.barh(y, x, color='green', ecolor='black')
# ax.set_yticks(grouped.y_pos)
plt.set_yticklabels(grouped.index, fontsize=8)  # AttributeError: module 'matplotlib.pyplot' 
                                    # has no attribute 'set_yticklabels'
plt.invert_yaxis()  # labels read top-to-bottom
plt.set_xlabel('CPU-hours')
plt.set_ylabel('Group label')
plt.set_title('CPU-hours by group')

# Set x limits
min = 0
max = int(round(max(grouped.cpu_hours_by_group) * 1.02))  # TypeError: 'int' object is not callable
plt.xlim(min, max)

# Set x ticks
plt.xticks(np.linspace(min, max, 10, endpoint=True))

# Set y limits
min = 0
max = len(grouped.index)
plt.ylim(min, max)

# Set y ticks
plt.yticks(np.linspace(min, max + 1, max + 1, endpoint=True))

# Save figure using 72 dots per inch
# savefig("cpu_hrs_by_grp.png", dpi=72)

# Show result on screen
plt.show()

### Assignment 1-3
Compute the total number of jobs corresponding to each “slot” number by generating a plot like the following one.

Note that the number of jobs corresponding to “slot=1” is much larger so we will not display it in order to avoid distorting the results. At the end, you should get a diagram similar to the following one:

In [None]:
# add column to count number of jobs per row (always 1)

df['jobs_by_slot'] = 1

The "slot" column is the slot label (1, 2, 3, etc.). This is different from the "slots" column above, which is XXX.

In [None]:
df.groupby(['slot'])['jobs_by_slot'].sum() == df.groupby(['slot'])['slots'].sum()

In [None]:
# created a new df "grouped" that groups by slot field and sums jobs_by_slot
grouped = df.groupby(['slot'])['jobs_by_slot'].sum()
grouped = pd.DataFrame(grouped)
grouped.drop([1])  # drop row named 1 (not row at index 1)

In [None]:
grouped.index

In [None]:
# plot from "grouped" df

fig, ax = plt.subplots()
# fig = plt.figure(figsize=(8,6))
width, height = 8, 26
plt.figure(figsize=(width, height))  # it's not plotting big enough

ax.barh(grouped.index, grouped.jobs_by_slot, 
        color='green', ecolor='black')
# ax.set_yticks(grouped.y_pos)
ax.set_yticklabels(grouped.index)
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_ylabel('Slot labels')
ax.set_xlabel('Number of jobs')
ax.set_title('Jobs by slot')

plt.show()

**Iteration vs. vectorization**  
Post on iterative processes vs. vectorized processes:

https://stackoverflow.com/questions/54028199/for-loops-with-pandas-when-should-i-care/54028200#54028200

# STUFF

In [None]:
# # In an earlier version of the code the following was true. Now it is not:
# # I didn't need to create df_jobs_by_group. As proof:
# all(df_jobs_by_user.groupby(['group'])['group'].size() == 
#     df_jobs_by_group.groupby(['group'])['group'].size())   # True

# # df_jobs_by_user would have worked also.

# # I think this is because user and group are related and user is LESS CONSTRAINED
# # (there are more users than groups).

# # I don't think it would have worked if I had created df_jobs_by_group
# # first and then tried to use it for jobs by user. Reason: All jobs related to all
# # groups still appear in the longer (699) jobs by user list, but not all jobs related to all
# # users appear in the shorter (203) jobs by group list.

# # As a test of this hypothesis:

# df_jobs_by_group.groupby(['owner'])['owner'].size()

# all(df_jobs_by_user.groupby(['owner'])['owner'].size() == 
#     df_jobs_by_group.groupby(['owner'])['owner'].size())

# # Hmm, this is also True. Ah, the df is not 203 and 699 rows. It is the len of
# # the number of job numbers, 184 thousand some. Still, this shows I didn't need
# # to create a new df just for the jobs by group analysis.

In [None]:
# # Add a 'num_of_jobs' col to the df to capture these values.
# df_jobs_by_user.loc[:,('num_of_jobs')] = (df_jobs_by_user
#                                           .groupby(['owner'])['owner'].size())  # FINALLY!

# df_jobs_by_user  # producing NaNs :(

In [None]:
# # In an earlier version of the code the following was true. Now it is not:
# # I didn't need to create df_jobs_by_group. As proof:
# all(df_jobs_by_user.groupby(['group'])['group'].size() == 
#     df_jobs_by_group.groupby(['group'])['group'].size())   # True

# # df_jobs_by_user would have worked also.

# # I think this is because user and group are related and user is LESS CONSTRAINED
# # (there are more users than groups).

# # I don't think it would have worked if I had created df_jobs_by_group
# # first and then tried to use it for jobs by user. Reason: All jobs related to all
# # groups still appear in the longer (699) jobs by user list, but not all jobs related to all
# # users appear in the shorter (203) jobs by group list.

# # As a test of this hypothesis:

# df_jobs_by_group.groupby(['owner'])['owner'].size()

# all(df_jobs_by_user.groupby(['owner'])['owner'].size() == 
#     df_jobs_by_group.groupby(['owner'])['owner'].size())

# # Hmm, this is also True. Ah, the df is not 203 and 699 rows. It is the len of
# # the number of job numbers, 184 thousand some. Still, this shows I didn't need
# # to create a new df just for the jobs by group analysis.

In [None]:
# # sum of 3rd column is num of rows in orig df
# print(users_by_job_number.count().sum())  # 4468061
# print()

# # Returns user, uniq job numbers, and number of rows in which each uniq job number appears
# # So the len of output is 184688 (num of uniq job nums) and 

# print(users_by_job_number.count())  # job_number  owner
#                                     # 130207      u468        1
#                                     # 130254      u468        1
#                                     # 130362      u468        1
#                                     # 130454      u468        1
#                                     # 2946857     u49        25
#                                     # 3339208     u417        2 ...
#                                     # 3413360     u31       740
#                                     # 3586518     u45        80
#                                     # 3611822     u417        2
#                                     # 3639910     u31      1028
#                                     # Name: owner, Length: 184687, dtype: int64

In [None]:
# job_numbers_by_user = df.groupby(['owner','job_number'])['job_number']

In [None]:
# # Returns user (owner), uniq job numbers, and number of rows in which each job number appears
# # So the len of output is 184687 (num of uniq job nums) 

# print(job_numbers_by_user.count())  # owner  job_number
# #                                     # u1     3912841       1645
# #                                     #        3912842       3000  ...
# #                                     # Name: job_number, Length: 184687, dtype: int64

In [None]:
# df.groupby(['owner','job_number'])['job_number'].count()  # same as above

In [None]:
# # different method, same result

# df.groupby(['owner', 'job_number']).size()  # owner  job_number
#                                             # u1     3912841       1645
#                                             #        3912842       3000 ...
#                                             # Length: 184687, dtype: int64

In [None]:
# all(df.groupby(['owner', 'job_number']).size() == 
#     df.groupby(['owner','job_number'])['job_number'].count())  # True

In [None]:
# # sum of 3rd column above is 4468060 (num of rows in df)
# print(job_numbers_by_user.count().sum())  # 4468060

In [None]:
# # ngroup : Number each group from 0 to the number of groups - 1.

# # This is the enumerative complement of cumcount. Note that the numbers given 
# # to the groups match the order in which the groups would be seen when iterating 
# # over the groupby object, not the order they are first observed.

# # But what does it tell us ??? 

# # Since cumcount numbers the rows in each group, the "enumerative complement"
# # must be the index of the row at which each group is first observed. ???

# print(job_numbers_by_user.ngroup())   # 0               0
# #                                     # 1           64629  <-- 1 less than w full df
# #                                     # 2               0
# #                                     # 3           92437  <-- 1 less than w full df
# #                                     # 4           92437  <-- 1 less than w full df
# #                                     # ...
# #                                     # Length: 4468060, dtype: int64

In [None]:
# # nunique returns len uniq job_number

# print(job_numbers_by_user.nunique())    # owner  job_number
# #                                         # u1     3912841       1
# #                                         #        3912842       1 ...
# #                                         # Name: job_number, Length: 184688, dtype: int64

In [None]:
# job_numbers_by_user.nunique().sum()  # 184687

Ways to count by keys

In [None]:
# # df.groupby('key1')['key2'].apply(lambda x: x[x == 'one'].count())

# # Option 1

# df.set_index('key1').key2.eq('one').sum(level=0).astype(int).reset_index()

# #   key1  key2
# # 0    a     2
# # 1    b     1
# # 2    c     0

# # Option 2

# df.key2.eq('one').groupby(df.key1).sum().astype(int).reset_index()

# #   key1  key2
# # 0    a     2
# # 1    b     1
# # 2    c     0

# # Option 3

# f, u = df.key1.factorize()
# pd.DataFrame(dict(key1=u, key2=np.bincount(f, df.key2.eq('one')).astype(int)))

# #   key1  key2
# # 0    a     2
# # 1    b     1
# # 2    c     0

# # Option 4

# pd.crosstab(df.key1, df.key2.eq('one'))[True].rename('key2').reset_index()

# #   key1  key2
# # 0    a     2
# # 1    b     1
# # 2    c     0

# # Option 5

# pd.get_dummies(df.key1).mul(df.key2.eq('one'), 0).sum().rename_axis('key1').reset_index(name='key2')

# #   key1  key2
# # 0    a     2
# # 1    b     1
# # 2    c     0

In [None]:
# df.groupby(['owner', 'job_number'])['job_number'] == '2946857'

In [None]:
# # ngroup returns len df

# # print(users_by_job_number.ngroup()) # 0               0
# # #                                     # 1           64630
# # #                                     # 2               0
# # #                                     # 3           92438
# # #                                     # 4           92438 ...
# # #                                     # Length: 4468061, dtype: int64  
                    
# df.groupby(['job_number', 'owner'])['owner'].ngroup()

# # What changed? Now it outputs:
#                                         # 0          657   
#                                         # 1          571   
#                                         # 2          657   
#                                         # 3          596   
#                                         # 4          596 

In [None]:
# # nunique returns len uniq job_number

# print(users_by_job_number.nunique())    # job_number  owner
#                                         # 130207      u468     1
#                                         # 130254      u468     1 ...
#                                         # Name: job_number, Length: 184687, dtype: int64

In [None]:
# jobs_by_user = []

# for u in users:
# #     print(u, ": ", len(df[(df['owner'] == u)]), sep = '')
#     jobs_by_user.append(str(u
#                             + ": " 
#                             + str(len(df[(df['owner'] == u)]))))
# for i in jobs_by_user:
#     print(i)                # This is based on each row being a unique job, which we now know is not the case:
#                             # u1: 99019
#                             # u2: 1050597
#                             # u3: 347511

This is based on each row being a unique job, which we now know is not the case:

```
u1: 99019
u2: 1050597
u3: 347511
u4: 16679
u5: 909
u6: 356
u7: 878
u8: 499
u9: 429
u10: 8762
u11: 38295
u12: 2780
u13: 1412
u14: 12413
u15: 66580
u16: 4992
u17: 38
u18: 439203
u19: 858
u20: 7082
u21: 555
u22: 44245
u23: 2264
u24: 42
u25: 2051
u26: 73
u27: 2115
u28: 858
u29: 385
u30: 324
u31: 1768
u32: 296
u33: 130
u34: 143
u35: 422
u36: 3166
u37: 44
u38: 1501
u39: 16007
u40: 2242
u41: 1384
u42: 530
u43: 73
u44: 48
u45: 2075
u46: 107
u47: 144
u48: 266
u49: 25
u50: 15
u51: 237
u52: 345
u53: 5348
u54: 609
u55: 74
u56: 50
u57: 496
u58: 37
u59: 2559
u60: 1194
u61: 116
u62: 1751
u63: 370
u64: 16464
u65: 235
u66: 72
u67: 180
u68: 24
u69: 1895
u70: 1167
u71: 39
u72: 320
u73: 889
u74: 6863
u75: 32535
u76: 95
u77: 2054
u78: 127
u79: 50
u80: 520
u81: 550
u82: 225
u83: 30
u84: 52
u85: 24
u86: 435
u87: 289
u88: 531
u89: 540
u90: 3135
u91: 4854
u92: 52
u93: 79
u94: 11366
u95: 223
u96: 54
u97: 9
u98: 26
u99: 42
u100: 11
u101: 11689
u102: 96
u103: 788
u104: 52
u105: 90
u106: 2225
u107: 18581
u108: 5
u109: 44
u110: 790
u111: 1249
u112: 8908
u113: 30
u114: 394
u115: 54
u116: 33
u117: 69
u118: 2115
u119: 51429
u120: 256
u121: 1269
u122: 673
u123: 71
u124: 1451
u125: 37
u126: 94
u127: 9158
u128: 47
u129: 4
u130: 310
u131: 119
u132: 25143
u133: 125
u134: 105
u135: 70
u136: 25
u137: 72
u138: 16232
u139: 385
u140: 49
u141: 96
u142: 20
u143: 4933
u144: 35
u145: 2
u146: 1834
u147: 16
u148: 1536
u149: 102
u150: 1352
u151: 498
u152: 514
u153: 13
u154: 762
u155: 75
u156: 1914
u157: 27
u158: 8545
u159: 1737
u160: 94
u161: 1
u162: 1398
u163: 285
u164: 570
u165: 796
u166: 24003
u167: 61
u168: 2
u169: 6123
u170: 138
u171: 31
u172: 50
u173: 78
u174: 111
u175: 4
u176: 42
u177: 51
u178: 2823
u179: 15
u180: 207
u181: 598
u182: 162
u183: 5
u184: 760
u185: 10
u186: 11
u187: 34
u188: 74
u189: 188
u190: 1198
u191: 1288
u192: 12
u193: 241
u194: 44555
u195: 8614
u196: 143
u197: 25
u198: 158
u199: 280
u200: 1569
u201: 161
u202: 634
u203: 6
u204: 6
u205: 9
u206: 9
u207: 2
u208: 5
u209: 68
u210: 5
u211: 6
u212: 1
u213: 2125
u214: 20
u215: 5
u216: 33
u217: 12
u218: 55
u219: 1150
u220: 142
u221: 31
u222: 62
u223: 170
u224: 2173
u225: 4
u226: 620
u227: 586
u228: 69
u229: 130
u230: 847
u231: 11
u232: 1071
u233: 103
u234: 9
u235: 27
u236: 38
u237: 90
u238: 10
u239: 191
u240: 40
u241: 12
u242: 49
u243: 398
u244: 180
u245: 20
u246: 73
u247: 81
u248: 109
u249: 79
u250: 1350135
u251: 14
u252: 15021
u253: 78
u254: 154
u255: 202
u256: 6
u257: 158
u258: 5406
u259: 877
u260: 117
u261: 6
u262: 144
u263: 99
u264: 160
u265: 11
u266: 77
u267: 152
u268: 43
u269: 2
u270: 1517
u271: 13
u272: 200
u273: 71
u274: 56
u275: 8
u276: 261
u277: 2
u278: 11
u279: 1303
u280: 762
u281: 8
u282: 156
u283: 25
u284: 20
u285: 62
u286: 5
u287: 20
u288: 3
u289: 43
u290: 5208
u291: 4
u292: 26
u293: 6
u294: 10
u295: 12
u296: 7
u297: 85
u298: 178
u299: 2
u300: 12
u301: 9725
u302: 328
u303: 5
u304: 11
u305: 57
u306: 11
u307: 12
u308: 5
u309: 13
u310: 2
u311: 7
u312: 5
u313: 3
u314: 7
u315: 4
u316: 4
u317: 4
u318: 2
u319: 9
u320: 186056
u321: 2
u322: 6
u323: 137
u324: 22
u325: 7
u326: 53
u327: 3
u328: 10
u329: 7
u330: 12
u331: 2
u332: 1103
u333: 88
u334: 59
u335: 5
u336: 330
u337: 8
u338: 17
u339: 8
u340: 1
u341: 6
u342: 130
u343: 10
u344: 82
u345: 15
u346: 234
u347: 41
u348: 3
u349: 1
u350: 4
u351: 366
u352: 1
u353: 4
u354: 79
u355: 111
u356: 41
u357: 571
u358: 578
u359: 9
u360: 46
u361: 27
u362: 10
u363: 1
u364: 17556
u365: 326
u366: 8
u367: 2
u368: 24
u369: 21
u370: 571
u371: 50
u372: 3
u373: 7
u374: 4683
u375: 14
u376: 18
u377: 18
u378: 3239
u379: 90
u380: 18
u381: 261
u382: 683
u383: 247
u384: 7
u385: 42
u386: 161
u387: 36
u388: 1
u389: 56
u390: 708
u391: 226
u392: 436
u393: 49
u394: 17
u395: 228
u396: 31
u397: 20
u398: 20
u399: 20
u400: 36
u401: 151
u402: 32
u403: 5
u404: 115
u405: 21
u406: 62
u407: 242
u408: 600
u409: 567
u410: 2
u411: 9
u412: 9
u413: 4
u414: 16
u415: 4
u416: 5
u417: 58
u418: 75
u419: 178
u420: 5
u421: 17
u422: 4
u423: 1
u424: 2
u425: 10
u426: 11
u427: 15
u428: 1645
u429: 76
u430: 23
u431: 73
u432: 409
u433: 1
u434: 1
u435: 4
u436: 1
u437: 84
u438: 21
u439: 2
u440: 4290
u441: 6
u442: 59
u443: 1
u444: 12
u445: 76
u446: 20
u447: 1
u448: 37
u449: 1
u450: 2071
u451: 34
u452: 11
u453: 52156
u454: 2
u455: 2489
u456: 5
u457: 103
u458: 75
u459: 4151
u460: 1
u461: 17
u462: 29
u463: 1600
u464: 9
u465: 3
u466: 4
u467: 17
u468: 4
u469: 5
u470: 5
u471: 15
u472: 4
u473: 6
u474: 10
u475: 376
u476: 134
u477: 9
u478: 1
u479: 24
u480: 28
u481: 8
u482: 169
u483: 734
u484: 5
u485: 11
u486: 1374
u487: 3
u488: 18
u489: 1605
u490: 4
u491: 3
u492: 56
u493: 5248
u494: 4
u495: 2
u496: 714
u497: 4
u498: 1
u499: 172
u500: 18
u501: 4
u502: 505
u503: 72
u504: 16
u505: 51
u506: 6
u507: 1
u508: 38
u509: 23
u510: 24
u511: 36
u512: 27
u513: 3
u514: 71
u515: 25
u516: 1
u517: 16
u518: 18
u519: 1
u520: 96
u521: 8
u522: 1004
u523: 5
u524: 38
u525: 19
u526: 10647
u527: 2
u528: 1
u529: 72
u530: 7
u531: 2
u532: 124
u533: 605
u534: 113
u535: 7909
u536: 2
u537: 94
u538: 1
u539: 25
u540: 26
u541: 21
u542: 42
u543: 3
u544: 1
u545: 36
u546: 1
u547: 19
u548: 299
u549: 33
u550: 59
u551: 9
u552: 32
u553: 1
u554: 2
u555: 711
u556: 22
u557: 242
u558: 102
u559: 8
u560: 13
u561: 3
u562: 1
u563: 82
u564: 13
u565: 3
u566: 3
u567: 1
u568: 8
u569: 6
u570: 1
u571: 616
u572: 1
u573: 238
u574: 5
u575: 61
u576: 54
u577: 1
u578: 28
u579: 88
u580: 932
u581: 2
u582: 106020
u583: 7
u584: 54
u585: 1
u586: 6
u587: 11
u588: 1
u589: 11
u590: 43
u591: 4
u592: 20
u593: 21
u594: 94
u595: 2
u596: 1
u597: 158
u598: 15
u599: 2
u600: 95
u601: 34
u602: 2
u603: 386
u604: 2
u605: 3324
u606: 2
u607: 20
u608: 1
u609: 12
u610: 7
u611: 1
u612: 1
u613: 12
u614: 235
u615: 2
u616: 3
u617: 22
u618: 1
u619: 5
u620: 4
u621: 1862
u622: 7
u623: 16
u624: 3
u625: 3311
u626: 1626
u627: 27
u628: 10011
u629: 15
u630: 35
u631: 1
u632: 255
u633: 2
u634: 5
u635: 173
u636: 3
u637: 163
u638: 77
u639: 55
u640: 16
u641: 24
u642: 803
u643: 4
u644: 89
u645: 15
u646: 4
u647: 3
u648: 3
u649: 9
u650: 2
u651: 200
u652: 11
u653: 96
u654: 47
u655: 8
u656: 7
u657: 345
u658: 2
u659: 2
u660: 2
u661: 5
u662: 958
u663: 3
u664: 15
u665: 1
u666: 7
u667: 6
u668: 7
u669: 12
u670: 2
u671: 15
u672: 101
u673: 5
u674: 33
u675: 108
u676: 20
u677: 6
u678: 1432
u679: 1
u680: 2
u681: 1
u682: 195
u683: 5
u684: 1
u685: 3
u686: 3
u687: 1
u688: 42900
u689: 12
u690: 1
u691: 1
u692: 1056
u693: 808
u694: 3
u695: 3
u696: 15
u697: 22
u698: 20
u699: 1```

In [None]:
# # df[['owner','job_number']]['job_number'] == 3912841  # 0          True 
# #                                                      # 1          False
# #                                                      # 2          True 
# #                                                      # 3          False
            
# # len(df[['owner','job_number']]['job_number'] == 3912841)  # 4468061. Counts T&F. I want T only.
# # len((df[['owner','job_number']]['job_number'] == 3912841) == True)  # 4468061. ???
# # (df[['owner','job_number']]['job_number'] == 3912841).unique()  # array([ True, False])

# # WORKS
# df[['owner','job_number']]['job_number'].unique()  # array([3912841, 3902779, 3907911, ..., 4112011, 4114770, 4114821], dtype=int64)
# len(df[['owner','job_number']]['job_number'].unique())  # 184688. Number of unique job numbers


In [None]:
# type(df[['owner','job_number']]['job_number'].unique())  # numpy.ndarray
# pd.DataFrame(df[['owner','job_number']]['job_number'].unique())  # 	          0
#                                                                  # 0	3912841
#                                                                  # 1	3902779
#                                                                  # 2	3907911
            
# df[['owner','job_number']]['job_number'].unique()

# df.groupby(['owner','job_number'])['job_number'].size()  # owner  job_number
#                                                          # u1     3912841       1645
#                                                          #        3912842       3000
#                                                          #        3923147       3000
# len(df.groupby(['owner','job_number'])['job_number'].size())  # 184688

# df.groupby(['owner','job_number'])['job_number'].unique()   # owner  job_number
#                                                             # u1     3912841       [3912841]
#                                                             #        3912842       [3912842]
#                                                             #        3923147       [3923147]            
# len(df.groupby(['owner','job_number'])['job_number'].unique())  # 184688

# df.groupby(['job_number', 'owner'])['job_number'].unique()  # job_number  owner
#                                                             # 130207      u468      [130207]
#                                                             # 130254      u468      [130254]
#                                                             # 130362      u468      [130362]
# len(df.groupby(['job_number', 'owner'])['job_number'].unique())  # 184688

# df.groupby(['job_number', 'owner'])['owner'].unique()       # job_number  owner
#                                                             # 130207      u468     [u468]
#                                                             # 130254      u468     [u468]
#                                                             # 130362      u468     [u468]
# len(df.groupby(['job_number', 'owner'])['owner'].unique())  # 184688

# # (df.groupby(['job_number', 'owner'])
# #  [['job_number', 'owner']].unique())  # AttributeError: 'DataFrameGroupBy' object has no attribute 'unique'

# # (df.groupby(['owner','job_number'])['job_number']
# #  .unique()
# #  .size())  # TypeError: 'int' object is not callable

In [None]:
# fix the above to count each uniq job number as 1 
# O(n^2)  

# groups =      df['group'].unique()
# users =       df['owner'].unique()
# job_numbers = df['job_number'].unique()
# slot =        df['slot'].unique()
# slots =       df['slots'].unique()

jobs_by_user = []

# for u in df['owner'].unique():
#     for j in df['job_number'].unique(): 
# #         print(u, type(u))
# #         print(j, type(j))
# #         print(len(df[(df['job_number'] == j)]), type(len(df[(df['job_number'] == j)])))
# #         print(u, j, len(df[(df['job_number'] == j)]))
#         print("User:", u, " *** Job number:", j, " *** on", len(df[(df['job_number'] == j)]), "rows")
# #         print("User:", u, " *** Job number:", j, " *** 1 job")

# User: u1  *** Job number: 3912841  *** on 1645 rows
# User: u1  *** Job number: 3902779  *** on 19374 rows
# User: u1  *** Job number: 3907911  *** on 1770 rows
# User: u1  *** Job number: 3913291  *** on 6 rows
# User: u1  *** Job number: 3914733  *** on 1 rows
# User: u1  *** Job number: 3914732  *** on 1 rows

        
#         jobs_by_user.append(str(u
#                                 + ": "
#                                 + str(len(df[(df['owner'] == u)]))))
        
#         jobs_by_user.append(str(u + ", job " + j + " : "
#                                 + len(df[df['job_number'] == str(j)])))
        
# for i in jobs_by_user:
#     print(i)

** Big O notation and `timeit`**  
- Calculate Big O notation
- Time execution of `for` loop

In [None]:
# # define statement string
# # O(N)

# # s = """\
# # for u in df['owner'].unique():
# #     print(u, ': ', len(df[(df['owner'] == u)]), sep = '')"""


# s = """\
# jobs_by_user = []
# uniq_users = df['owner'].unique()

# for u in uniq_users:
#     jobs_by_user.append(str(u
#                              + ": " 
#                              + str(len(df[(df['owner'] == u)]))))"""

In [None]:
# # define setup string
# setup = """\
# import h5py
# import pandas as pd
# df = pd.read_hdf('accounting-2018-10-deid.h5', 'table')"""

I was getting **`NameError: name 'df' is not defined`** when I ran `timeit`. One solution was to pass 'df'=df to the `timeit` function in the **`globals`** parameter. Another was to use the **`setup`** parameter.

`timeit.timeit() vs. timeit.Timer()` ???

See details at https://stackoverflow.com/questions/54164497/python-how-to-make-timeit-recognize-defined-inputs/54164538#54164538).

In [None]:
# time_iter_1_1_1 = timeit.timeit(s, 
#                                 globals={'df': df},
#                                 number = 1)
# print(time_iter_1_1_1)  # 239.14 sec

In [None]:
# time_iter_1_1_1 = min(timeit.timeit(s, 
#                                    setup = setup,
#                                    number = 2))
# #                                     globals = {'d':d, 'df': df, 'group_users': group_users})
# #            .repeat(2, 1000))

# ERROR:
x = """---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-19-ad4772804674> in <module>()
      2 time_iter_1_1_1 = min(timeit.timeit(s, 
      3                                    setup = setup,
----> 4                                    number = 2))
      5 #                                     globals = {'d':d, 'df': df, 'group_users': group_users})
      6 #            .repeat(2, 1000))

TypeError: 'float' object is not iterable"""

In [None]:
# jobs_by_group = []
# for g in df['group'].unique():
# #     print(g, ": ", len(df[(df['group'] == g)]), sep = '')
#     jobs_by_group.append(str(g 
#                              + ": " 
#                              + str(len(df[(df['group'] == g)]))))
# for i in jobs_by_group:
#     print(i)

This may be (is) double/triple/... counting some jobs.
```g1: 99021
g2: 1054916
g3: 349137
g4: 95042
g5: 3764
g6: 9162
g7: 1764671
g8: 108522
g9: 10546
g10: 64197
g11: 3125
g12: 3237
g13: 226
g14: 442031
g15: 592
g16: 44265
g17: 2865
g18: 10976
g19: 304
g20: 926
g21: 1455
g22: 2391
g23: 2266
g24: 21755
g25: 15299
g26: 1062
g27: 46
g28: 1747
g29: 385
g30: 2111
g31: 1086
g32: 295
g33: 77
g34: 1219
g35: 70115
g36: 310
g37: 7364
g38: 320
g39: 904
g40: 32545
g41: 95
g42: 337
g43: 93
g44: 2422
g45: 1700
g46: 2072
g47: 24
g48: 897
g49: 540
g50: 4642
g51: 81768
g52: 3616
g53: 16080
g54: 223
g55: 718
g56: 24
g57: 1228
g58: 90
g59: 47
g60: 127
g61: 261
g62: 1451
g63: 235
g64: 737
g65: 28
g66: 58
g67: 20
g68: 73
g69: 4640
g70: 61
g71: 6132
g72: 2623
g73: 31
g74: 78
g75: 14
g76: 50
g77: 879
g78: 137
g79: 68
g80: 188
g81: 410
g82: 143
g83: 625
g84: 584
g85: 308
g86: 161
g87: 15
g88: 150
g89: 59
g90: 2125
g91: 5
g92: 164
g93: 170
g94: 2244
g95: 4
g96: 620
g97: 144
g98: 65
g99: 86
g100: 15021
g101: 136
g102: 6
g103: 99
g104: 70
g105: 82
g106: 168
g107: 25
g108: 25
g109: 9
g110: 525
g111: 29
g112: 6
g113: 10
g114: 12
g115: 7
g116: 2
g117: 346
g118: 16
g119: 101
g120: 36
g121: 92
g122: 2
g123: 7
g124: 3
g125: 7
g126: 3
g127: 22
g128: 7
g129: 35
g130: 10
g131: 30
g132: 19
g133: 4
g134: 23
g135: 1
g136: 88
g137: 571
g138: 4982
g139: 14
g140: 18
g141: 161
g142: 115
g143: 21
g144: 242
g145: 9
g146: 5
g147: 8
g148: 2
g149: 1
g150: 1
g151: 84
g152: 2
g153: 59
g154: 12
g155: 1772
g156: 2
g157: 1
g158: 52156
g159: 9
g160: 103
g161: 4
g162: 6
g163: 9
g164: 199
g165: 45
g166: 16
g167: 5
g168: 13958
g169: 2
g170: 42
g171: 36
g172: 1
g173: 32
g174: 711
g175: 242
g176: 13
g177: 3
g178: 1
g179: 13
g180: 3
g181: 8
g182: 809
g183: 1
g184: 238
g185: 64
g186: 54
g187: 7
g188: 1
g189: 2
g190: 4
g191: 3
g192: 5
g193: 3
g194: 16
g195: 10
g196: 2
g197: 958
g198: 3
g199: 6
g200: 6
g201: 195
g202: 5
g203: 20```

In [None]:
# # TODO

# # FIX the above to single count each job

# # 

# jobs_by_group = []
# for g in df['group'].unique():
# #     print(g, ": ", len(df[(df['group'] == g)]), sep = '')
#     jobs_by_group.append(str(g 
#                              + ": " 
#                              + str(len(df[(df['group'] == g)]))))
# for i in jobs_by_group:
#     print(i)

In [None]:
# s = """\
# jobs_by_group = []
# for g in df['group'].unique():
#     jobs_by_group.append(str(g 
#                              + ": " 
#                              + str(len(df[(df['group'] == g)]))))"""

In [None]:
# time_iter_1_1_2 = timeit.timeit(s, 
#                                 globals = {'df': df},
#                                 number = 1)

# time_iter_1_1_2  # 64.72