In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tracemalloc


# Download the four gzip-compressed dimuon CSV samples (file tags 91-94).
# index_col=False keeps pandas from using the first column as the index.
dataset91 = pd.read_csv(
    'https://raw.githubusercontent.com/kraikisto/CERN_LEP_Z_boson/main/dimuon_short91_f1_000.csv.gz',
    index_col=False,
)
dataset92 = pd.read_csv(
    'https://raw.githubusercontent.com/kraikisto/CERN_LEP_Z_boson/main/dimuon_short92_e2_000.csv.gz',
    index_col=False,
)
dataset93 = pd.read_csv(
    'https://raw.githubusercontent.com/kraikisto/CERN_LEP_Z_boson/main/dimuon_short93_d2_000.csv.gz',
    index_col=False,
)
dataset94 = pd.read_csv(
    'https://raw.githubusercontent.com/kraikisto/CERN_LEP_Z_boson/main/dimuon_short94_c2_000.csv.gz',
    index_col=False,
)

# Stack the four samples into one bigger frame; each part keeps its own row labels.
dataset = pd.concat(
    [dataset91, dataset92, dataset93, dataset94]
)


In [2]:
def first_program(
    dataset,
    sections=(
        (88.5, 711.1),
        (89.5, 632.7),
        (90.2, 622.6),
        (91.2, 2482.5),
        (92, 666.1),
        (93, 634.6),
        (93.7, 681.2),
    ),
    tolerance=0.1,
):
    """Write a luminosity into the "LUM" column of ``dataset``, in place.

    Every row whose ``ECM`` value lies strictly within ``tolerance`` of a
    section's energy receives that section's luminosity. Rows that match no
    section are not written to (they end up as NaN if this call is what
    creates the "LUM" column).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``ECM`` column. It is modified in place.
    sections : iterable of (energy, luminosity) pairs, optional
        One pair per section. The default is the seven-section table this
        notebook uses. Sections are applied in order, so if two windows
        overlap the later one wins.
    tolerance : float, optional
        Half-width of the ECM window around each section energy.

    Returns
    -------
    None
    """
    for energy, luminosity in sections:
        # One full pass over ECM builds a boolean mask for this section;
        # for the first section this is np.abs(dataset.ECM - 88.5) < 0.1.
        condition = np.abs(dataset.ECM - energy) < tolerance
        # A second full pass walks the mask and writes the matching rows.
        dataset.loc[condition, "LUM"] = luminosity
    
    
    
def second_program(dataset):
    """Return an ECM-sorted, re-indexed copy of ``dataset`` with "LUM" filled in.

    The frame passed in is not modified: sorting and re-indexing both produce
    new frames. The sorted rows are cut into seven consecutive sections, and
    the first row of sections 2-7 is found by an exact match on a hard-coded
    ECM value.

    Raises IndexError if one of those ECM values is absent from the data.
    """
    ordered = dataset.sort_values(by=["ECM"]).reset_index(drop=True)
    unique_values = ordered.ECM.unique() #simulate having to print

    # (ECM value of the first row of the NEXT section, luminosity of THIS section)
    sections = (
        (89.506, 711.1),
        (90.256, 632.7),
        (91.25, 622.6),
        (92.004, 2482.5),
        (93.015, 666.1),
        (93.765, 634.6),
    )
    last_section_lum = 681.2

    start = None  # None leaves the first slice open at the front, i.e. .loc[:stop]
    for next_section_ecm, lum in sections:
        # Full scan of the column for the first row holding this ECM value.
        stop = ordered[ordered.ECM == next_section_ecm].index[0]
        # .loc slices include both ends, so row `stop` is written here and
        # then overwritten with the next section's value on the next pass.
        ordered.loc[start:stop, "LUM"] = lum
        start = stop

    # Everything from the last boundary onwards belongs to the final section.
    ordered.loc[start:, "LUM"] = last_section_lum
    return ordered


In [3]:
import time 

#time each (program, frame) pair once, in the order the results are printed.
#perf_counter is used because it is the clock intended for measuring short
#durations; time.time() can have coarse resolution and can jump when the
#system clock is adjusted.
#note: first_program writes the LUM column into the frame it receives, so the
#second_program call on the same frame sees that extra column.
timings = []
for program, frame in (
    (first_program, dataset91),   #first program in small dataset
    (second_program, dataset91),  #second program in small dataset
    (first_program, dataset),     #first program in bigger dataset
    (second_program, dataset),    #second program in bigger dataset
):
    start_time = time.perf_counter()
    program(frame)
    end_time = time.perf_counter()
    timings.append(end_time - start_time)

result1, result2, result3, result4 = timings

print(result1)
print(result2)
print(result3)
print(result4)

0.018329620361328125
0.016561031341552734
0.012064695358276367
0.026291608810424805


This isn't the most accurate way to measure the speed of an algorithm, since it just subtracts two wall-clock readings. You should run it a few times, but you will get a basic idea of how the programs compare to each other. 

So for the smaller dataset the first one is often slower, but for the larger one it's always faster. Why is this? Well, let's look at the programs, starting with the lines: 


`condition1 = np.abs(dataset.ECM-88.5) < 0.1` program one

`index1 = dataset_sorted[dataset_sorted.ECM == 89.506].index[0]` program two

Both are actually quite similar: each runs through the entire data, checking the condition for every value. Let's say we have $N$ data points. Going through all of them then takes $aN$ time, where the constant $a$ is the time it takes to check one data point. We usually denote this as $O(N)$ time complexity.  

For the next lines: 

`dataset.loc[condition1, "LUM"] = 711.1` program one

`dataset_sorted.loc[:index1, "LUM"] = 711.1` program two

Here program one is actually worse. It goes through the entire data, checking whether the condition is True or False for every row, just like in the last lines we looked at. This is slower than directly giving the indices that need to be edited. Since program two's slice assignments together go through all data points once instead of 7 times, this step is about 7 times faster. Both programs are still $O(N)$ complexity, since both runtimes grow linearly in terms of $N$; one is just always about 7 times slower. 

But now the reason why program two is worse on a large dataset: it has to sort the data first. There are a bunch of different sorting algorithms, but this one uses quicksort by default, which is $O(N \log N)$ on average. This term grows faster than any other part of the program, so as the amount of data grows the sorting dominates, which explains why the second program is always slower for a larger dataset. 

The last two parts, resetting the indices and calculating the unique values, are both $O(N)$ for the same reasons we covered before. The complexity of the entire program is then just its largest term, which is $O(N \log N)$.

Now what about the memory usage? The tracemalloc library can be used to measure it. For accurate measurements it is important that nothing is left over in memory from earlier runs. The safest way to ensure that is to restart your kernel and run only the first 2 cells, so that everything you need is imported and defined. After that you can run the cell below.

In [6]:
# Measure the memory traced while first_program runs on the full dataset.
# Tracing is started and stopped around each call so the two measurements
# are taken separately.
tracemalloc.start()
first_program(dataset)
print(tracemalloc.get_traced_memory())  # (current, peak) traced memory in bytes
tracemalloc.stop()

# Same measurement for second_program; its return value is discarded.
tracemalloc.start()
second_program(dataset)
print(tracemalloc.get_traced_memory())
tracemalloc.stop()

(10153, 573705)
(155310, 16483973)


The first value is the current memory usage of the program and the second is the peak memory usage. The peak is what matters here, since that is what can stop your program from running. Clearly the first program is the better one, but why? If we look at which variables we actually store, it's quite obvious: at the start of the second program we store the entire dataset sorted into a different order, and then store the entire dataset again, just with different indices. An easy way to check which part of the code uses too much memory is to measure just that part and compare it to the entire program.

In [7]:
# Baseline: traced memory for the whole of second_program.
tracemalloc.start()
second_program(dataset)
print(tracemalloc.get_traced_memory())  # (current, peak) traced memory in bytes
tracemalloc.stop()

# Only the first two statements of second_program, run at cell level.
# The result stays bound to the global name dataset_sorted, so it is still
# alive when the measurement below is printed.
tracemalloc.start()
dataset_sorted = dataset.sort_values(["ECM"])
dataset_sorted = dataset_sorted.reset_index(drop=True)
print(tracemalloc.get_traced_memory())
tracemalloc.stop()

(303154, 16630625)
(4457122, 16482811)


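The peak is almost the same, and the current usage is more than ten times higher because the sorted copy is still stored in `dataset_sorted` when the measurement is taken. The sorting and re-indexing are clearly the main problem, but how do we fix it? 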

In [3]:
def second_program_improved(dataset):
    """Sort ``dataset`` by ECM, re-index it and fill in "LUM", all in place.

    Unlike a version that works on a sorted copy, this one changes the frame
    it is given and returns nothing. The sorted rows are cut into seven
    consecutive sections, and the first row of sections 2-7 is found by an
    exact match on a hard-coded ECM value.

    Raises IndexError if one of those ECM values is absent from the data.
    """
    dataset.sort_values(by=["ECM"], inplace = True)
    dataset.reset_index(drop=True, inplace = True)
    unique_values = dataset.ECM.unique() #simulate having to print

    # edges[i] is the ECM value of the first row of section i+2;
    # luminosities[i] is the luminosity of section i+1.
    edges = [89.506, 90.256, 91.25, 92.004, 93.015, 93.765]
    luminosities = [711.1, 632.7, 622.6, 2482.5, 666.1, 634.6, 681.2]

    lower = None  # None leaves the first slice open at the front, i.e. .loc[:upper]
    for edge, lum in zip(edges, luminosities):
        # Full scan of the column for the first row holding this ECM value.
        upper = dataset[dataset.ECM == edge].index[0]
        # .loc slices include both ends; row `upper` is overwritten on the
        # next pass with the following section's luminosity.
        dataset.loc[lower:upper, "LUM"] = lum
        lower = upper

    # Rows from the last edge onwards form the seventh section.
    dataset.loc[lower:, "LUM"] = luminosities[-1]




# Traced memory for the copying version.
tracemalloc.start()
second_program(dataset)
print(tracemalloc.get_traced_memory())  # (current, peak) traced memory in bytes
tracemalloc.stop()

# Traced memory for the in-place version. This call sorts and re-indexes the
# global `dataset` itself, so `dataset` stays modified after this cell.
tracemalloc.start()
second_program_improved(dataset)
print(tracemalloc.get_traced_memory())
tracemalloc.stop()

(266297, 15801264)
(4647288, 12208766)


It's important to remember that this function now changes the dataset that we give it, and sometimes this is not advised. However, when dealing with large sets of data it's generally recommended to edit the existing dataset instead of creating a copy to edit, unless you know that you will need the original again. 