<a href="https://colab.research.google.com/github/jillzzy/jupyter_notebooks/blob/master/pandasoptimize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Pandas Optimization
*How to optimize the performance of Pandas code by applying the following techniques to the iris dataset:*
- .iterrows()
- .apply()
- .cut()


![alt text](https://media0.giphy.com/media/wT71Ce9oKBQGc/giphy.gif)


---

Reference tutorial:

https://towardsdatascience.com/how-to-use-pandas-the-right-way-to-speed-up-your-code-4a19bd89926d

In [0]:
import seaborn as sns
import pandas as pd

# Fetch the iris measurements through seaborn's dataset loader.
data = sns.load_dataset(name='iris')

# Preview the first five rows (five is also head()'s default row count).
print(data.head(n=5))

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


### Baseline
*using a for loop*


---


![alt text](https://media1.tenor.com/images/c1e31c6eb9b4249c82d217777b279e6b/tenor.gif?itemid=11956161)

In [0]:
import seaborn as sns
import time 

def compute_class(petal_length):
    """Bucket a petal length into an integer class.

    Returns 1 when ``petal_length`` is at most 2, 2 when it lies strictly
    between 2 and 5, and 3 in every other case.
    """
    # Guard clauses, smallest bucket first.
    if petal_length <= 2:
        return 1
    if petal_length < 5:
        # The first guard already ruled out values of 2 or less.
        return 2
    return 3

# Baseline: visit every row by position and classify its petal length.
# The per-row `data.iloc[i]` lookup is kept on purpose -- it is the pattern
# this cell measures.
# perf_counter() is a monotonic, high-resolution clock, which suits a run of
# only a few milliseconds better than the wall-clock time.time().
start = time.perf_counter()

class_list = []
for i in range(len(data)):
    petal_length = data.iloc[i]['petal_length']
    class_num = compute_class(petal_length)
    class_list.append(class_num)

end = time.perf_counter()
print("For-loop run time = {}".format(end - start))

For-loop run time = 0.05428910255432129


# Optimization 1
*.iterrows()*

In [0]:
import seaborn as sns
import time

# Load the iris table again so this cell works from its own DataFrame.
data = sns.load_dataset(name='iris')

def compute_class(petal_length):
    """Bucket a petal length into an integer class.

    Returns 1 when ``petal_length`` is at most 2, 2 when it lies strictly
    between 2 and 5, and 3 in every other case.
    """
    # Guard clauses, smallest bucket first.
    if petal_length <= 2:
        return 1
    if petal_length < 5:
        # The first guard already ruled out values of 2 or less.
        return 2
    return 3

# Classify every row while iterating with DataFrame.iterrows().
# perf_counter() is a monotonic, high-resolution clock, which suits a run of
# only a few milliseconds better than the wall-clock time.time().
start = time.perf_counter()

class_list = []
# iterrows() yields (index, row) pairs; the index label is not needed here.
for _, data_row in data.iterrows():
    petal_length = data_row['petal_length']
    class_num = compute_class(petal_length)
    class_list.append(class_num)

end = time.perf_counter()
print("Iterrows run time = {}".format(end - start))

Iterrows run time = 0.02191615104675293


# Optimization 2
*.apply()*

In [0]:
import seaborn as sns
import time

# Load the iris table again so this cell works from its own DataFrame.
data = sns.load_dataset(name='iris')

def compute_class(petal_length):
    """Bucket a petal length into an integer class.

    Returns 1 when ``petal_length`` is at most 2, 2 when it lies strictly
    between 2 and 5, and 3 in every other case.
    """
    # Guard clauses, smallest bucket first.
    if petal_length <= 2:
        return 1
    if petal_length < 5:
        # The first guard already ruled out values of 2 or less.
        return 2
    return 3

# Classify every row with a row-wise DataFrame.apply (axis=1 passes each row
# to the lambda as a Series).
# perf_counter() is a monotonic, high-resolution clock, which suits a run of
# only a few milliseconds better than the wall-clock time.time().
start = time.perf_counter()

class_list = data.apply(lambda row: compute_class(row['petal_length']), axis=1)

end = time.perf_counter()
print(".apply() run time = {}".format(end - start))

.apply() run time = 0.008765697479248047


# Optimization 3
*.cut()*

In [0]:
import seaborn as sns
import time
import pandas as pd

# Load the iris table again so this cell works from its own DataFrame.
data = sns.load_dataset(name='iris')

# Local import: numpy is only needed in this cell, to build the bin edges.
import numpy as np

# Bin edges that reproduce compute_class() exactly:
#   (-inf, 2]  -> 1
#   (2, 5)     -> 2
#   [5, +inf)  -> 3
# pd.cut closes every interval on the right by default, so edges of
# [0, 2, 5, 100] would put petal_length == 5 in class 2, whereas the for-loop,
# iterrows and apply cells put it in class 3. Using the largest float below 5
# as the edge keeps 5 itself in the last bin. The infinite outer edges also
# avoid NaN results (and a failing astype(int)) for values outside 0..100.
bin_edges = [-np.inf, 2, np.nextafter(5, 0), np.inf]

# perf_counter() is a monotonic, high-resolution clock, which suits a run of
# only a few milliseconds better than the wall-clock time.time().
start = time.perf_counter()

class_list = pd.cut(x=data.petal_length,
                    bins=bin_edges,
                    include_lowest=True,
                    labels=[1, 2, 3]).astype(int)

end = time.perf_counter()
print(".cut() run time = {}".format(end - start))

.cut() run time = 0.010038137435913086
