# Progress Bar

In [51]:
# for console output
# from tqdm import tqdm

# for jupyter notebook
from tqdm.notebook import tqdm

import pandas as pd
from pandarallel import pandarallel

import time

In [37]:
# Minimal tqdm demo: wrap any iterable in tqdm(...) to get a progress bar.
# Each iteration pauses for one second so the bar advances visibly.
j = 0
for i in tqdm(range(100)):
    j = j + i
    time.sleep(1)

# Sum of 0..99.
print(j)

  0%|          | 0/100 [00:00<?, ?it/s]

4950


If you get an error when displaying the progress bar, reinstall `ipywidgets`:

```bash
pip uninstall ipywidgets
pip install ipywidgets
```

# Dataframe

About the dataset: [Bike Sharing Dataset on Kaggle](https://www.kaggle.com/datasets/lakshmi25npathi/bike-sharing-dataset)

In [13]:
# Read the bike-sharing CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv("../data/bike.csv")
df.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [14]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(10886, 12)

## Feature Engineering

### Iterrows()

In [39]:
# Lookup tables mapping the integer codes to readable labels.
seasons = {1: "spring", 2: "summer", 3: "fall", 4: "winter"}
weather = {1: "clear", 2: "mist", 3: "light", 4: "heavy"}

# Create both target columns up front; the loop below fills them cell by cell.
df["season_name"] = ""
df["weather_name"] = ""

# iterrows() is a generator with no length, so tqdm needs `total` passed
# explicitly to render a percentage and ETA.
# NOTE(review): row-wise .loc assignment is slow on large frames; it is used
# here to demonstrate tqdm around a plain Python loop.
for index, row in tqdm(df.iterrows(), total=len(df)):
    season_code, weather_code = row["season"], row["weather"]
    df.loc[index, "season_name"] = seasons[season_code]
    df.loc[index, "weather_name"] = weather[weather_code]
    

  0%|          | 0/10886 [00:00<?, ?it/s]

In [33]:
# Spot-check five random rows to confirm the label columns were filled.
df.sample(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,season_name,weather_name
2059,2011-05-12 17:00:00,2,0,1,2,26.24,31.06,57,12.998,54,540,594,summer,mist
7160,2012-04-16 17:00:00,2,1,0,1,32.8,34.85,33,26.0027,111,601,712,summer,clear
6082,2012-02-09 16:00:00,1,0,1,1,13.94,16.665,36,0.0,15,223,238,spring,clear
1362,2011-04-02 15:00:00,2,0,0,3,13.94,15.15,76,26.0027,72,64,136,summer,light
9020,2012-08-18 05:00:00,3,0,0,1,24.6,28.79,78,16.9979,2,7,9,fall,clear


### Pandas Apply()

In [45]:
# Hook tqdm into pandas; the `progress_apply` calls in the next cell rely on this.
tqdm.pandas()

In [58]:
# Blank out both label columns before they are recomputed with progress_apply
# further down in this cell.
df["season_name"] = ""
df["weather_name"] = ""

def apply_season(x):
    """Translate a numeric season code into its name.

    Args:
        x: Season code; one of 1, 2, 3 or 4.

    Returns:
        "spring", "summer", "fall" or "winter" for codes 1-4 respectively.

    Raises:
        KeyError: If ``x`` is not one of the four known codes.
    """
    season_by_code = dict(zip((1, 2, 3, 4), ("spring", "summer", "fall", "winter")))
    # Artificial 1 ms delay per element -- presumably there to simulate slow
    # per-row work so the progress bar is observable.
    time.sleep(0.001)
    return season_by_code[x]

def apply_weather(x):
    """Translate a numeric weather code into a short label.

    Args:
        x: Weather code; one of 1, 2, 3 or 4.

    Returns:
        "clear", "mist", "light" or "heavy" for codes 1-4 respectively.

    Raises:
        KeyError: If ``x`` is not one of the four known codes.
    """
    codes = (1, 2, 3, 4)
    labels = ("clear", "mist", "light", "heavy")
    # Artificial 1 ms delay per element -- presumably there to simulate slow
    # per-row work so the progress bar is observable.
    time.sleep(0.001)
    if x not in codes:
        raise KeyError(x)
    return labels[codes.index(x)]

# progress_apply is used here in place of Series.apply; each call renders its
# own tqdm bar, which is why two bars appear in the output.
df["season_name"] = df["season"].progress_apply(apply_season)
df["weather_name"] = df["weather"].progress_apply(apply_weather)

  0%|          | 0/10886 [00:00<?, ?it/s]

  0%|          | 0/10886 [00:00<?, ?it/s]

In [52]:
# Spot-check five random rows to confirm progress_apply filled both columns.
df.sample(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,season_name,weather_name
9860,2012-10-15 05:00:00,4,0,1,2,22.96,26.515,73,19.0012,2,38,40,winter,mist
7613,2012-05-16 14:00:00,2,0,1,1,29.52,32.575,42,6.0032,42,221,263,summer,clear
3120,2011-07-18 22:00:00,3,0,1,1,31.16,36.365,66,15.0013,36,104,140,fall,clear
8317,2012-07-07 22:00:00,3,0,0,1,34.44,41.665,59,12.998,56,164,220,fall,clear
5695,2012-01-12 11:00:00,1,0,1,2,13.94,17.425,81,7.0015,18,101,119,spring,mist


## Parallel Apply

In [53]:
# Initialise pandarallel with progress bars enabled for the parallel_apply
# call later in the notebook.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

If the parallel run misbehaves, see the pandarallel troubleshooting guide: https://nalepae.github.io/pandarallel/troubleshooting/


In [57]:
# Clear the column so the parallel run below repopulates it from scratch.
df["season_name"] = ""

def apply_season(x):
    """Translate a numeric season code into its name (parallel_apply variant).

    This redefinition replaces the earlier notebook-level ``apply_season``.

    Args:
        x: Season code; one of 1, 2, 3 or 4.

    Returns:
        "spring", "summer", "fall" or "winter" for codes 1-4 respectively.

    Raises:
        KeyError: If ``x`` is not one of the four known codes.
    """
    # Function-scope import -- presumably so the function is self-contained
    # when pandarallel runs it in worker processes; confirm before hoisting.
    import time

    season_by_code = dict(zip((1, 2, 3, 4), ("spring", "summer", "fall", "winter")))
    # Artificial 1 ms delay per element, matching the sequential version.
    time.sleep(0.001)
    return season_by_code[x]

# Time the parallel apply. perf_counter() is a monotonic, high-resolution
# clock meant for measuring intervals; time.time() is wall-clock time and can
# jump if the system clock is adjusted while the cell is running.
t0 = time.perf_counter()
df["season_name"] = df["season"].parallel_apply(apply_season)
t1 = time.perf_counter()

# Elapsed time in seconds for the parallel apply.
print(f"It took: {t1-t0:.5f} sec")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2722), Label(value='0 / 2722'))), …

It took: 46.37058 sec
