# Preprocess Data for Time Until Overripe

### Import dependencies

In [1]:
import pandas as pd
import numpy as np

### Import Dataset

In [2]:
# Read the ripening annotation spreadsheet into a DataFrame.
data = pd.read_excel(
    '../data/external/Hass Avocado Ripening Photographic Dataset/Avocado Ripening Dataset.xlsx'
)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,File Name,Time Stamp,Storage Group,Sample,Day of Experiment,Ripening Index Classification
0,T20_d01_001_a_1,2022-04-04 18:56:55,T20,1,1,1
1,T20_d01_001_b_1,2022-04-04 18:57:03,T20,1,1,1
2,T20_d02_001_a_1,2022-04-05 14:16:21,T20,1,2,1
3,T20_d02_001_b_1,2022-04-05 14:16:46,T20,1,2,1
4,T20_d03_001_a_2,2022-04-06 15:08:50,T20,1,3,2


Select every observation whose ripening index is marked as overripe (class 5)

In [4]:
# Keep only the rows whose ripening index equals 5.
overripe_days = data.loc[data['Ripening Index Classification'].eq(5)]

In [5]:
# Preview the first five overripe rows.
overripe_days.head(n=5)

Unnamed: 0,File Name,Time Stamp,Storage Group,Sample,Day of Experiment,Ripening Index Classification
16,T20_d09_001_a_5,2022-04-12 14:44:53,T20,1,9,5
17,T20_d09_001_b_5,2022-04-12 14:55:35,T20,1,9,5
18,T20_d10_001_a_5,2022-04-13 13:26:47,T20,1,10,5
19,T20_d10_001_b_5,2022-04-13 13:26:55,T20,1,10,5
20,T20_d11_001_a_5,2022-04-14 16:33:07,T20,1,11,5


Find the first day of overripening for each sample

In [6]:
# Per sample, take the smallest day number and the smallest time stamp
# found among the overripe rows.
first_overripe_day = (
    overripe_days
    .groupby('Sample')[['Day of Experiment', 'Time Stamp']]
    .agg('min')
)

In [7]:
# Preview up to thirty rows of the per-sample table.
first_overripe_day.head(n=30)

Unnamed: 0_level_0,Day of Experiment,Time Stamp
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1
1,9,2022-04-12 14:44:53
2,19,2022-04-22 13:35:44
3,18,2022-04-21 13:58:57
4,9,2022-04-12 15:41:19
5,9,2022-04-12 14:20:47
7,19,2022-04-22 13:12:02
8,11,2022-04-14 14:53:38
9,8,2022-04-11 16:52:31
10,8,2022-04-11 14:46:14
11,21,2022-04-24 15:17:57


Create new feature (Day of Overripening)

In [8]:
# Look up each row's sample in the per-sample table; samples that have no
# entry there end up with a missing value.
data['Overripening Day'] = data['Sample'].map(
    first_overripe_day['Day of Experiment']
)

In [9]:
# Preview the first 200 rows to check the new column.
data.head(n=200)

Unnamed: 0,File Name,Time Stamp,Storage Group,Sample,Day of Experiment,Ripening Index Classification,Overripening Day
0,T20_d01_001_a_1,2022-04-04 18:56:55,T20,1,1,1,9.0
1,T20_d01_001_b_1,2022-04-04 18:57:03,T20,1,1,1,9.0
2,T20_d02_001_a_1,2022-04-05 14:16:21,T20,1,2,1,9.0
3,T20_d02_001_b_1,2022-04-05 14:16:46,T20,1,2,1,9.0
4,T20_d03_001_a_2,2022-04-06 15:08:50,T20,1,3,2,9.0
...,...,...,...,...,...,...,...
195,T10_d04_007_b_1,2022-04-07 11:07:33,T10,7,4,1,19.0
196,T10_d05_007_a_1,2022-04-08 12:00:33,T10,7,5,1,19.0
197,T10_d05_007_b_1,2022-04-08 12:00:41,T10,7,5,1,19.0
198,T10_d06_007_a_1,2022-04-09 12:19:16,T10,7,6,1,19.0


Create new Feature (Time Stamp of Overripening)

In [10]:
# Same per-sample lookup as for the day number, this time pulling the
# earliest overripe time stamp.
data['Overripening Time Stamp'] = data['Sample'].map(
    first_overripe_day['Time Stamp']
)

In [11]:
# Preview the first five rows with the new time-stamp column.
data.head(n=5)

Unnamed: 0,File Name,Time Stamp,Storage Group,Sample,Day of Experiment,Ripening Index Classification,Overripening Day,Overripening Time Stamp
0,T20_d01_001_a_1,2022-04-04 18:56:55,T20,1,1,1,9.0,2022-04-12 14:44:53
1,T20_d01_001_b_1,2022-04-04 18:57:03,T20,1,1,1,9.0,2022-04-12 14:44:53
2,T20_d02_001_a_1,2022-04-05 14:16:21,T20,1,2,1,9.0,2022-04-12 14:44:53
3,T20_d02_001_b_1,2022-04-05 14:16:46,T20,1,2,1,9.0,2022-04-12 14:44:53
4,T20_d03_001_a_2,2022-04-06 15:08:50,T20,1,3,2,9.0,2022-04-12 14:44:53


Compute the time until overripeness by subtracting the Time Stamp features

In [12]:
def compute_overripeness_time(time_stamp, overripe_time_stamp):
    """Return how long after ``time_stamp`` the overripe time stamp occurs.

    Parameters
    ----------
    time_stamp
        Moment of the observation; anything ``pd.Timestamp`` accepts.
    overripe_time_stamp
        Moment the sample was first marked overripe, or a missing value.

    Returns
    -------
    pd.Timedelta or float
        ``np.nan`` when ``overripe_time_stamp`` is missing. Otherwise the
        difference ``overripe_time_stamp - time_stamp``, with negative
        differences replaced by a zero-length timedelta.
    """
    # No overripe time stamp available: nothing to subtract from.
    if pd.isna(overripe_time_stamp):
        return np.nan

    zero = pd.Timedelta(0)
    remaining = pd.Timestamp(overripe_time_stamp) - pd.Timestamp(time_stamp)
    # Observations taken after the overripe moment are clamped to zero.
    return zero if remaining < zero else remaining

In [13]:
# Row by row, measure the gap between the observation and the sample's first
# overripe time stamp. The column name is kept exactly as later cells read it.
data['Time Unitl Overripening'] = data[['Time Stamp', 'Overripening Time Stamp']].apply(
    lambda row: compute_overripeness_time(
        row['Time Stamp'],
        row['Overripening Time Stamp'],
    ),
    axis=1,
)

In [14]:
# Preview the first 200 rows to check the computed timedeltas.
data.head(n=200)

Unnamed: 0,File Name,Time Stamp,Storage Group,Sample,Day of Experiment,Ripening Index Classification,Overripening Day,Overripening Time Stamp,Time Unitl Overripening
0,T20_d01_001_a_1,2022-04-04 18:56:55,T20,1,1,1,9.0,2022-04-12 14:44:53,7 days 19:47:58
1,T20_d01_001_b_1,2022-04-04 18:57:03,T20,1,1,1,9.0,2022-04-12 14:44:53,7 days 19:47:50
2,T20_d02_001_a_1,2022-04-05 14:16:21,T20,1,2,1,9.0,2022-04-12 14:44:53,7 days 00:28:32
3,T20_d02_001_b_1,2022-04-05 14:16:46,T20,1,2,1,9.0,2022-04-12 14:44:53,7 days 00:28:07
4,T20_d03_001_a_2,2022-04-06 15:08:50,T20,1,3,2,9.0,2022-04-12 14:44:53,5 days 23:36:03
...,...,...,...,...,...,...,...,...,...
195,T10_d04_007_b_1,2022-04-07 11:07:33,T10,7,4,1,19.0,2022-04-22 13:12:02,15 days 02:04:29
196,T10_d05_007_a_1,2022-04-08 12:00:33,T10,7,5,1,19.0,2022-04-22 13:12:02,14 days 01:11:29
197,T10_d05_007_b_1,2022-04-08 12:00:41,T10,7,5,1,19.0,2022-04-22 13:12:02,14 days 01:11:21
198,T10_d06_007_a_1,2022-04-09 12:19:16,T10,7,6,1,19.0,2022-04-22 13:12:02,13 days 00:52:46


### Shelf-life Days 

In [15]:
# 24 hours * 60 minutes * 60 seconds
seconds_in_a_day = 24 * 60 * 60
# Express the timedelta column as a fractional number of days.
data['Shelf-life Days'] = (
    data['Time Unitl Overripening']
    .dt.total_seconds()
    .div(seconds_in_a_day)
)

In [16]:
# Preview the first 200 rows to check the shelf-life column.
data.head(n=200)

Unnamed: 0,File Name,Time Stamp,Storage Group,Sample,Day of Experiment,Ripening Index Classification,Overripening Day,Overripening Time Stamp,Time Unitl Overripening,Shelf-life Days
0,T20_d01_001_a_1,2022-04-04 18:56:55,T20,1,1,1,9.0,2022-04-12 14:44:53,7 days 19:47:58,7.824977
1,T20_d01_001_b_1,2022-04-04 18:57:03,T20,1,1,1,9.0,2022-04-12 14:44:53,7 days 19:47:50,7.824884
2,T20_d02_001_a_1,2022-04-05 14:16:21,T20,1,2,1,9.0,2022-04-12 14:44:53,7 days 00:28:32,7.019815
3,T20_d02_001_b_1,2022-04-05 14:16:46,T20,1,2,1,9.0,2022-04-12 14:44:53,7 days 00:28:07,7.019525
4,T20_d03_001_a_2,2022-04-06 15:08:50,T20,1,3,2,9.0,2022-04-12 14:44:53,5 days 23:36:03,5.983368
...,...,...,...,...,...,...,...,...,...,...
195,T10_d04_007_b_1,2022-04-07 11:07:33,T10,7,4,1,19.0,2022-04-22 13:12:02,15 days 02:04:29,15.086447
196,T10_d05_007_a_1,2022-04-08 12:00:33,T10,7,5,1,19.0,2022-04-22 13:12:02,14 days 01:11:29,14.049641
197,T10_d05_007_b_1,2022-04-08 12:00:41,T10,7,5,1,19.0,2022-04-22 13:12:02,14 days 01:11:21,14.049549
198,T10_d06_007_a_1,2022-04-09 12:19:16,T10,7,6,1,19.0,2022-04-22 13:12:02,13 days 00:52:46,13.036644


### One-hot encoding Temperature Feature

In [17]:
# Append one integer indicator column per distinct 'Storage Group' value.
data = pd.concat(
    [data, pd.get_dummies(data['Storage Group'], dtype=int)],
    axis='columns',
)

### Drop Storage Group Column

In [18]:
# Remove the 'Storage Group' column from the frame in place.
data.drop(columns=['Storage Group'], inplace=True)

### Drop NaN values

In [19]:
# Discard, in place, every row that has a missing value in any column.
data.dropna(axis=0, how='any', inplace=True)

In [20]:
# Preview the first five rows of the cleaned frame.
data.head(n=5)

Unnamed: 0,File Name,Time Stamp,Sample,Day of Experiment,Ripening Index Classification,Overripening Day,Overripening Time Stamp,Time Unitl Overripening,Shelf-life Days,T10,T20,Tam
0,T20_d01_001_a_1,2022-04-04 18:56:55,1,1,1,9.0,2022-04-12 14:44:53,7 days 19:47:58,7.824977,0,1,0
1,T20_d01_001_b_1,2022-04-04 18:57:03,1,1,1,9.0,2022-04-12 14:44:53,7 days 19:47:50,7.824884,0,1,0
2,T20_d02_001_a_1,2022-04-05 14:16:21,1,2,1,9.0,2022-04-12 14:44:53,7 days 00:28:32,7.019815,0,1,0
3,T20_d02_001_b_1,2022-04-05 14:16:46,1,2,1,9.0,2022-04-12 14:44:53,7 days 00:28:07,7.019525,0,1,0
4,T20_d03_001_a_2,2022-04-06 15:08:50,1,3,2,9.0,2022-04-12 14:44:53,5 days 23:36:03,5.983368,0,1,0


### Convert DataFrame to CSV

In [22]:
# Write the processed frame to disk without the index column.
data.to_csv(
    path_or_buf='../data/processed/avocado_ripening_data.csv',
    index=False,
)