In [1]:
import pandas as pd
import numpy as np

from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file
from bokeh.layouts import column
from itertools import groupby
from operator import itemgetter

from IPython.core.display import display, HTML

output_notebook()

pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 6000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [11]:
colocations = pd.read_csv("../data/colocations.csv", index_col=0)
customer_details = pd.read_csv("../data/customer_details.csv", index_col=0)
deliveries = pd.read_csv("../data/deliveries.csv", index_col=0)
original_level_readings = pd.read_csv("/data/Linde_Intel_AI_Challenge_Nov2018/level_readings.csv")
level_readings = pd.read_csv("processed_level.csv", index_col=0)

  mask |= (ar1 == a)


## Convert to date and time

In [12]:
deliveries["del_date"] = pd.to_datetime(deliveries["DELIVERY_DATE"]).dt.date
deliveries["del_time"] = pd.to_datetime(deliveries["DELIVERY_DATE"]).dt.time
deliveries["DELIVERY_DATE"] = pd.to_datetime(deliveries["DELIVERY_DATE"])

level_readings["read_date"] = pd.to_datetime(level_readings["ON_DATE_TIME"]).dt.date
level_readings["read_time"] = pd.to_datetime(level_readings["ON_DATE_TIME"]).dt.time
level_readings["ON_DATE_TIME"] = pd.to_datetime(level_readings["ON_DATE_TIME"])

original_level_readings["read_date"] = pd.to_datetime(original_level_readings["ON_DATE_TIME"]).dt.date
original_level_readings["read_time"] = pd.to_datetime(original_level_readings["ON_DATE_TIME"]).dt.time
original_level_readings["ON_DATE_TIME"] = pd.to_datetime(original_level_readings["ON_DATE_TIME"])

# Co locations

| ... | ... | ... |
| --- | --- | --- |
|VESSEL_ID 1 |Identifier of primary storage vessel |VARCHAR|
|VESSEL_ID 2 - 7 |Identifier of any (up to seven) co-located storage vessels |VARCHAR|

In [4]:
colocations.head()

Unnamed: 0,VESSEL_ID_1,VESSEL_ID_2,VESSEL_ID_3,VESSEL_ID_4,VESSEL_ID_5,VESSEL_ID_6,VESSEL_ID_7
0,BR-218373,,,,,,
1,BR-216975,,,,,,
2,BR-216968,,,,,,
3,BR-217297,,,,,,
4,BR-217930,,,,,,


# Customer deets

| ... | ... | ...|
| --- | --- | --- |
|INST_ID                     | Installation Identifier                                         | INT      |
|CNTRY_UN_COUNTRY_CODE b     | Two-digit standard country identifier                           | CHAR (2) |
|PROVINCE                    |  Geographic location Province/State/County                      | VARCHAR  |
|PRD_ID                      | Abbreviation of Product Name                                    | CHAR (3) |
|MAXIMUM_PRODUCT_CAPACITY    | Maximum capacity of the storage vessel                          | INT (KG) |
|VESSEL_ID                   | Concatenation of Country identifier and Installation identifier | VARCHAR  |
|MKT_SEGMENT                 | Market Segment Classification                                   | VARCHAR  |
|ISIC_CODE (where available) |  International Standard Industrial Classification (ISIC) code   | INT      |
|ISIC_DESC                   | ISIC Code Description                                           | VARCHAR  |

In [5]:
customer_details.head()

Unnamed: 0,INST_ID,CNTRY_UN_COUNTRY_CODE,PROVINCE,PRD_ID,MAXIMUM_PRODUCT_CAPACITY,VESSEL_ID,MKT_SEGMENT,ISIC_CODE,ISIC_DESCRIPTION
0,216705,BR,PB,LOX,2195,BR-216705,Healthcare,8610.0,Hospital activities
1,216706,BR,SP,LOX,13075,BR-216706,Chemistry & Energy,3822.0,Treatment and disposal of hazardous wast
2,216721,BR,SP,LOX,3567,BR-216721,Manufacturing Industries,2910.0,Manufacture of motor vehicles
3,216729,BR,SP,LAR,13852,BR-216729,Metals & Glass,2610.0,Manufacture of electronic components and
4,216732,BR,PE,LOX,2195,BR-216732,Healthcare,8610.0,Hospital activities


# Deliveries

|...|...|...|
|--- | ---| ---|
|INST_ID |Installation Identifier |INT
|DELIVERY_DATE |Date/time of delivery |YYYY-MM-DD HH:MM
|DELIVERED_VOLUME | Amount of product delivered |INT (KG)

In [6]:
deliveries.head()

Unnamed: 0,DELIVERY_DATE,DELIVERED_VOLUME,VESSEL_ID,del_date,del_time
0,2016-03-13 01:34:00,12362,UK-3749,2016-03-13,01:34:00
1,2016-03-13 02:18:00,11564,UK-3806,2016-03-13,02:18:00
2,2016-03-13 06:48:00,3103,UK-41488,2016-03-13,06:48:00
3,2016-03-13 07:07:00,3683,UK-63168,2016-03-13,07:07:00
4,2016-03-13 07:24:00,6545,UK-2435,2016-03-13,07:24:00


# Level headings

|...|...|...|
|--- | ---| ---|
|INST_ID |Installation Identifier| INT
|ON_DATE_TIME| Date/time of reading| YYYY-MM-DD HH:MM
|INST_PRODUCT_AMOUNT |Instantaneous product level reading | INT (KG)

In [7]:
level_readings.head()

Unnamed: 0,COUNTRY_CODE,INST_ID,INST_PRODUCT_AMOUNT,ON_DATE_TIME,VESSEL_ID,read_date,read_time
0,MY,62,13870,2017-10-02 00:59:00,MY-62,2017-10-02,00:59:00
1,MY,62,18330,2017-10-02 01:59:00,MY-62,2017-10-02,01:59:00
2,MY,62,15367,2017-10-02 02:59:00,MY-62,2017-10-02,02:59:00
3,MY,62,13805,2017-10-02 03:59:00,MY-62,2017-10-02,03:59:00
4,MY,62,27870,2017-10-02 04:59:00,MY-62,2017-10-02,04:59:00


## Add sensor status feature - 0 if 4 consecutive readings are 0 when there is also a delivery else 1

In [8]:
def delivery_occurred(data, start_date, end_date):
    
    # get all data between start and end dates
    
    indices_above = np.where(deliveries["DELIVERY_DATE"] >= start_date)[0]
    indices_below = np.where(deliveries["DELIVERY_DATE"] < end_date)[0]
    indices_within = np.intersect1d(indices_above, indices_below)
    
    if sum(deliveries.loc[indices_within, "DELIVERED_VOLUME"]) > 0:
        return True
    else:
        return False
    

In [9]:
_status = np.ones(level_readings.shape[0])
failure_indices=[]

for ves_id, vessel_group in level_readings.groupby("VESSEL_ID"):
    zero_indices = vessel_group.index[np.where(vessel_group["INST_PRODUCT_AMOUNT"].values == 0)[0]]
    
    for k, g in groupby(enumerate(zero_indices), lambda x:x[0]-x[1]):
        group = list(map(itemgetter(1), g)) # group of continuous indices
        if len(group) > 4:
            for index in range(len(group)):
                if index + 4 < len(group):
#                     display(vessel_group.loc[group[index: index + 4], "ON_DATE_TIME"])
                    
                    if delivery_occurred(deliveries, vessel_group.loc[group[index], "ON_DATE_TIME"], vessel_group.loc[group[index+4], "ON_DATE_TIME"]):
#                         _status[group[index] : group[index+4]] = 0
                        failure_indices.append(group[index:index+4])
            
_status[failure_indices] = 0

level_readings["sensor_status"] = _status
print("DONE")

DONE


## Find the missing data

### 1. Dealing with fake 0s

In [13]:
while True:
    old_shape = level_readings.shape[0]
    indices_to_remove = []
    # group by vessel id
    for ves_id, vessel_group in level_readings.groupby("VESSEL_ID"):

        # sort the group
        sorted_vessel_group = vessel_group.sort_values(by=["ON_DATE_TIME"])

        # find max and min timestamps in the group
        sorted_max = sorted_vessel_group["ON_DATE_TIME"].max()
        sorted_min = sorted_vessel_group["ON_DATE_TIME"].min()

    #     print(sorted_vessel_group.dtypes)
    #     print(sorted_vessel_group["ON_DATE_TIME"].dt.minute.value_counts())

        zero_indices = sorted_vessel_group.index[np.where(sorted_vessel_group["ON_DATE_TIME"].dt.minute == 0)[0]]
        zero_indices_1 = zero_indices + 1
        zero_indices_0 = zero_indices - 1
        
        one_indices = sorted_vessel_group.index[np.where(sorted_vessel_group["ON_DATE_TIME"].dt.minute == 1)[0]]
        one_indices_1 = one_indices + 1
        one_indices_0 = one_indices - 1

    #     print(sorted_vessel_group.loc[zero_indices, "ON_DATE_TIME"].dt.minute.value_counts())
    #     print(sorted_vessel_group.loc[zero_indices_1, "ON_DATE_TIME"].dt.minute.value_counts())
    #     print(sorted_vessel_group.loc[zero_indices_0, "ON_DATE_TIME"].dt.minute.value_counts())

        # check where the plus one index and minus one index have timestamps with minute as 59
        # this means that the 0 value is wrong, hence delete
        plus_one_indices = sorted_vessel_group.loc[zero_indices_1][sorted_vessel_group.loc[zero_indices_1, "ON_DATE_TIME"].dt.minute == 59].index.values - 1
        minus_one_indices = sorted_vessel_group.loc[zero_indices_0][sorted_vessel_group.loc[zero_indices_0, "ON_DATE_TIME"].dt.minute == 59].index.values + 1
        
        # for the ones
#         plus_one_indices_one = sorted_vessel_group.loc[one_indices_1][sorted_vessel_group.loc[one_indices_1, "ON_DATE_TIME"].dt.minute == 59].index.values - 1
#         minus_one_indices_one = sorted_vessel_group.loc[one_indices_0][sorted_vessel_group.loc[one_indices_0, "ON_DATE_TIME"].dt.minute == 59].index.values + 1
        
        int_one = np.intersect1d(plus_one_indices_one, minus_one_indices_one).tolist()
#         int_zero = np.intersect1d(plus_one_indices, minus_one_indices).tolist()
        
#         indices_to_remove.append(np.intersect1d(int_one, int_zero).tolist())
        indices_to_remove.append(int_one)

    if len(indices_to_remove) > 0:
        level_readings.drop(index=sum(indices_to_remove,[]), inplace=True)
        level_readings.reset_index(drop=True, inplace=True)

    if old_shape == level_readings.shape[0]:
        break
    
#     print(plus_one_indices)
#     indices_within = np.where()
    
#     break
    
print("Done")

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


Done


In [11]:
# after_prep = level_readings.copy()

## Dealing with sharp changes in level readings

In [4]:
# need to check if there is no delivery and also if the level increases sharply. 
# then find the mean of the previous and next point
# level_readings = after_prep.copy()

for ves_id, vessel_group in level_readings.groupby("VESSEL_ID"):
    sorted_vessel_group = vessel_group.sort_values(by=["ON_DATE_TIME"])
    sorted_delivery_group = deliveries[deliveries["VESSEL_ID"] == ves_id].sort_values(by=["DELIVERY_DATE"])
    
    data_x = sorted_vessel_group["INST_PRODUCT_AMOUNT"].values

    data_x_plus = np.zeros(data_x.shape)
    data_x_plus[:-1] = data_x[1:]
    
    data_x_minus = np.zeros(data_x.shape)
    data_x_minus[1:] = data_x[1:]
  
    dist_x_minus_1 = abs(data_x - data_x_minus)
    dist_x_plus_1 = abs(data_x_plus - data_x)

    indices_minus = np.where(dist_x_minus_1 > 100)[0]
    indices_plus = np.where(dist_x_plus_1 > 100)[0]
    
    final_indices = sorted_vessel_group.index[np.unique(np.concatenate((indices_minus+1, indices_plus-1)))]
    final_indices_minus = final_indices - 1
    final_indices_plus = final_indices + 1

    values = (sorted_vessel_group.loc[final_indices_minus, "INST_PRODUCT_AMOUNT"].values + \
         sorted_vessel_group.loc[final_indices_plus, "INST_PRODUCT_AMOUNT"].values) / 2
    
    nan_indices = final_indices[np.where(pd.isnull(values) == True)]
    
    if len(nan_indices) > 0:
        values[nan_indices] = values[nan_indices] - 2
    
    level_readings.loc[final_indices, "INST_PRODUCT_AMOUNT"] = values
    
#     print(data_x.shape)
    
#     print(final_indices_minus.values)
#     print(final_indices_plus.values)
    
#     sorted_vessel_group.loc[final_indices, "INST_PRODUCT_AMOUNT"] = 
#     print(sorted_vessel_group.loc[final_indices_minus, "INST_PRODUCT_AMOUNT"].values)
#     print()
#     print(sorted_vessel_group.loc[final_indices_minus, "INST_PRODUCT_AMOUNT"].values)
#     print()
#     print(type(sorted_vessel_group.loc[final_indices_minus, "INST_PRODUCT_AMOUNT"].values[0]))
#     print(sorted_vessel_group.loc[final_indices_minus, "INST_PRODUCT_AMOUNT"].values + \
#          sorted_vessel_group.loc[final_indices_plus, "INST_PRODUCT_AMOUNT"].values)
#     print()   
#     print("ori last", data_x[-1])
#     print("ori second last", data_x[-2])
#     print("ori first", data_x[0])
#     print("ori second", data_x[1])
#     print("minus last", data_x_minus[-1])
#     print("minus first", data_x_minus[0])
#     print("minus second", data_x_minus[0])
#     print("plus second last", data_x_plus[-2])
#     print("plus last", data_x_plus[-1])
#     print("plus first", data_x_plus[0])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


IndexError: index 8376172 is out of bounds for axis 1 with size 133

In [34]:
level_readings.to_csv("processed_level.csv", sep=',')

## Need to clip all values above the max capacity of the vessel

In [6]:
# find all points where the level is higher than the max capacity of the vessel
# apply a threshold: If value is within treshold, then no clipping needed
#                    Else modify the value to the nearest value under the max capacity


level_readings = original_level_readings.copy()
for vessel_id, vessel_group in level_readings.groupby("VESSEL_ID"):
    sorted_vessel_group = vessel_group.sort_values(by=["ON_DATE_TIME"])
    threshold = 400
    max_capacity = customer_details.loc[np.where(vessel_id == customer_details["VESSEL_ID"])[0], "MAXIMUM_PRODUCT_CAPACITY"].values
    max_capacity = max_capacity + threshold
    
#     print(sorted_vessel_group["INST_PRODUCT_AMOUNT"].values > 10)
    
    indices = np.where(sorted_vessel_group["INST_PRODUCT_AMOUNT"].values > (max_capacity + threshold))[0]
    to_clip_indices = sorted_vessel_group.index[indices]
    
    for clip_index in to_clip_indices:
#         print(clip_index)
#         print((clip_index-3 in sorted_vessel_group.index.values))
#         print((clip_index-2 in sorted_vessel_group.index.values))
#         print((clip_index-1 in sorted_vessel_group.index.values))
        
        if clip_index-3 in sorted_vessel_group.index.values or clip_index-2 in sorted_vessel_group.index.values or \
           clip_index-1 in sorted_vessel_group.index.values:
            ind1 = clip_index - 1
            ind2 = clip_index - 2
            ind3 = clip_index - 3
        else:
            ind1 = clip_index + 1
            ind2 = clip_index + 2
            ind3 = clip_index + 3
        
#         print(clip_index == to_clip_indices[0])
        smallest = np.array([ind1,ind2,ind3])
        smallest_index = np.where(sorted_vessel_group.loc[clip_index, "INST_PRODUCT_AMOUNT"] > \
                    sorted_vessel_group.loc[smallest, "INST_PRODUCT_AMOUNT"] * 2)[0]
        
#         if not(smallest_distance):
#             smallest_distance = np.argmin(sorted_vessel_group.loc[clip_index, "INST_PRODUCT_AMOUNT"] - \
#                     sorted_vessel_group.loc[[ind1,ind2,ind3], "INST_PRODUCT_AMOUNT"] * 1.5)
        
#             level_readings.loc[clip_index, "INST_PRODUCT_AMOUNT"] = level_readings.loc[smallest_distance, "INST_PRODUCT_AMOUNT"]
# #             print(smallest_distance)
#         else:
#             print(smallest_distance)
        if len(smallest_index) > 0 and str(smallest[smallest_index[0]]) != "nan":
            find_smallest = np.argmin(smallest[smallest_index])
            level_readings.loc[clip_index, "INST_PRODUCT_AMOUNT"] = level_readings.loc[find_smallest, "INST_PRODUCT_AMOUNT"]
#         print("-")
print("DONE")

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


DONE


## Need to clip all the sudden zeros or large distances

In [5]:
level_readings.reset_index(drop=True, inplace=True)

In [5]:
level_readings = original_level_readings.copy()

for vessel_id, vessel_group in level_readings.groupby("VESSEL_ID"):
#     print(vessel_id)
    
    sorted_vessel_group = vessel_group.sort_values(by=["ON_DATE_TIME"])
    
    current_minus = [t - s for s, t in zip(sorted_vessel_group["INST_PRODUCT_AMOUNT"].values, sorted_vessel_group["INST_PRODUCT_AMOUNT"].values[1:])]
    next_minus = [t - s for s, t in zip(sorted_vessel_group["INST_PRODUCT_AMOUNT"].values[1:], sorted_vessel_group["INST_PRODUCT_AMOUNT"].values[2:])]
    
    minus_indices = sorted_vessel_group.index[1:][np.where(np.array(current_minus) > 100)[0]]
#     print(len(minus_indices))
    
    next_indices = sorted_vessel_group.index[2:][np.where(np.array(next_minus) > 100)[0]]
#     print(len(minus_indices))
    
    final_indices = np.intersect1d(minus_indices, next_indices)
    final_indices = np.intersect1d(final_indices, sorted_vessel_group.index.values)
    
    for index in final_indices:
        if index == sorted_vessel_group.index[-1]:
            mean = (sorted_vessel_group.loc[index, "INST_PRODUCT_AMOUNT"] + sorted_vessel_group.loc[index-1, "INST_PRODUCT_AMOUNT"]) / 2
        elif index+1 in sorted_vessel_group.index and index-1 in sorted_vessel_group.index:
            mean = (sorted_vessel_group.loc[index+1, "INST_PRODUCT_AMOUNT"] + sorted_vessel_group.loc[index-1, "INST_PRODUCT_AMOUNT"]) / 2
        else:
            mean = sorted_vessel_group.loc[index-4, "INST_PRODUCT_AMOUNT"]
            
        level_readings.loc[index, "INST_PRODUCT_AMOUNT"] = mean

print("DONE")

KeyboardInterrupt: 

In [61]:
minus_indices.values

array([8359383, 8359698, 8360031, 8360433, 8360584, 8360830, 8360831,
       8360832, 8361156, 8361319, 8361599, 8361791, 8361980, 8362063,
       8362340, 8362672, 8363113, 8363349, 8363692, 8363960, 8364279,
       8364545, 8364743])

In [62]:
next_indices.values

array([8359383, 8359698, 8360031, 8360433, 8360584, 8360830, 8360831,
       8360832, 8361156, 8361319, 8361599, 8361791, 8361980, 8362063,
       8362340, 8362672, 8363113, 8363349, 8363692, 8363960, 8364279,
       8364545, 8364743])

In [63]:
np.intersect1d(minus_indices, next_indices)

array([8359383, 8359698, 8360031, 8360433, 8360584, 8360830, 8360831,
       8360832, 8361156, 8361319, 8361599, 8361791, 8361980, 8362063,
       8362340, 8362672, 8363113, 8363349, 8363692, 8363960, 8364279,
       8364545, 8364743])

In [64]:
sorted_vessel_group.index[-1]

8364838

# Delivery and level reading based on vessel id

In [15]:

vessel_id = deliveries["VESSEL_ID"].unique()[np.random.randint(2347)]
# vessel_id = "CN-150090"
vessel_id = 'CN-136070'
# vessel_id = 'BR-218596'
vessel_id = 'BR-216705'
vessel_id = "CN-140691"
vessel_id = "CN-136070"
vessel_id = "CN-134330"
# vessel_id = "TH-23561"
print(vessel_id)

vessel_id_del_data = deliveries[deliveries["VESSEL_ID"] == vessel_id].sort_values(by=["DELIVERY_DATE"])
vessel_id_level_data = level_readings[level_readings["VESSEL_ID"] == vessel_id].sort_values(by=["ON_DATE_TIME"])
max_capacity = customer_details.loc[np.where(vessel_id == customer_details["VESSEL_ID"])[0], "MAXIMUM_PRODUCT_CAPACITY"]
# print(type(vessel_id_del_data["DELIVERY_DATE"].values[0]))
col_plot = column()

p = figure(plot_width=970, plot_height=600, title="Deliveries for Vessel ID: "+vessel_id, x_axis_type='datetime')

p.vbar(x=vessel_id_del_data["DELIVERY_DATE"], top=vessel_id_del_data["DELIVERED_VOLUME"], width=100)
# p.square(vessel_id_del_data["DELIVERY_DATE"], vessel_id_del_data["DELIVERED_VOLUME"], color='#A6CEE3', line_width=3)

col_plot.children.append(p)

p = figure(plot_width=970, plot_height=600, title="Level readings for Vessel ID: "+vessel_id, x_axis_type='datetime')

p.line(vessel_id_level_data["ON_DATE_TIME"], vessel_id_level_data["INST_PRODUCT_AMOUNT"], color='green', line_width=3)
p.line(vessel_id_level_data["ON_DATE_TIME"], np.ones(vessel_id_level_data["ON_DATE_TIME"].shape[0]) * max_capacity.values, color='red', line_width=4)
p.line(vessel_id_level_data["ON_DATE_TIME"], np.ones(vessel_id_level_data["ON_DATE_TIME"].shape[0]) * (max_capacity.values+threshold), color='purple', line_width=4)
p.square(vessel_id_level_data["ON_DATE_TIME"], vessel_id_level_data["INST_PRODUCT_AMOUNT"], color='green', line_width=3)

col_plot.children.append(p)

show(col_plot)

CN-134330


In [38]:
level_readings.to_csv("after_clipping.csv", sep=",")

In [9]:
len(level_readings["VESSEL_ID"].unique())

1810

## To check if all data points have unique timestamps

In [9]:
count = 0
for vessel_id in vessel_id_level_data["VESSEL_ID"].unique():
    # vessel_id_del_data = deliveries[deliveries["VESSEL_ID"] == vessel_id].sort_values(by=["del_date"])
    vessel_id_level_data = level_readings[level_readings["VESSEL_ID"] == vessel_id].sort_values(by=["read_date"])
    if len(vessel_id_level_data["ON_DATE_TIME"].unique()) == vessel_id_level_data.shape[0]:
        count = count + 1
        
if count == len(vessel_id_level_data["VESSEL_ID"].unique()):
    print("ALL UNIQUE")
else:
    print(NOT)

ALL UNIQUE


In [115]:
updated_lvl_readings = pd.read_csv("/data/Linde_Intel_AI_Challenge_Nov2018/level_readings.csv")

updated_lvl_readings["read_date"] = pd.to_datetime(updated_lvl_readings["ON_DATE_TIME"]).dt.date
updated_lvl_readings["read_time"] = pd.to_datetime(updated_lvl_readings["ON_DATE_TIME"]).dt.time
updated_lvl_readings["ON_DATE_TIME"] = pd.to_datetime(updated_lvl_readings["ON_DATE_TIME"])

## Do NOT USE

In [12]:
# new level_reading dataframe
updated_lvl_readings = level_readings.copy()
indices_to_remove = []
# group by vessel id
for ves_id, vessel_group in level_readings.groupby("VESSEL_ID"):
    # sort the group
    sorted_vessel_group = vessel_group.sort_values(by=["ON_DATE_TIME"])
    
    # find max and min timestamps in the group
    sorted_max = sorted_vessel_group["ON_DATE_TIME"].max()
    sorted_min = sorted_vessel_group["ON_DATE_TIME"].min()
    
    # subtract the max from the timestamps of the entire group
    # update in level readings
#     updated_lvl_readings.loc[sorted_vessel_group.index, "sub_time"] = sorted_max - sorted_vessel_group["ON_DATE_TIME"]
    times_with_max = sorted_max - sorted_vessel_group["ON_DATE_TIME"]
    times_with_max = times_with_max / pd.Timedelta('1 hour')
    times_with_max = times_with_max % 1
    
    times_with_min = sorted_vessel_group["ON_DATE_TIME"] - sorted_min
    times_with_min = times_with_min / pd.Timedelta('1 hour')
    times_with_min = times_with_min % 1
    
    remove_min_indices = times_with_min.index[np.where(times_with_min > 0)[0]]
    remove_max_indices = times_with_max.index[np.where(times_with_max > 0)[0]]
    
    indices_to_remove.append(np.unique(np.concatenate((remove_min_indices, remove_max_indices))).tolist())

print("Finished")

Finished


### Delete the indices

In [13]:
updated_lvl_readings.drop(index=sum(indices_to_remove, []), inplace=True)
updated_lvl_readings.reset_index(drop=True, inplace=True)
print("Completed")

Completed


In [None]:
upda

# VIZ

In [133]:
vessel_id = deliveries["VESSEL_ID"].unique()[np.random.randint(2347)]
vessel_id = "IN-19758"

vessel_id_del_data = deliveries[deliveries["VESSEL_ID"] == vessel_id].sort_values(by=["DELIVERY_DATE"])
original_vessel_id_level_data = level_readings[level_readings["VESSEL_ID"] == vessel_id].sort_values(by=["ON_DATE_TIME"])
vessel_id_level_data = updated_lvl_readings[updated_lvl_readings["VESSEL_ID"] == vessel_id].sort_values(by=["ON_DATE_TIME"])
max_capacity = customer_details.loc[np.where(vessel_id == customer_details["VESSEL_ID"])[0], "MAXIMUM_PRODUCT_CAPACITY"]

# print(type(vessel_id_del_data["DELIVERY_DATE"].values[0]))
col_plot = column()

p = figure(plot_width=970, plot_height=600, title="Deliveries for Vessel ID: "+vessel_id, x_axis_type='datetime')

p.vbar(x=vessel_id_del_data["DELIVERY_DATE"], top=vessel_id_del_data["DELIVERED_VOLUME"], width=100)
# p.square(vessel_id_del_data["DELIVERY_DATE"], vessel_id_del_data["DELIVERED_VOLUME"], color='#A6CEE3', line_width=3)

col_plot.children.append(p)
# print(vessel_id_level_data)
p = figure(plot_width=970, plot_height=600, title="Level readings for Vessel ID: "+vessel_id, x_axis_type='datetime')

p.line(vessel_id_level_data["ON_DATE_TIME"], vessel_id_level_data["INST_PRODUCT_AMOUNT"], color='green', line_width=1.5)
p.line(vessel_id_level_data["ON_DATE_TIME"], max_capacity, color='green', line_width=1.5)
p.square(vessel_id_level_data["ON_DATE_TIME"], vessel_id_level_data["INST_PRODUCT_AMOUNT"], color='green', line_width=3)

col_plot.children.append(p)

show(col_plot)



# Write the modified csv files

In [101]:
colocations.to_csv("../data/colocations.csv", sep=',')
customer_details.to_csv("../data/customer_details.csv", sep=',')
deliveries.to_csv("../data/deliveries.csv", sep=',')
level_readings.to_csv("../data/level_readings.csv", sep=',')