# load data

In [1]:
import os
import pandas as pd

In [2]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
# Dataset settings.
# Root directory that contains the prepared dataset folders.
DATASET_PATH = "dataset-ifttt-zenodo"
# Experiment name; selects which split files get_dataset_path() resolves.
EXPERIMENT = "chen"
# Number of rows to sample from every split; None loads each split in full.
DATA_NUM = None

In [4]:
def get_dataset_path(root=DATASET_PATH, exp=EXPERIMENT):
    """Return the pickle file path of every dataset split for an experiment.

    Args:
        root: Directory that contains the prepared dataset folders.
        exp: Experiment name. Only "chen" is currently supported.

    Returns:
        dict mapping the split names "train", "val", "gold" and "noisy" to
        the path of the corresponding pickle file below ``root``.

    Raises:
        ValueError: If ``exp`` is not a supported experiment name.
    """
    # Location of each split relative to `root`, keyed by experiment name.
    split_files = {
        "chen": {
            "train": "ready-train-chen-only/train-chen.pkl",
            "val": "ready-train-val-noisy/validation-noisy.pkl",
            "gold": "ready-test-clean/test_gold_clean.pkl",
            "noisy": "ready-test-clean/test_intel_clean.pkl",
        },
    }
    if exp not in split_files:
        # Previously an unknown experiment fell through to the return statement
        # and failed with an UnboundLocalError on the unassigned path variables.
        raise ValueError(
            f"Unknown experiment {exp!r}; expected one of {list(split_files)}"
        )
    return {split: os.path.join(root, rel_path)
            for split, rel_path in split_files[exp].items()}

# Resolve the split paths once, using the function defaults
# (root=DATASET_PATH, exp=EXPERIMENT).
path_dict = get_dataset_path()

In [5]:
def load_dataset(path_dict=path_dict, number=None):
    """Load every split listed in ``path_dict`` with ``pd.read_pickle``.

    Args:
        path_dict: dict mapping a split name to the path of a pickle file.
        number: Optional number of rows to sample from each split, using the
            fixed seed 1234. A falsy value (None or 0) loads every split in
            full. A split with fewer than ``number`` rows is returned with
            all of its rows (in sampled order) instead of raising.

    Returns:
        dict with the same keys as ``path_dict`` whose values are the loaded
        (and optionally sub-sampled) objects.

    Raises:
        AssertionError: If ``path_dict`` is not a dict.
    """
    assert isinstance(path_dict, dict), "path_dict must be a dict"
    df_dict = {}
    for split, path in path_dict.items():
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        df = pd.read_pickle(path)
        if number:
            # Cap the sample size: sampling without replacement raises a
            # ValueError when n exceeds the number of rows, which would break
            # on any split smaller than `number`.
            df = df.sample(n=min(number, len(df)), random_state=1234)
        df_dict[split] = df
    return df_dict

# load_dataset() treats a falsy `number` (None or 0) as "load everything",
# so DATA_NUM can be forwarded without branching on it here.
df_dict = load_dataset(number=DATA_NUM)

# Print a summary (row count, columns, dtypes) of the training split.
df_dict['train'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45003 entries, 0 to 45005
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  45003 non-null  object
 1   target  45003 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


# add prefix

In [6]:
# Task prefixes keyed by mode name; each value is the literal text that
# add_prefix() puts in front of a source sentence (note the trailing space).
prefix_dict = {
    "CHANNEL_ONLY": "GENERATE TRIGGER AND ACTION CHANNEL ONLY <pf> ",
    "FULL": "GENERATE BOTH CHANNEL AND FUNCTION FOR TRIGGER AND ACTION <pf> ",
}

In [7]:
import copy

In [8]:
def add_prefix(input_dict, mode, prefix_dict):
    """Return a prefixed deep copy of every DataFrame in ``input_dict``.

    The prefix registered for ``mode`` is prepended to each value of the
    ``source`` column. In "CHANNEL_ONLY" mode the ``target`` column is also
    reduced to its first and third "<sep>"-separated fields, joined by
    " <sep> ". The input frames are left unmodified.

    Args:
        input_dict: dict mapping a split name to a DataFrame that has string
            ``source`` and ``target`` columns.
        mode: Key into ``prefix_dict``; "CHANNEL_ONLY" additionally shortens
            the targets.
        prefix_dict: dict mapping a mode name to its prefix string.

    Returns:
        A new dict with the same keys holding the modified copies.

    Raises:
        ValueError: If ``mode`` is not a key of ``prefix_dict``.
        IndexError: In "CHANNEL_ONLY" mode, if a target has fewer than three
            "<sep>"-separated fields.
    """
    if mode not in prefix_dict:
        # Fail early with a clear message instead of a `None + str` TypeError
        # raised from inside apply() (or no error at all on empty frames).
        raise ValueError(
            f"Unknown mode {mode!r}; expected one of {list(prefix_dict)}"
        )
    # Look the prefix up once rather than once per row.
    prefix = prefix_dict[mode]

    def _channels_only(target):
        # Keep fields 0 and 2 of the target, e.g.
        # "A <sep> A.f <sep> B <sep> B.g" -> "A <sep> B".
        parts = target.split("<sep>")
        return parts[0].strip() + " <sep> " + parts[2].strip()

    input_dict_cp = copy.deepcopy(input_dict)
    for key, df in input_dict_cp.items():
        df['source'] = df.source.apply(lambda x: prefix + x)
        if mode == "CHANNEL_ONLY":
            df['target'] = df.target.apply(_channels_only)
    return input_dict_cp

In [9]:
# Build one prefixed copy of every split per mode:
# temp_dict0 holds the channel-only variant, temp_dict1 the full variant.
temp_dict0 = add_prefix(input_dict=df_dict, mode="CHANNEL_ONLY", prefix_dict=prefix_dict)
temp_dict1 = add_prefix(input_dict=df_dict, mode="FULL", prefix_dict=prefix_dict)

In [15]:
# Spot-check three rows of the channel-only gold split (fixed seed, so the
# same rows are shown on every run), displayed as nested lists.
temp_dict0['gold'].sample(n=3, random_state=123).values.tolist()

[['GENERATE TRIGGER AND ACTION CHANNEL ONLY <pf> pinboard . in @post items to buffer for distribution to social networks',
  'Pinboard <sep> Buffer'],
 ['GENERATE TRIGGER AND ACTION CHANNEL ONLY <pf> add a new wordpress blog , post a link on twitter',
  'WordPress <sep> Twitter'],
 ['GENERATE TRIGGER AND ACTION CHANNEL ONLY <pf> save the nasa image of the day to your ios photos',
  'Space <sep> iOS_Photos']]

In [12]:
# Write every split of the channel-only variant to its own output folder.
OUTPUT_DIR = "dataset-w-prefix_ch"
# exist_ok=True keeps the cell re-runnable and avoids the race between a
# separate os.path.exists() check and the directory creation.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for key, item in temp_dict0.items():
    # One pickle per split, named after the split key.
    item.to_pickle(os.path.join(OUTPUT_DIR, f"{key}.pkl"))

In [13]:
# Spot-check three rows of the full-variant training split (fixed seed, so
# the same rows are shown on every run), displayed as nested lists.
temp_dict1['train'].sample(n=3, random_state=123).values.tolist()

[['GENERATE BOTH CHANNEL AND FUNCTION FOR TRIGGER AND ACTION <pf> sends me an email when weather is rainy tommorow . in c [ for europe use only ]',
  "Weather <sep> Weather.Tomorrow's_forecast_calls_for <sep> Email <sep> Email.Send_me_an_email"],
 ['GENERATE BOTH CHANNEL AND FUNCTION FOR TRIGGER AND ACTION <pf> when @mel66 tweets about # ppc save to pocket',
  'Feed <sep> Feed.New_feed_item <sep> Pocket <sep> Pocket.Save_for_later'],
 ["GENERATE BOTH CHANNEL AND FUNCTION FOR TRIGGER AND ACTION <pf> thanks for the twitter follow , here ' s when you followed me !",
  'Gmail <sep> Gmail.New_email_in_inbox_labeled <sep> Twitter <sep> Twitter.Post_a_tweet']]

In [14]:
# Write every split of the full (channel + function) variant to its own folder.
OUTPUT_DIR = "dataset-w-prefix_fc"
# exist_ok=True keeps the cell re-runnable and avoids the race between a
# separate os.path.exists() check and the directory creation.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for key, item in temp_dict1.items():
    # One pickle per split, named after the split key.
    item.to_pickle(os.path.join(OUTPUT_DIR, f"{key}.pkl"))

In [66]:
# Stack the channel-only rows and the full rows of each split into one frame
# (channel-only first). NOTE: pd.concat keeps each input's own index here, so
# index labels are repeated across the two halves of the merged frame.
merge_dict = {
    split: pd.concat([temp_dict0[split], temp_dict1[split]])
    for split in temp_dict0
}

In [67]:
# Display the merged training split: channel-only rows followed by full rows.
merge_dict['train']

Unnamed: 0,source,target
0,GENERATE TRIGGER AND ACTION CHANNEL ONLY <pf> ...,IFTTT <sep> Twitter
1,GENERATE TRIGGER AND ACTION CHANNEL ONLY <pf> ...,Phone_Call <sep> Gmail
2,GENERATE TRIGGER AND ACTION CHANNEL ONLY <pf> ...,Facebook <sep> Dropbox
3,GENERATE TRIGGER AND ACTION CHANNEL ONLY <pf> ...,Twitter <sep> Email
4,GENERATE TRIGGER AND ACTION CHANNEL ONLY <pf> ...,Flickr <sep> Twitter
...,...,...
45001,GENERATE BOTH CHANNEL AND FUNCTION FOR TRIGGER...,Facebook <sep> Facebook.New_status_message_by_...
45002,GENERATE BOTH CHANNEL AND FUNCTION FOR TRIGGER...,Facebook_Pages <sep> Facebook_Pages.New_photo_...
45003,GENERATE BOTH CHANNEL AND FUNCTION FOR TRIGGER...,Facebook_Pages <sep> Facebook_Pages.New_link_p...
45004,GENERATE BOTH CHANNEL AND FUNCTION FOR TRIGGER...,Facebook_Pages <sep> Facebook_Pages.New_status...


In [68]:
# Write every split of the merged (channel-only + full) dataset to its own folder.
OUTPUT_DIR = "dataset-w-prefix"
# exist_ok=True keeps the cell re-runnable and avoids the race between a
# separate os.path.exists() check and the directory creation.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for key, item in merge_dict.items():
    # One pickle per split, named after the split key.
    item.to_pickle(os.path.join(OUTPUT_DIR, f"{key}.pkl"))