In [1]:
import torch
import transformers
import datasets


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hub identifier of the dataset to load.
hf_dataset_name = "knkarthick/dialogsum"

# Load the dataset; the bare trailing expression displays the result
# (a DatasetDict with its splits) as the cell output.
dataset = datasets.load_dataset(path=hf_dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [9]:
# Batched filter driven purely by row position: the lambda receives the batch
# (unused) and the row indexes, and returns one keep/drop flag per row so that
# only rows whose index is a multiple of 100 survive.
filtered_dataset = dataset.filter(
    lambda _batch, row_ids: [row_id % 100 == 0 for row_id in row_ids],
    with_indices=True,
    batched=True,
)
filtered_dataset

Filter: 100%|██████████| 12460/12460 [00:00<00:00, 369435.10 examples/s]
Filter: 100%|██████████| 500/500 [00:00<00:00, 19186.94 examples/s]
Filter: 100%|██████████| 1500/1500 [00:00<00:00, 49901.69 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 5
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 15
    })
})

In [20]:
# Batched filter combining a content test with a positional test: a row is
# kept only when its dialogue is longer than 200 characters AND its index is
# a multiple of 10.
filtered_dataset = dataset.filter(
    lambda batch, row_ids: [
        row_id % 10 == 0 and len(text) > 200
        for text, row_id in zip(batch['dialogue'], row_ids)
    ],
    with_indices=True,
    batched=True,
)
filtered_dataset

Filter: 100%|██████████| 12460/12460 [00:00<00:00, 163486.86 examples/s]
Filter: 100%|██████████| 500/500 [00:00<00:00, 18512.01 examples/s]
Filter: 100%|██████████| 1500/1500 [00:00<00:00, 41668.31 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1244
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 50
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 150
    })
})

In [42]:
# Batched map: add a 'dialogue_upper' column containing the upper-cased text
# of every dialogue in the batch.
map_dataset = dataset.map(
    lambda batch: {"dialogue_upper": [text.upper() for text in batch['dialogue']]},
    batched=True,
)

# Print the original and the transformed text of the first training row.
first_mapped_row = map_dataset['train'][0]
print(first_mapped_row['dialogue'])
print(first_mapped_row['dialogue_upper'])

#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?
#Person2#: I found it would be a good idea to get a check-up.
#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.
#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?
#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.
#Person2#: Ok.
#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?
#Person2#: Yes.
#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.
#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.
#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.
#Person2#: Ok, thanks doctor.
#PERSON1#: HI, MR. SMITH. I'M DOCTOR HAWKINS. WHY ARE YOU HERE TODAY

In [51]:
# Per-example map: 'short' holds the first 50 characters of the dialogue.
new_dataset = dataset.map(lambda example: {'short': example['dialogue'][:50]})
row_200 = new_dataset['train'][200]
print(row_200['dialogue'])
print('-' * 99)  # 99-dash separator (50 dashes joined by 49 dashes)
print(row_200['short'])

# Batched map building the same 'short' column from a whole batch at a time.
new_dataset_batched = dataset.map(
    lambda batch: {'short': [text[:50] for text in batch['dialogue']]},
    batched=True,
)
row_200_batched = new_dataset_batched['train'][200]
print(row_200_batched['dialogue'])
print('-' * 99)
print(row_200_batched['short'])

#Person1#: What do you want to know about me?
#Person2#: How about your academic records at college?
#Person1#: The average grade of all my courses is above 85.
#Person2#: In which subject did you get the highest marks?
#Person1#: In mathematics I got a 98.
#Person2#: Have you received any scholarships?
#Person1#: Yes, I have, and three times in total.
#Person2#: Have you been a class leader?
#Person1#: I have been a class commissary in charge of studies for two years.
#Person2#: Did you join in any club activities?
#Person1#: I was an aerobics team member in college.
#Person2#: What sport are you good at?
#Person1#: I am good at sprint and table tennis.
#Person2#: You are excellent.
---------------------------------------------------------------------------------------------------
#Person1#: What do you want to know about me?
#Per
#Person1#: What do you want to know about me?
#Person2#: How about your academic records at college?
#Person1#: The average grade of all my courses is above

In [57]:
# Per-example filter: the lambda gets one row (unused) plus its index and
# returns a single flag; rows whose index is a multiple of 10 are kept.
new_dataset_filter = new_dataset.filter(
    lambda _example, row_id: row_id % 10 == 0,
    with_indices=True,
)
print(new_dataset_filter)

# Batched variant of the same selection: the lambda gets the batch (unused)
# plus a list of indexes and returns one flag per row.
new_dataset_filter_batched = new_dataset.filter(
    lambda _batch, row_ids: [row_id % 10 == 0 for row_id in row_ids],
    with_indices=True,
    batched=True,
)
print(new_dataset_filter_batched)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'short'],
        num_rows: 1246
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'short'],
        num_rows: 50
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'short'],
        num_rows: 150
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'short'],
        num_rows: 1246
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'short'],
        num_rows: 50
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'short'],
        num_rows: 150
    })
})


In [59]:
# Identifier of the pretrained checkpoint whose tokenizer is used below.
model_name = "google/flan-t5-base"
# Let AutoTokenizer resolve and load the tokenizer for that checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [64]:
# Per-example map: run the tokenizer on each dialogue and store its output in
# a new 'dialogue_tok' column (the token ids live under its 'input_ids' key).
tokenized_dataset = dataset.map(
    lambda example: dict(dialogue_tok=tokenizer(example['dialogue']))
)
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 1500
    })
})


In [76]:
# Inspect the first training row: its token ids, the text obtained by decoding
# those ids, and the raw dialogue for comparison.
first_tokenized_row = tokenized_dataset['train'][0]
first_row_input_ids = first_tokenized_row['dialogue_tok']['input_ids']
print(first_row_input_ids)
print(tokenizer.decode(first_row_input_ids))
print(first_tokenized_row['dialogue'])


[1713, 345, 13515, 536, 4663, 10, 2018, 6, 1363, 5, 3931, 5, 27, 31, 51, 7582, 12833, 77, 7, 5, 1615, 33, 25, 270, 469, 58, 1713, 345, 13515, 357, 4663, 10, 27, 435, 34, 133, 36, 3, 9, 207, 800, 12, 129, 3, 9, 691, 18, 413, 5, 1713, 345, 13515, 536, 4663, 10, 2163, 6, 168, 6, 25, 43, 29, 31, 17, 141, 80, 21, 305, 203, 5, 148, 225, 43, 80, 334, 215, 5, 1713, 345, 13515, 357, 4663, 10, 27, 214, 5, 27, 2320, 38, 307, 38, 132, 19, 1327, 1786, 6, 572, 281, 217, 8, 2472, 58, 1713, 345, 13515, 536, 4663, 10, 1548, 6, 8, 200, 194, 12, 1792, 2261, 21154, 19, 12, 253, 91, 81, 135, 778, 5, 264, 653, 12, 369, 44, 709, 728, 3, 9, 215, 21, 39, 293, 207, 5, 1713, 345, 13515, 357, 4663, 10, 8872, 5, 1713, 345, 13515, 536, 4663, 10, 1563, 140, 217, 270, 5, 696, 2053, 11, 11581, 320, 1399, 5, 2321, 3, 9, 1659, 6522, 6, 754, 5, 531, 25, 7269, 6, 1363, 5, 3931, 58, 1713, 345, 13515, 357, 4663, 10, 2163, 5, 1713, 345, 13515, 536, 4663, 10, 14627, 53, 19, 8, 1374, 1137, 13, 5084, 1874, 11, 842, 1994, 6, 25,

In [92]:
# Keep only the rows that survive a tokenize/decode round trip unchanged:
# the dialogue with newlines replaced by spaces must equal the decoded token
# ids (special tokens skipped).
dataset_filter = tokenized_dataset.filter(
    lambda example: (
        tokenizer.decode(
            example['dialogue_tok']['input_ids'],
            skip_special_tokens=True,
        )
        == example['dialogue'].replace('\n', ' ')
    )
)

# Row counts after and before filtering, printed one after the other.
print(dataset_filter)
print(tokenized_dataset)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter: 100%|██████████| 12460/12460 [00:15<00:00, 817.93 examples/s]
Filter: 100%|██████████| 500/500 [00:00<00:00, 895.37 examples/s]
Filter: 100%|██████████| 1500/1500 [00:01<00:00, 779.69 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 10845
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 425
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 1302
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 1500
    })
})





In [78]:
# Batched map: the lambda receives a batch of rows and tokenizes each dialogue
# individually, producing one tokenizer output per row for 'dialogue_tok'.
tok_dataset = dataset.map(
    lambda batch: {"dialogue_tok": [tokenizer(text) for text in batch['dialogue']]},
    batched=True,
)

Map: 100%|██████████| 12460/12460 [00:06<00:00, 1979.52 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 1769.92 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 2168.97 examples/s]


In [87]:
print(tok_dataset)

# Round-trip check on the first training row: flatten newlines to spaces and
# compare with the decoded token ids.
flat_first_dialogue = dataset['train'][0]['dialogue'].replace('\n', ' ')
first_tok_ids = tok_dataset['train'][0]['dialogue_tok']['input_ids']
print(flat_first_dialogue)
print(tokenizer.decode(first_tok_ids))
# Bare expression: the cell output is the boolean result of the comparison
# (this time decoding with special tokens skipped).
flat_first_dialogue == tokenizer.decode(first_tok_ids, skip_special_tokens=True)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 1500
    })
})
#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today? #Person2#: I found it would be a good idea to get a check-up. #Person1#: Yes, well, you haven't had one for 5 years. You should have one every year. #Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor? #Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good. #Person2#: Ok. #Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith? #Person2#: Yes. #Person1#: S

True

In [93]:
# Batched filter that keeps the MISMATCHES: rows whose dialogue (newlines
# replaced by spaces) differs from the text decoded from its token ids
# (special tokens skipped).
tok_dataset_filter = tok_dataset.filter(
    lambda batch: [
        text.replace('\n', ' ')
        != tokenizer.decode(encoded['input_ids'], skip_special_tokens=True)
        for text, encoded in zip(batch['dialogue'], batch['dialogue_tok'])
    ],
    batched=True,
)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter: 100%|██████████| 12460/12460 [00:15<00:00, 783.91 examples/s]
Filter: 100%|██████████| 500/500 [00:00<00:00, 898.93 examples/s]
Filter: 100%|██████████| 1500/1500 [00:02<00:00, 743.48 examples/s]


In [100]:
print(tok_dataset_filter)

# Look at the first retained training row three ways: raw dialogue, dialogue
# with newlines replaced by spaces, and the text decoded from its token ids.
first_filtered_row = tok_dataset_filter['train'][0]
raw_dialogue = first_filtered_row['dialogue']
print(raw_dialogue)
print(raw_dialogue.replace('\n', ' '))
print(
    tokenizer.decode(
        first_filtered_row['dialogue_tok']['input_ids'],
        skip_special_tokens=True,
    )
)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 1615
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 75
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 198
    })
})
#Person1#: Why didn't you tell me you had a girlfriend? 
#Person2#: Sorry, I thought you knew. 
#Person1#: But you should have told me you were in love with her. 
#Person2#: Didn't I? 
#Person1#: You know you didn't. 
#Person2#: Well, I'm telling you now. 
#Person1#: Yes, but you might have told me before. 
#Person2#: I didn't think you'd be interested. 
#Person1#: You can't be serious. How dare you not tell me you were going to marry her? 
#Person2#: Sorry, I didn't think it mattered. 
#Person1#: Oh, you men! You're all the same. 
#Person1#: Why didn't you tell me you had a girlfriend?  #Person2#: S

In [91]:
# Print the unfiltered and filtered DatasetDicts back to back to compare
# their row counts.
# NOTE(review): this cell's execution count (91) is lower than that of the
# cell defining tok_dataset_filter (93), so the recorded output may be stale;
# re-run top to bottom to refresh it.
print(tok_dataset, tok_dataset_filter, sep='\n')

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 1500
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 10845
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 425
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'dialogue_tok'],
        num_rows: 1302
    })
})
