the original notebook is from hugging face colab notebook here (https://colab.research.google.com/github/sanchit-gandhi/notebooks/blob/main/fine_tune_whisper.ipynb)


Make sure you have the following dependencies installed in your environment

```
pip install datasets transformers evaulate jiwer
```

the common voice dataset is coming from here (https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/viewer/zh-HK/train?p=84)

It is part of mozilla foundation common voice project

----

We could use this format to prepare our own dataset to fine tune our version of whisper

----

If you want to re-use / avoid to download the voice file every time, you can un-comment the part which specify `cache_dir` and point it to the directory you want those file to be downloaded / already downloaded.

In [4]:
# import sys
# sys.path.append(datasets_dir)

# before downloading any new dataset, 
# make sure to check if it needs to Check and Agrees to the terms first, otherwise the download would fail

from datasets import load_dataset, DatasetDict

dataset_name = "mozilla-foundation/common_voice_16_0"
language_to_train = 'yue'

common_voice = DatasetDict()
common_voice["train"] = load_dataset(
  dataset_name, language_to_train, 
  split="train+validation",
  cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/datasets"
  )

common_voice["test"] = load_dataset(
  dataset_name, language_to_train, 
  split="test",  
  cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/datasets"
  )

print(common_voice)

Downloading builder script: 100%|██████████| 8.17k/8.17k [00:00<00:00, 8.21MB/s]
Downloading readme: 100%|██████████| 12.3k/12.3k [00:00<00:00, 14.3MB/s]
Downloading extra modules: 100%|██████████| 3.74k/3.74k [00:00<00:00, 25.9MB/s]
Downloading extra modules: 100%|██████████| 77.3k/77.3k [00:00<00:00, 1.03MB/s]
Downloading data: 100%|██████████| 14.6k/14.6k [00:00<00:00, 35.2MB/s]
Downloading data: 100%|██████████| 78.7M/78.7M [00:02<00:00, 31.9MB/s]
Downloading data: 100%|██████████| 66.4M/66.4M [00:03<00:00, 18.9MB/s]
Downloading data: 100%|██████████| 72.0M/72.0M [00:01<00:00, 46.7MB/s]
Downloading data: 100%|██████████| 994M/994M [01:00<00:00, 16.5MB/s]]
Downloading data: 100%|██████████| 962M/962M [01:16<00:00, 12.5MB/s]
Downloading data: 100%|██████████| 939M/939M [01:31<00:00, 10.2MB/s]
Downloading data: 100%|██████████| 429M/429M [00:30<00:00, 14.3MB/s]
Downloading data: 100%|██████████| 48.2M/48.2M [00:01<00:00, 26.3MB/s]
Downloading data files: 100%|██████████| 5/5 [04:37<00

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 5644
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 2581
    })
})


In [2]:
# !pip install "tokenizers>=0.14,<0.15"

from transformers import WhisperFeatureExtractor
feature_extractor = WhisperFeatureExtractor.from_pretrained(
  "openai/whisper-small", 
  cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/feature"
  ) # start with the whisper small checkout

preprocessor_config.json: 100%|██████████| 185k/185k [00:00<00:00, 1.30MB/s]


In [3]:
from transformers import WhisperTokenizer
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", 
language="cantonese", 
task="transcribe",
cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/tokenizer"
)

In [4]:
from transformers import WhisperProcessor
processor = WhisperProcessor.from_pretrained("openai/whisper-small", 
language="cantonese", 
task="transcribe",
cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/processor"
)

In [5]:
# Preparing Data

# Whisper expecting the audio to be at sampling rate @16000 - this is just to make sure the sampling rate fits whisper's training
# Since our input audio is sampled at 48kHz, we need to downsample it to 16kHz prior to passing it to the Whisper feature extractor, 
# 16kHz being the sampling rate expected by the Whisper model.
from datasets import Audio
raw_common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

print(raw_common_voice["train"][0])

{'client_id': '2ecfe4e00a829397a04e316949bf3058c9ed72b0da9fad2686b0bc3bd98654d8a586e878cc3aa7609bf0359f56e24b3bc0f6f1ec4d1ec958e569bbaaf742560b', 'path': '/Volumes/BACKUP/Coding/HUGGING_FACE/datasets/downloads/extracted/5f8c376b62cbcec81f092e38c43f1519f67645668f8044d9b7c5a51c4297c524/yue_train_0/common_voice_yue_31210647.mp3', 'audio': {'path': '/Volumes/BACKUP/Coding/HUGGING_FACE/datasets/downloads/extracted/5f8c376b62cbcec81f092e38c43f1519f67645668f8044d9b7c5a51c4297c524/yue_train_0/common_voice_yue_31210647.mp3', 'array': array([ 1.45519152e-10,  4.36557457e-11,  4.36557457e-11, ...,
       -2.06303957e-06, -1.26592931e-06,  1.36844028e-06]), 'sampling_rate': 16000}, 'sentence': '睇內容長短嘅', 'up_votes': 4, 'down_votes': 0, 'age': 'teens', 'gender': 'male', 'accent': '香港粵語', 'locale': 'yue', 'segment': '', 'variant': ''}


prepare the dataset
doing the encoding -> preparing the vector

In [6]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch



finalized_common_voice = raw_common_voice.map(prepare_dataset, 
  remove_columns=raw_common_voice.column_names["train"], 
  num_proc=2)
print(finalized_common_voice)

# Initialize the accelerator
# accelerator = Accelerator()

# Move the model and dataset to the device
# model, common_voice = accelerator.prepare(model, finalized_common_voice)

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 5636
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2565
    })
})


the following is the actual training and evaluation of the model

using the trainer provided by huggingface

Evaluation metrics: during evaluation, we want to evaluate the model using the word error rate (WER) metric. We need to define a compute_metrics function that handles this computation.

Load a pre-trained checkpoint: we need to load a pre-trained checkpoint and configure it correctly for training.

Define the training configuration: this will be used by the 🤗 Trainer to define the training schedule.

In [7]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [8]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

Evaluation using hugging face metric - WER (Word error rate)

In [16]:
#!pip install evaluate

Collecting evaluate
  Using cached evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting responses<0.19 (from evaluate)
  Using cached responses-0.18.0-py3-none-any.whl (38 kB)
Using cached evaluate-0.4.1-py3-none-any.whl (84 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [9]:
import evaluate

metric = evaluate.load("wer")


def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [10]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(
  "openai/whisper-small", 
  # cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/models"
  )

model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

What should be the training

In [None]:
# this is a nice youtube video guide / introduction for how to use tensorboard (https://www.youtube.com/watch?v=VJW9wU-1n18&t=4s)
!pip install tensorboard

In [11]:
import wandb

wandb.init(project="language-x-change")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpoppysmic[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
from transformers import Seq2SeqTrainingArguments
import datetime
# from accelerate import Accelerator

now = datetime.datetime.now().strftime("%d-%m-%Y-%H%M")

training_args = Seq2SeqTrainingArguments(
    output_dir="model/whisper-small-cantonese_"+now,  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=500,
    gradient_checkpointing=True,
    fp16=False,  # if we are not using CUDA or non graphics card, use fp16=false
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    logging_steps=25,
    report_to=["tensorboard","wandb"], #this would requires the tensorboardx to be installed
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=finalized_common_voice["train"],
    eval_dataset=finalized_common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
    # checkpoint_activations=True
)

processor.save_pretrained(training_args.output_dir)

The actual Training Part

In [13]:
trainer.train()



`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
  5%|▌         | 25/500 [02:42<49:22,  6.24s/it] 

{'loss': 2.0521, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.07}


 10%|█         | 50/500 [05:18<46:54,  6.25s/it]

{'loss': 1.6707, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.14}


 15%|█▌        | 75/500 [07:55<44:45,  6.32s/it]

{'loss': 0.995, 'learning_rate': 1.5e-06, 'epoch': 0.21}


 20%|██        | 100/500 [10:32<41:39,  6.25s/it]

{'loss': 0.3957, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.28}


                                                 
 20%|██        | 100/500 [31:47<41:39,  6.25s/it]

{'eval_loss': 0.25670018792152405, 'eval_wer': 356.92307692307696, 'eval_runtime': 1274.5735, 'eval_samples_per_second': 2.012, 'eval_steps_per_second': 0.252, 'epoch': 0.28}


 25%|██▌       | 125/500 [34:29<38:57,  6.23s/it]    

{'loss': 0.193, 'learning_rate': 2.5e-06, 'epoch': 0.35}


 30%|███       | 150/500 [37:03<35:56,  6.16s/it]

{'loss': 0.157, 'learning_rate': 3e-06, 'epoch': 0.42}


 35%|███▌      | 175/500 [39:40<35:42,  6.59s/it]

{'loss': 0.1427, 'learning_rate': 3.5e-06, 'epoch': 0.5}


 40%|████      | 200/500 [42:13<30:51,  6.17s/it]

{'loss': 0.1412, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.57}


                                                 
 40%|████      | 200/500 [1:01:06<30:51,  6.17s/it]

{'eval_loss': 0.17312493920326233, 'eval_wer': 301.8076923076923, 'eval_runtime': 1132.7636, 'eval_samples_per_second': 2.264, 'eval_steps_per_second': 0.283, 'epoch': 0.57}


 45%|████▌     | 225/500 [1:03:54<28:40,  6.26s/it]    

{'loss': 0.1466, 'learning_rate': 4.5e-06, 'epoch': 0.64}


 50%|█████     | 250/500 [1:18:44<2:16:47, 32.83s/it]  

{'loss': 0.1434, 'learning_rate': 5e-06, 'epoch': 0.71}


 55%|█████▌    | 275/500 [1:36:23<23:24,  6.24s/it]    

{'loss': 0.1218, 'learning_rate': 5.500000000000001e-06, 'epoch': 0.78}


 60%|██████    | 300/500 [1:39:22<41:47, 12.54s/it]

{'loss': 0.1358, 'learning_rate': 6e-06, 'epoch': 0.85}


                                                   
 60%|██████    | 300/500 [3:01:50<41:47, 12.54s/it] 

{'eval_loss': 0.1573217809200287, 'eval_wer': 129.26923076923075, 'eval_runtime': 4947.7065, 'eval_samples_per_second': 0.518, 'eval_steps_per_second': 0.065, 'epoch': 0.85}


 65%|██████▌   | 325/500 [3:04:38<18:58,  6.50s/it]     

{'loss': 0.1332, 'learning_rate': 6.5000000000000004e-06, 'epoch': 0.92}


 70%|███████   | 350/500 [3:22:50<17:35,  7.03s/it]    

{'loss': 0.129, 'learning_rate': 7e-06, 'epoch': 0.99}


 75%|███████▌  | 375/500 [3:41:53<1:55:47, 55.58s/it]  

{'loss': 0.0777, 'learning_rate': 7.500000000000001e-06, 'epoch': 1.06}


 80%|████████  | 400/500 [3:44:29<10:12,  6.13s/it]  

{'loss': 0.0701, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.13}


                                                   
 80%|████████  | 400/500 [5:22:26<10:12,  6.13s/it]

{'eval_loss': 0.1577225923538208, 'eval_wer': 83.96153846153847, 'eval_runtime': 5877.0947, 'eval_samples_per_second': 0.436, 'eval_steps_per_second': 0.055, 'epoch': 1.13}


 85%|████████▌ | 425/500 [6:03:25<2:48:41, 134.96s/it]  

{'loss': 0.0823, 'learning_rate': 8.5e-06, 'epoch': 1.2}


 90%|█████████ | 450/500 [6:05:58<05:06,  6.13s/it]   

{'loss': 0.0756, 'learning_rate': 9e-06, 'epoch': 1.27}


 95%|█████████▌| 475/500 [6:08:36<02:41,  6.47s/it]

{'loss': 0.0904, 'learning_rate': 9.5e-06, 'epoch': 1.35}


100%|██████████| 500/500 [6:42:36<00:00, 288.06s/it]  

{'loss': 0.0822, 'learning_rate': 0.0, 'epoch': 1.42}


                                                    
100%|██████████| 500/500 [8:01:09<00:00, 288.06s/it]

{'eval_loss': 0.15555481612682343, 'eval_wer': 82.11538461538461, 'eval_runtime': 4711.569, 'eval_samples_per_second': 0.544, 'eval_steps_per_second': 0.068, 'epoch': 1.42}


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
100%|██████████| 500/500 [8:01:16<00:00, 57.75s/it] 

{'train_runtime': 28876.5739, 'train_samples_per_second': 0.277, 'train_steps_per_second': 0.017, 'train_loss': 0.35178972697257993, 'epoch': 1.42}





TrainOutput(global_step=500, training_loss=0.35178972697257993, metrics={'train_runtime': 28876.5739, 'train_samples_per_second': 0.277, 'train_steps_per_second': 0.017, 'train_loss': 0.35178972697257993, 'epoch': 1.42})

In [14]:
wandb.finish()



0,1
eval/loss,█▂▁▁▁
eval/runtime,▁▁▇█▆
eval/samples_per_second,▇█▁▁▁
eval/steps_per_second,▇█▁▁▁
eval/wer,█▇▂▁▁
train/epoch,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇████
train/global_step,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇████
train/learning_rate,▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██▁
train/loss,█▇▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁

0,1
eval/loss,0.15555
eval/runtime,4711.569
eval/samples_per_second,0.544
eval/steps_per_second,0.068
eval/wer,82.11538
train/epoch,1.42
train/global_step,500.0
train/learning_rate,0.0
train/loss,0.0822
train/total_flos,2.30522017775616e+18


if you need to push the model to hugging face hub, run the following block

```
pip install --upgrade huggingface_hub
```

In [None]:
# this is optional. but it would allow you to upload the model to hugging face space later on
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# to push

#the following arguments are needed only when we are pushing the model to hugging face hub
kwargs = {
    "dataset_tags": "mozilla-foundation/common_voice_11_0",
    "dataset": "Common Voice 11.0",  # a 'pretty' name for the training dataset
    "dataset_args": "config: hi, split: test",
    "language": "Cantonese",
    "model_name": "[language-x-change] Custom Whisper for Cantanese",  # a 'pretty' name for our model
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
    "tags": "hf-asr-leaderboard",
}

trainer.push_to_hub(**kwargs)

The following is only needed when we want to deploy a runnable version with our uploaded model on hugging face spaces

In [None]:
!pip install gradio

In [None]:
from transformers import pipeline
import gradio as gr

pipe = pipeline(model="your-own-model")  # change to "your-username/the-name-you-picked"

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small Hindi",
    description="Realtime demo for Hindi speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()

to use the model we just compiled (https://huggingface.co/docs/transformers/tasks/asr#inference)


In [20]:
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
import datetime
import json

def write_contents_to_file(content): 
    now = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
    json_object = json.dumps(result, indent=4)
    with open('output/'+now+".json", "w") as f:
        f.write(json_object)

path = "model/whisper-small-cantonese_23-12-2023-2157/checkpoint-400"
processor_path = "model/whisper-small-cantonese_23-12-2023-2157"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
   path, 
   local_files_only=True,
)

processor = AutoProcessor.from_pretrained(processor_path)

transcriber = pipeline("automatic-speech-recognition", 
    model=model,  
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # chunk_length_s=5,
    max_new_tokens=500,
   #  batch_size=16,
    # return_timestamps=True
   )
transcriber.tokenizer.get_decoder_prompt_ids(language='cantonese', task="transcribe")

# file_list = ["Audio1_2.mp3","Audio1_4.mp3","Audio1_5.mp3","Audio1_9.mp3","Audio1_10.mp3","Audio1_11.mp3"]
# for index, file in enumerate(file_list):
#     result = transcriber("source/"+file)
#     write_contents_to_file(result)
#     # also it will print out the result in the following output block
#     print(f'[{index}] - {result}')



num_of_chunks = 41
file_prefix = "chunk";
file_suffix = ".mp3"

for index in range(0, num_of_chunks):
    result = transcriber("source/rthk/"+file_prefix+str(index)+file_suffix)
    # write_contents_to_file(result)
    print(f'[{index}] - {result}')






Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[0] - {'text': ' 中上環境半生一帶除咗畀人個感覺靈性之外呢度嘅建築物亦都係新舊交用㗎唔講你唔知呢度曾經出現過一個叫做三十間嘅社區啲'}
[1] - {'text': '被稱為三十間嘅社區，目前並冇完整嘅文獻記錄資料，上傳喺十九世紀，市單頓街同埋必列者事街附近一帶嘅華人聚居地，所建造嘅三十間石屋移得名'}
[2] - {'text': '隨著社會變遷 石屋已經不復存在現時 番韋內仍然有數座大約在1950年代建成的唐流建築分別是在2019年確定成為二級歷史建築的史丹頓街88及90號同埋平級有待評估的華顏方西唐流建築群'}
[3] - {'text': ' Kara,其實當初沙拉間個起源係'}
[4] - {'text': '其實如果我們找回資料的話最早其實我們是在一百八零年的政府憲報見到三十間這個名字因為我們現在在香港的地圖上其實我們都很難可以見到三十間這個名字'}
[5] - {'text': '真係知道呢個名嘅人呢大概都已經去到六十歲或者以上嘅人先至會識得用呢個名'}
[6] - {'text': '當時其實係呢個位置呢應該就係起咗大約三十間'}
[7] - {'text': '屋建築群如果肉眼見到的痕跡其實可能只有返'}
[8] - {'text': '即係呢度生下間街坊魚蘭會呢個招牌'}
[9] - {'text': ' We can reflect on the past here, the Saradan area. There used to be a lot of Chinese people coming here.'}
[10] - {'text': '余蘭會就係每年七月嘅時候會舉辦余蘭性會嘅組織啦佢哋其實係一班居民'}
[11] - {'text': '組織出來嘅一個地方'}
[12] - {'text': '於南性會 對於一個華人社會來說非常重要超到一些孤魂鬼令到這裡可能傷破 街坊可以安心一點 這樣的一個傳統習俗'}
[13] - {'text': '喺街道佈局上面三十間社區嘅特色係點嘅'}
[14] - {'text': '如果想理解三十間範圍其實我們應該由下面的士丹頓街開始計算那其實是一個族心'}
[15] - {'text': '跟住就一路打上去度上面半山嘅堅堵範圍'}
[16] - {'text': '中間記憶一個範圍其實

install the following dependencies for plotting and tabulation

```
pip install pandas
```

In [90]:
import pandas as pd
import json
from IPython.display import display


df = pd.json_normalize(result, record_path =['chunks'])
display(df)

# show df in a tablular format



Unnamed: 0,timestamp,text
0,"(0.0, 1.4)",其實都有佢嘅價值啦
1,"(1.4, 4.68)",可能會係同身粉認同有關係啦
2,"(4.68, 8.0)",又或者可能佢會大動到一個地方嘅文化旅游啦
3,"(8.0, 15.04)",佢隕藏住同埋佢對於呢個社會創造緊嘅價值其實都係好重要嘅元素嚟嘅
4,"(19.04, 22.64)",市單定加一大嘅唐州活發工程已經完成嘅嘞
5,"(22.64, 26.88)",而市建共亦都話嚟緊會引入共同租住單位嘅共居模式
6,"(0.0, 4.32)",希望底時呢度呢就可以變成一個充滿文化特色同埋活力嘅社區
