# **Fine-tuning XLSR-Wav2Vec2 with 🤗 Transformers**

## Pre-configuration

In [None]:
# Remove any preinstalled transformers/accelerate first so the install below
# starts from a clean state, then install the remaining dependencies.
# NOTE(review): no versions are pinned, so results depend on whatever is
# current at run time.
!pip uninstall -y transformers accelerate
!pip install transformers accelerate
!pip install datasets
# presumably required by the "wer" metric loaded later — TODO confirm
!pip install jiwer

In [None]:
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC
from transformers import TrainingArguments
from transformers import Trainer


In [None]:
from ipywidgets import widgets

In [None]:
import os

In [None]:
# # jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn
# Language identifiers used to build the names and paths below.
language_code = "zh-CN"
language_name = "chinese"
# base_model = "facebook/wav2vec2-large-xlsr-53"
pretrain_model = "patrickvonplaten/wav2vec2-large-xlsr-" + language_name + "-demo"

# Local data folder and the Drive folder handed to TrainingArguments(output_dir=...).
data_dir = "/workspace/data/" + language_code
output_models_dir = "/content/drive/Shareddrives/Hoggy Project/hoggyPro"


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mv /models/ /content/drive/MyDrive/model_checkpoints

mv: cannot stat '/models/': No such file or directory


In [None]:
from transformers.trainer_utils import get_last_checkpoint

# get_last_checkpoint() scans a folder for `checkpoint-<step>` sub-directories, so it
# must be given the parent folder. Pointing it at a single checkpoint directory makes
# it return None, and trainer.train(resume_from_checkpoint=None) then starts over.
checkpoint_root = "/content/drive/MyDrive/model_checkpoints"
# Fall back to None (fresh training run) when nothing has been saved yet.
checkpoint = get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None

## Presentation

In [None]:
!gdown --folder https://drive.google.com/drive/folders/1JraFo8ltR6e3TyQqJ-PwUzlaT8ZfVjrL?usp=share_link -O .

Retrieving folder list
Processing file 1vltdseFMMRZERqi5FG719C1C1ujnxph9 test_df.csv
Processing file 10jwnTFVkKAJL47nU35ZdweqXgMMIGTWb train_df.csv
Processing file 1glH49rXwX67T1RCiqR1ehFgqwhAt1gjq val_df.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1vltdseFMMRZERqi5FG719C1C1ujnxph9
To: /content/test_df.csv
100% 39.8k/39.8k [00:00<00:00, 54.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=10jwnTFVkKAJL47nU35ZdweqXgMMIGTWb
To: /content/train_df.csv
100% 159k/159k [00:00<00:00, 115MB/s]
Downloading...
From: https://drive.google.com/uc?id=1glH49rXwX67T1RCiqR1ehFgqwhAt1gjq
To: /content/val_df.csv
100% 20.1k/20.1k [00:00<00:00, 71.5MB/s]
Download completed


In [None]:
!gdown --id 1Lx6Kxy29Jp5LFtWsszOMUAraTqam2FxG # audio files
!gdown --id 1psbZWyPRFJ32g5JpXxcADl7TJDuTh8Wd # transcript

Downloading...
From: https://drive.google.com/uc?id=1Lx6Kxy29Jp5LFtWsszOMUAraTqam2FxG
To: /content/wav2.zip
100% 421M/421M [00:13<00:00, 31.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1psbZWyPRFJ32g5JpXxcADl7TJDuTh8Wd
To: /content/non-native-path.csv
100% 220k/220k [00:00<00:00, 188MB/s]


In [None]:
!unzip "wav2"

In [None]:
!unzip non-native.zip

unzip:  cannot find or open non-native.zip, non-native.zip.zip or non-native.zip.ZIP.


In [None]:
!mv wav2 wav

In [None]:
from datasets import load_dataset, load_metric

common_voice_train = load_dataset("csv", data_files=['/content/train_df.csv'], split='train' )
common_voice_test = load_dataset("csv", data_files=['/content/val_df.csv'], split='train' )

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-b3307c3334dfc3f3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-b3307c3334dfc3f3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-3987ef410cab3672/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-3987ef410cab3672/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


In [None]:
common_voice_train

Dataset({
    features: ['sentence', 'path'],
    num_rows: 2063
})

In [None]:
len(common_voice_train)

2063

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()`` and indexing with a list of row
            positions (the result is passed to ``pd.DataFrame``).
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no retry loop is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(common_voice_train.remove_columns(["path"]), num_examples=10)

Unnamed: 0,sentence
0,北平方浇正正的潮楼可比 。
1,那那道就可以不用这样这样严格的条件吗 ？
2,一个一个的都有重赏 。
3,也不能指出新一潮的将来趋势 。
4,挖心血 ， 收集材料 ， 征求意见 ， 考察情形 ，
5,故不如做介绍写绍的事业 ， 借“学理研究”的美名 ，
6,后来有人觉得单用白话做科书不中用的 ，
7,还是要想收一点鼠际的效果 ，
8,都有这种危险 。
9,故不觉得达尔文的议论的重要 。


In [None]:
import re

# Punctuation stripped from transcripts before building the vocabulary.
# Raw string: avoids invalid-escape warnings from sequences such as "\,".
# Besides the ASCII marks and curly quotes, the class covers the full-width
# punctuation that appears in the Chinese transcripts (，。？！；：、《》（）).
# \ufffd is the Unicode replacement character.
chars_to_ignore_regex = r'[,?.!\-;:"“”‘’%\ufffd，。？！；：、《》（）]'

def remove_special_characters(batch):
    """Strip punctuation from ``batch["sentence"]``, lower-case it and append one space.

    The example is modified in place and returned, as expected by ``Dataset.map``.
    """
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

In [None]:
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

Map:   0%|          | 0/2063 [00:00<?, ? examples/s]

Map:   0%|          | 0/258 [00:00<?, ? examples/s]

In [None]:
show_random_elements(common_voice_train.remove_columns(["path"]))

Unnamed: 0,sentence
0,这些人有了钱 ， 有了势 ， 有了名誉 ，
1,又没有《大英百科全书》可查 ，
2,不能不依靠一种啊超超自然的势呃势力 。
3,努力建设人的乐国人世的天堂 ；
4,在眼前的尽管用怹或别的向远处推 ；
5,李嫂 。
6,做一点实际的改良呢 ？
7,但是我也没有法子可以否认他 。
8,越在那不可可捉摸的心心性上玩把戏 ，
9,我起初觉得我是那威国人 。


In [None]:
def extract_all_chars(batch):
  """Return the distinct characters and the joined text of a batch of sentences.

  The sentences in ``batch["sentence"]`` are joined with single spaces. Both
  results are wrapped in one-element lists so a batched ``map`` call yields a
  single output row.
  """
  joined_text = " ".join(batch["sentence"])
  distinct_chars = set(joined_text)
  return {"vocab": [list(distinct_chars)], "all_text": [joined_text]}

In [None]:
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

Map:   0%|          | 0/2063 [00:00<?, ? examples/s]

Map:   0%|          | 0/258 [00:00<?, ? examples/s]

In [None]:
# Sort the merged character set so every run assigns the same id to each token.
# Iteration order of a set of strings varies between Python processes (hash
# randomisation); an unsorted list would make a regenerated vocab.json
# incompatible with checkpoints trained in an earlier session.
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))


In [None]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

In [None]:
# Re-key the space character as "|", the word delimiter token passed to the tokenizer.
vocab_dict["|"] = vocab_dict.pop(" ")

In [None]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

1887

In [None]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [None]:
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

### Create XLSR-Wav2Vec2 Feature Extractor

In [None]:

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [None]:

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
# Save the processor (feature extractor + tokenizer/vocab) next to the model
# checkpoints on Drive. A relative folder under /content named like a Hub
# repository is lost with the Colab session and would take precedence over the
# real Hub model of the same id in later from_pretrained() calls.
processor.save_pretrained(output_models_dir)

In [None]:
common_voice_train[0]

{'sentence': '明明是不可救药的大病 ， 我们偏说—点病都没有 ', 'path': '/content/wav/000120428.WAV'}

In [None]:
import torchaudio

def speech_file_to_array_fn(batch):
    """Load the audio file referenced by ``batch["path"]`` into the example.

    Adds three fields to ``batch`` and returns it:
      * ``speech``: first channel of the loaded waveform as a NumPy array,
      * ``sampling_rate``: the rate reported by ``torchaudio.load`` (no resampling is done here),
      * ``target_text``: copy of ``batch["sentence"]``.
    """
    waveform, file_rate = torchaudio.load(batch["path"])
    first_channel = waveform[0]
    batch["speech"] = first_channel.numpy()
    batch["sampling_rate"] = file_rate
    batch["target_text"] = batch["sentence"]
    return batch

In [None]:
common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names)

Map:   0%|          | 0/2063 [00:00<?, ? examples/s]

In [None]:
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)

Map:   0%|          | 0/258 [00:00<?, ? examples/s]

In [None]:
!pip install librosa

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(common_voice_train)-1)
print(common_voice_train[rand_int]["target_text"])
ipd.Audio(data=np.asarray(common_voice_train[rand_int]["speech"]), autoplay=True, rate=16000)

以上所说 ， 泛论问题与主义 ， 


In [None]:
rand_int = random.randint(0, len(common_voice_train)-1)

print("Target text:", common_voice_train[rand_int]["target_text"])
print("Input array shape:", np.asarray(common_voice_train[rand_int]["speech"]).shape)
print("Sampling rate:", common_voice_train[rand_int]["sampling_rate"])

Target text: 一个一个的重新推崇起来 ， 替他们修墓 ， 
Input array shape: (144560,)
Sampling rate: 16000


In [None]:
def prepare_dataset(batch):
    """Turn a batch of raw audio and transcripts into model inputs and label ids.

    Reads ``batch["speech"]``, ``batch["sampling_rate"]`` and
    ``batch["target_text"]``; writes ``batch["input_values"]`` and
    ``batch["labels"]``; returns the same batch object.

    Raises:
        AssertionError: If the examples in the batch do not share one sampling rate.
    """
    # check that all files have the same sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values

    # Encode the transcripts with the tokenizer directly; this is what the
    # deprecated `processor.as_target_processor()` context manager delegated to.
    batch["labels"] = processor.tokenizer(batch["target_text"]).input_ids
    return batch

In [None]:
common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, batch_size=8, num_proc=4, batched=True)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=4, batched=True)

Map (num_proc=4):   0%|          | 0/2063 [00:00<?, ? examples/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)


Map (num_proc=4):   0%|          | 0/258 [00:00<?, ? examples/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)


In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs and the labels of a batch.

    Inputs are padded with the processor's feature extractor and labels with its
    tokenizer; padded label positions are then replaced by -100.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (applied to both inputs and labels)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set will pad the ``labels`` to a multiple of the provided value.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` into one batch of tensors with ``input_values`` and ``labels``."""
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly instead of going through the
        # deprecated `as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
wer_metric = load_metric("wer")

  wer_metric = load_metric("wer")


Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

In [None]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Takes the arg-max over the last axis of ``pred.predictions``, decodes
    predictions and labels with ``processor`` and scores them with ``wer_metric``.
    ``pred.label_ids`` is modified in place: every -100 entry becomes the
    tokenizer's pad token id so the labels can be decoded.

    Returns:
        dict: ``{"wer": <value returned by wer_metric.compute>}``.
    """
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    ignored_positions = pred.label_ids == -100
    pred.label_ids[ignored_positions] = processor.tokenizer.pad_token_id

    hypotheses = processor.batch_decode(predicted_ids)
    # we do not want to group tokens when computing the metrics
    references = processor.batch_decode(pred.label_ids, group_tokens=False)

    return {"wer": wer_metric.compute(predictions=hypotheses, references=references)}

In [None]:

# Load the pretrained facebook/wav2vec2-large-xlsr-53 weights into a CTC model,
# overriding the configuration values listed below.
# vocab_size and pad_token_id are taken from the tokenizer built in this notebook,
# so the output layer matches vocab.json.
# NOTE(review): passing gradient_checkpointing through from_pretrained may be
# deprecated in current transformers releases — confirm against the installed version.
model = Wav2Vec2ForCTC.from_pretrained(
   "facebook/wav2vec2-large-xlsr-53", 
   attention_dropout=0.1,
   hidden_dropout=0.1,
    feat_proj_dropout=0.0,
   mask_time_prob=0.05,
   layerdrop=0.1,
   gradient_checkpointing=True, 
   ctc_loss_reduction="mean", 
   pad_token_id=processor.tokenizer.pad_token_id,
   vocab_size=len(processor.tokenizer)
)

In [None]:

# model = Wav2Vec2ForCTC.from_pretrained(
#     "/content/drive/Shareddrives/Hoggy Project/hoggyPro/checkpoint-12400", 
#     attention_dropout=0.1,
#     hidden_dropout=0.1,
#     feat_proj_dropout=0.0,
#     mask_time_prob=0.05,
#     layerdrop=0.1,
#     gradient_checkpointing=True, 
#     ctc_loss_reduction="mean", 
#     pad_token_id=processor.tokenizer.pad_token_id,
#     vocab_size=len(processor.tokenizer)
# )

In [None]:
# Freeze the feature encoder so its weights are not updated during fine-tuning.
# `freeze_feature_extractor()` is the deprecated name for this method.
model.freeze_feature_encoder()

In [None]:

# Training configuration.
# - Checkpoints are written to output_models_dir.
# - per_device_train_batch_size=8 with gradient_accumulation_steps=2 gives
#   16 examples per device for each optimizer step.
# - Evaluation, checkpoint saving and logging all happen every 400 steps.
# - save_total_limit=2 keeps only two checkpoints on disk.
training_args = TrainingArguments(
  output_dir=output_models_dir,
  group_by_length=True,
  per_device_train_batch_size=8,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=100,
  fp16=True,
  save_steps=400,
  eval_steps=400,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
)

In [None]:

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,
)

### Training

In [None]:
trainer.train(resume_from_checkpoint=checkpoint)

In [None]:
!cp /content/vocab.json "/content/drive/MyDrive/model_checkpoints/checkpoint-1200"
!cp /content/vocab.json "/content/drive/MyDrive/model_checkpoints/checkpoint-1600"


In [None]:
!cp /content/vocab.json "/content/drive/Shareddrives/Hoggy Project/hoggyPro/checkpoint-10800"
!cp /content/vocab.json "/content/drive/Shareddrives/Hoggy Project/hoggyPro/checkpoint-11200"

In [None]:
model = Wav2Vec2ForCTC.from_pretrained("/content/drive/Shareddrives/Hoggy Project/hoggyPro/checkpoint-11200").to("cuda")
processor = Wav2Vec2Processor.from_pretrained("/content/drive/Shareddrives/Hoggy Project/hoggyPro/checkpoint-11200")

In [None]:
import torch

In [None]:
# Run the fine-tuned model on one validation example (row 1).
# Index the row first: `common_voice_test["input_values"][1]` would load the
# whole column just to read a single entry.
sample_values = common_voice_test[1]["input_values"]
input_dict = processor(sample_values, return_tensors="pt", padding=True, sampling_rate=16_000)

# Inference only: skip gradient tracking, and follow the model's device.
with torch.no_grad():
    logits = model(input_dict.input_values.to(model.device)).logits

# Most likely token id for every frame of the (single) sequence.
pred_ids = torch.argmax(logits, dim=-1)[0]

Finally, we can decode the example.

In [None]:
common_voice_test_ = load_dataset("csv", data_files=['/content/val_df.csv'], split='train')

In [None]:
print("Prediction:")
print(processor.decode(pred_ids))

print("\nReference:")
print(common_voice_test_[1]['sentence'].lower())


In [None]:
import pandas as pd
import re

In [None]:
df = pd.read_csv("/content/test_df.csv")

In [None]:
!gdown --id "1cJrAa3GA0AF258JnCFZWyWa50c3yPmwz"


Downloading...
From: https://drive.google.com/uc?id=1cJrAa3GA0AF258JnCFZWyWa50c3yPmwz
To: /content/wav2.zip
100% 421M/421M [00:10<00:00, 41.1MB/s]


In [None]:
!unzip "/content/wav2.zip"

In [None]:
df[0:10]

Unnamed: 0,sentence,path,labels
0,如果你相信这个,/content/wav/000100110.WAV,1
1,使我们从实证的方面去解决生物界的根本问题,/content/wav/000100050.WAV,1
2,又不是英国的消极自由所谓平等,/content/wav/000120092.WAV,1
3,这就是调和,/content/wav/000120328.WAV,1
4,他洋洋得意的据得这种生活狠可以终身了,/content/wav/000110276.WAV,1
5,如果那人和那虫的死不是有意注定的,/content/wav/000100112.WAV,1
6,只好说我本不爱吃这酸葡萄狐狸吃不着甜葡萄,/content/wav/000110421.WAV,1
7,自治的自治的社会共和的国家,/content/wav/000110316.WAV,1
8,虽然还有做洋八股有更时髦的党八股的,/content/wav/000130399.WAV,1
9,也有亲昵与轻贱两种情调,/content/wav/000120616.WAV,1


In [None]:
# Strip Latin letters, spaces and punctuation from the reference transcripts.
# Raw string: the previous non-raw pattern relied on the invalid escapes "\<" and "\>".
# The character class matches the same set of characters as before, without duplicates.
non_spoken_chars = re.compile(r"[a-zA-Z<>（）“”— !?:;./！《》：，。]")
df["sentence"] = df["sentence"].apply(lambda text: non_spoken_chars.sub("", text))

In [None]:
!pip install datasets
!pip install transformers

# Inference

In [None]:
import numpy as np

In [None]:
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor


In [None]:
def speech_file_to_array_fn(path):
    """Print ``path``, load that audio file with librosa at 16 kHz and return the samples.

    Note: this re-uses the name of the batch-based helper defined earlier in the
    notebook and replaces it from this cell onwards.
    """
    print(path)
    samples, _ = librosa.load(path, sr=16_000)
    return samples


In [None]:
class Realwav:
  """Scores how well a recording matches a reference sentence using a fine-tuned CTC model."""

  def __init__(self):
    """Load the model checkpoint and build the processor from the local ``./vocab.json``."""
    MODEL_ID = "/content/drive/Shareddrives/Hoggy Project/hoggyPro/checkpoint-12400"
    tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
    self.model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
    self.processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

  def sound_and_sentence(self, sound, sentence, verbose=True):
    """Score a recording against the characters of ``sentence``.

    The frame-wise arg-max path is collapsed the CTC way: a frame counts as a new
    emission only if it is not the blank token and differs from the previous
    frame. The n-th emission is paired with the n-th reference token, and the
    model's probability for that reference token at that frame is its score.

    Args:
      sound: Audio samples at 16 kHz, as accepted by the processor.
      sentence: Reference text.
      verbose: When True (default), print the frame position and score of every
        emission and the final score list.

    Returns:
      tuple: ``(sentence_score, label)`` where ``sentence_score`` is the mean
      emission score (0.0 when nothing was emitted) and ``label`` is 1 if that
      mean exceeds 0.5, else 0.
    """
    inputs = self.processor(sound, sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
      logits = self.model(inputs.input_values, attention_mask=inputs.attention_mask).logits

    probs = torch.softmax(logits, dim=-1)
    ref_ids = self.processor(text=sentence)["input_ids"]
    # The blank is the tokenizer's pad token. Its id is NOT 0 here: the notebook
    # appends "[PAD]" as the last vocabulary entry.
    blank_id = self.processor.tokenizer.pad_token_id
    frame_ids = torch.argmax(logits[0], dim=-1).tolist()

    scores = []
    ref_count = 0
    previous_id = blank_id
    for seq_idx, frame_id in enumerate(frame_ids):
      if ref_count >= len(ref_ids):
        break
      # Repeated frames of the same token belong to one emission.
      is_new_emission = frame_id != blank_id and frame_id != previous_id
      previous_id = frame_id
      if not is_new_emission:
        continue
      conf_score = probs[0, seq_idx, ref_ids[ref_count]].item()
      scores.append(conf_score)
      if verbose:
        ref_char = sentence[ref_count] if ref_count < len(sentence) else "?"
        print(f"position of the word {ref_char}: {seq_idx}")
        print(conf_score)
      ref_count += 1

    if verbose:
      print(scores)
    if not scores:
      # Nothing was emitted: avoid np.mean([]) -> NaN.
      return 0.0, 0
    sentence_score = np.mean(scores)
    return sentence_score, int(sentence_score > 0.5)

In [None]:
rw = Realwav()

In [None]:
def pathtowav(id):
  """Return ``id`` formatted as a string; the value is used unchanged as the wav path."""
  # Example values:
  #   /content/wav/000100001.WAV
  #   /content/wav/000100110.WAV
  return "{}".format(id)

In [None]:
df

Unnamed: 0,sentence,path,labels
0,如果你相信这个,/content/wav/000100110.WAV,1
1,使我们从实证的方面去解决生物界的根本问题,/content/wav/000100050.WAV,1
2,又不是英国的消极自由所谓平等,/content/wav/000120092.WAV,1
3,这就是调和,/content/wav/000120328.WAV,1
4,他洋洋得意的据得这种生活狠可以终身了,/content/wav/000110276.WAV,1
...,...,...,...
510,使我们从实证的方面去解决生物界的根本问题,/content/wav/000100142.WAV,0
511,五,/content/wav/000120522.WAV,0
512,葡萄太高了他吃不着,/content/wav/000120269.WAV,0
513,也许隔开两三万里路,/content/wav/000130123.WAV,0


In [None]:
# Score every row of df: one 0/1 prediction per row, in row order.
predicts = []
for wav_path, sentence in zip(df["path"], df["sentence"]):
  waveform = speech_file_to_array_fn(pathtowav(wav_path))
  _, predicted_label = rw.sound_and_sentence(waveform, sentence)
  predicts.append(predicted_label)

In [None]:
# Sanity-check the scorer on a single synthesized clip (new model).
# The result is deliberately NOT appended to `predicts`: that list holds one
# entry per row of `df`, and appending here would break the alignment and add
# another entry each time the cell is re-run.
tts_path = "/content/translate_tts.wav"
tts_sentence = "你好"
tts_sound = speech_file_to_array_fn(tts_path)
tts_score, tts_label = rw.sound_and_sentence(tts_sound, tts_sentence)
tts_score, tts_label

In [None]:
predicts

In [None]:
import re

In [None]:
# Strip Latin letters, spaces and punctuation from the reference transcripts.
# Raw string: the previous non-raw pattern relied on the invalid escapes "\<" and "\>".
# The character class matches the same set of characters as before, without duplicates.
non_spoken_chars = re.compile(r"[a-zA-Z<>（）“”— !?:;./！《》：，。]")
df["sentence"] = df["sentence"].apply(lambda text: non_spoken_chars.sub("", text))


In [None]:
df