# Setup


In [5]:
import re
import unicodedata
from collections import Counter

import en_core_web_sm
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pyvi import ViTokenizer, ViPosTagger
from spacymoji import Emoji
from underthesea import word_tokenize


# Load data

In [33]:
# Load the document corpus from CSV into a DataFrame.
# NOTE(review): the path looks like a mounted Google Drive folder - confirm it exists in the runtime.
data = pd.read_csv(r'/content/drive/MyDrive/speech/RAG/data_csv.csv')
# Preview the first rows.
data.head()


Unnamed: 0,file,content
0,ViÃ£t MoÃÇÃÅc BiÃÄnh ƒêiÃ£nh.txt,V·ªãt M·ªëc B√¨nh ƒê·ªãnh\n\nV·ªãt M·ªëc l√† gi·ªëng v·ªãt h∆∞·ªõn...
1,Elliott Management Corporation.txt,Elliott Management Corporation\n\nElliott Mana...
2,Parma mccullochi.txt,Parma mccullochi\n\nParma mccullochi l√† m·ªôt lo...
3,DuÃõoÃõng Quang ThaÃÄnh.txt,D∆∞∆°ng Quang Th√†nh\n\nD∆∞∆°ng Quang Th√†nh (sinh n...
4,Sergey Vadimovich Stepashin.txt,Sergey Vadimovich Stepashin\n\nSergei Vadimovi...


# Preprocessing

In [35]:
# Plain Python list of the raw `content` column, used below to spot-check the cleaning helpers.
text = data['content'].to_list()

In [36]:
def remove_special_characters(data):
    """Replace every disallowed character in ``data`` with a single space.

    A character is kept when it is a regex word character (``\\w``),
    whitespace (``\\s``), or one of the extra characters listed explicitly in
    the pattern. Each removed character becomes exactly one space; runs of
    spaces are not collapsed here.

    :param data: text to clean.
    :return: the cleaned text.
    """
    return re.sub(r'[^\w\s√Ä-√ø]', ' ', data)

In [37]:
remove_special_characters(text[0])

'V·ªãt M·ªëc B√¨nh ƒê·ªãnh\n\nV·ªãt M·ªëc l√† gi·ªëng v·ªãt h∆∞·ªõng tr·ª©ng  c√≥ m√†u l√¥ng n√¢u nh·∫°t gi·ªëng nh∆∞ m·ªëc n√™n g·ª£i l√† v·ªãt M·ªëc  ƒë∆∞·ª£c h√¨nh th√†nh do qu√° tr√¨nh lai t·∫°o t·ª± nhi√™n v√† ƒë√£ c√≥ t·ª´ l√¢u ƒë·ªùi ·ªü v√πng G√≤ B·ªìi  huy·ªán Tuy Ph∆∞·ªõc  ·ªü ph√≠a ƒê√¥ng c·ªßa t·ªânh B√¨nh ƒê·ªãnh  Hi·ªán nay  v·ªãt c√≥ ·ªü nhi·ªÅu n∆°i c·ªßa v√πng duy√™n h·∫£i mi·ªÅn trung  \nƒê·∫∑c ƒëi·ªÉm \nL√∫c m·ªõi n·ªü  v·ªãt con c√≥ h√¨nh d√°ng thon  d√†i  l√¥ng m√†u lang nh∆∞ b·ªã m·ªëc  m·ªè v√† ch√¢n v√†ng nh·∫°t  Khi tr∆∞·ªüng th√†nh  v·ªãt c√≥ m√¨nh thon  ng·ª±c l√©p  ƒë·∫ßu thon  c·ªï thon d√†i  h√¨nh d√°ng t·∫°o v·ªõi m·∫∑t ƒë·∫•t 1 g√≥c 45   l√¥ng m√†u n√¢u nh·∫°t  v√†ng c√≥ ƒë·ªëm n√¢u  v√†ng hoe v√† v√†ng  ·ªü con tr·ªëng c√≥ 1   2 l√¥ng m√≥c  m·ªè v√† ch√¢n c√≥ m√†u x√°m ƒëen  ch√¨   ƒë·ªè ho·∫∑c v√†ng  kh·ªëi l∆∞·ª£ng 1 4   1 5\xa0kg con  Kh·∫£ nƒÉng s·∫£n xu·∫•t  V·ªãt ƒë·∫ª b√≥i l√∫c 140 ng√†y tu·ªïi  nƒÉng su·∫•t tr·ª©ng ƒë·∫°t 90 qu·∫£ 25 tu·∫ßn  kh·

In [38]:
# Two lookup strings: accented Vietnamese letters and their unaccented ASCII
# counterparts, presumably meant to be matched position by position.
# NOTE(review): neither name is referenced in the visible part of this file;
# verify that both strings are the same length and aligned before relying on them.
uniChars = "√†√°·∫£√£·∫°√¢·∫ß·∫•·∫©·∫´·∫≠ƒÉ·∫±·∫Ø·∫≥·∫µ·∫∑√®√©·∫ª·∫Ω·∫π√™·ªÅ·∫ø·ªÉ·ªÖ·ªáƒë√¨√≠·ªâƒ©·ªã√≤√≥·ªè√µ·ªç√¥·ªì·ªë·ªï·ªó·ªô∆°·ªù·ªõ·ªü·ª°·ª£√π√∫·ªß≈©·ª•∆∞·ª´·ª©·ª≠·ªØ·ª±·ª≥√Ω·ª∑·ªπ·ªµ√Ä√Å·∫¢√É·∫†√Ç·∫¶·∫§·∫®·∫™·∫¨ƒÇ·∫∞·∫Æ·∫≤·∫¥·∫∂√à√â·∫∫·∫º·∫∏√ä·ªÄ·∫æ·ªÇ·ªÑ·ªÜƒê√å√ç·ªàƒ®·ªä√í√ì·ªé√ï·ªå√î·ªí·ªê·ªî·ªñ·ªò∆†·ªú·ªö·ªû·ª†·ª¢√ô√ö·ª¶≈®·ª§∆Ø·ª™·ª®·ª¨·ªÆ·ª∞·ª≤√ù·ª∂·ª∏·ª¥√ÇƒÇƒê√î∆†∆Ø"
unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"


# Vietnamese text can arrive with tone marks stored as separate combining
# characters (decomposed) instead of single precomposed code points. The
# helpers below bring such text to the precomposed (NFC) form that the rest of
# the pipeline expects.

# Combining tone marks in the order: grave, acute, hook above, tilde, dot below.
_TONE_MARKS = "\u0300\u0301\u0309\u0303\u0323"
# Base vowels: a, a-circumflex, a-breve, e, e-circumflex, i, o, o-circumflex,
# o-horn, u, u-horn, y. Written as escapes so the table does not depend on the
# encoding this file is saved or viewed in.
_BASE_VOWELS = "a\u00e2\u0103e\u00eaio\u00f4\u01a1u\u01b0y"


def loaddicchar():
    """Build the lookup table from decomposed to precomposed Vietnamese vowels.

    Each key is a base vowel (lower or upper case) followed by one combining
    tone mark; each value is the single precomposed character for that pair.

    :return: dict with 120 entries (24 base vowels x 5 tone marks).
    """
    dic = {}
    for base in _BASE_VOWELS + _BASE_VOWELS.upper():
        for mark in _TONE_MARKS:
            dic[base + mark] = unicodedata.normalize('NFC', base + mark)
    return dic


# Kept as a module-level table for any code that looks sequences up directly.
dicchar = loaddicchar()


def convert_unicode(data):
    """Return ``data`` with decomposed characters composed (Unicode NFC).

    This covers every pair listed in ``dicchar`` and, unlike a table lookup,
    also fully decomposed sequences such as ``o`` + circumflex + acute, which
    is the form visible in the file names of the loaded data set.

    :param data: text to normalise.
    :return: the NFC-normalised text.
    """
    return unicodedata.normalize('NFC', data)

In [39]:
# Vowel table: one row per base vowel. Columns 0-5 hold the vowel without a
# tone mark followed by its five toned forms; the last column is the ASCII
# spelling that vn_word_to_telex_type emits for that base vowel.
bang_nguyen_am = [
    ['a', '√†', '√°', '·∫£', '√£', '·∫°', 'a'],
    ['ƒÉ', '·∫±', '·∫Ø', '·∫≥', '·∫µ', '·∫∑', 'aw'],
    ['√¢', '·∫ß', '·∫•', '·∫©', '·∫´', '·∫≠', 'aa'],
    ['e', '√®', '√©', '·∫ª', '·∫Ω', '·∫π', 'e'],
    ['√™', '·ªÅ', '·∫ø', '·ªÉ', '·ªÖ', '·ªá', 'ee'],
    ['i', '√¨', '√≠', '·ªâ', 'ƒ©', '·ªã', 'i'],
    ['o', '√≤', '√≥', '·ªè', '√µ', '·ªç', 'o'],
    ['√¥', '·ªì', '·ªë', '·ªï', '·ªó', '·ªô', 'oo'],
    ['∆°', '·ªù', '·ªõ', '·ªü', '·ª°', '·ª£', 'ow'],
    ['u', '√π', '√∫', '·ªß', '≈©', '·ª•', 'u'],
    ['∆∞', '·ª´', '·ª©', '·ª≠', '·ªØ', '·ª±', 'uw'],
    ['y', '·ª≥', '√Ω', '·ª∑', '·ªπ', '·ªµ', 'y'],
]
# Suffix letter appended for each tone column (column 0 = no tone, no suffix).
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse index: vowel form -> (row, column) in bang_nguyen_am.
# The last column of each row is skipped because it is not a vowel form.
nguyen_am_to_ids = {}
for i, row in enumerate(bang_nguyen_am):
    for j, vowel in enumerate(row[:-1]):
        nguyen_am_to_ids[vowel] = (i, j)

def vn_word_to_telex_type(word):
    """Spell one word with the ASCII forms from ``bang_nguyen_am``.

    Every character found in ``nguyen_am_to_ids`` is replaced by the last
    column of its table row; other characters are copied unchanged. The tone
    column of the last toned vowel seen selects the suffix from
    ``bang_ky_tu_dau``, which is appended once at the end of the word (an
    empty string when no vowel carries a tone).

    :param word: a single word.
    :return: the converted word.
    """
    tone = 0
    pieces = []
    for ch in word:
        # (-1, -1) marks a character that is not in the vowel table.
        row, col = nguyen_am_to_ids.get(ch, (-1, -1))
        if row == -1:
            pieces.append(ch)
            continue
        if col != 0:
            tone = col
        pieces.append(bang_nguyen_am[row][-1])
    pieces.append(bang_ky_tu_dau[tone])
    return ''.join(pieces)

def vn_sentence_to_telex_type(sentence):
    """
    Convert an accented Vietnamese sentence to telex-style typing, word by word.
    :param sentence: text to convert; it is split on whitespace.
    :return: the converted words joined by single spaces.
    """
    return ' '.join(vn_word_to_telex_type(word) for word in sentence.split())

def chuan_hoa_dau_tu_tieng_viet(word):
    """Move the tone mark of a single word to a canonical vowel position.

    The word is returned unchanged when ``is_valid_vietnam_word`` rejects it.
    Otherwise every toned vowel is first reset to its toneless form while the
    tone (a column index of ``bang_nguyen_am``) is remembered, and the tone is
    then re-applied to one vowel chosen by the rules commented below.

    :param word: a single word; the caller in this file lower-cases it first.
    :return: the word with its tone mark repositioned, or the original word.
    """
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0  # tone column of the last toned vowel seen; 0 means no tone
    nguyen_am_index = []  # positions of the vowels that may receive the tone
    qu_or_gi = False  # set once a "qu" or "gi" pair has been seen
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            # Not in the vowel table.
            continue
        elif x == 9:  # row 9 is "u": detect a "u" directly after "q"
            if index != 0 and chars[index - 1] == 'q':
                chars[index] = 'u'
                qu_or_gi = True
        elif x == 5:  # row 5 is "i": detect an "i" directly after "g"
            if index != 0 and chars[index - 1] == 'g':
                chars[index] = 'i'
                qu_or_gi = True
        if y != 0:
            # Remember the tone and strip it from this vowel.
            dau_cau = y
            chars[index] = bang_nguyen_am[x][0]
        # Once a qu/gi pair has been seen, position 1 is not a tone candidate.
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)

    if len(nguyen_am_index) < 2:
        if qu_or_gi:
            if len(chars) == 2:
                # Two-letter word: the tone goes on the second letter.
                x, y = nguyen_am_to_ids.get(chars[1])
                chars[1] = bang_nguyen_am[x][dau_cau]
            else:
                # Prefer the third letter when it is a vowel; otherwise put the
                # tone back on the "i"/"u" at position 1.
                x, y = nguyen_am_to_ids.get(chars[2], (-1, -1))
                if x != -1:
                    chars[2] = bang_nguyen_am[x][dau_cau]
                else:
                    chars[1] = bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else bang_nguyen_am[9][dau_cau]
            return ''.join(chars)
        # Zero or one candidate vowel and no qu/gi pair: nothing to reposition.
        return word

    for index in nguyen_am_index:
        x, y = nguyen_am_to_ids[chars[index]]
        if x == 4 or x == 8:  # table rows 4 and 8 (e-circumflex, o-horn) take the tone first
            chars[index] = bang_nguyen_am[x][dau_cau]
            return ''.join(chars)

    if len(nguyen_am_index) == 2:
        if nguyen_am_index[-1] == len(chars) - 1:
            # The second vowel ends the word: the tone goes on the first vowel.
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            chars[nguyen_am_index[0]] = bang_nguyen_am[x][dau_cau]
        else:
            # A consonant follows the vowel pair: the tone goes on the second vowel.
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    else:
        # Three or more candidate vowels: the tone goes on the second one.
        x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
        chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    return ''.join(chars)

# Check whether the vowels of a word form one contiguous run, i.e. the word follows the vowel-placement rule expected of a Vietnamese word.
def is_valid_vietnam_word(word):
    """Return True when all vowels of ``word`` sit next to each other.

    A vowel is any character present in ``nguyen_am_to_ids``. The word is
    rejected as soon as a vowel is found that does not directly follow the
    previous vowel. Words with no vowel or a single vowel are accepted.

    :param word: a single word.
    :return: ``False`` if two vowels are separated by other characters,
        ``True`` otherwise.
    """
    last_vowel_pos = -1
    for pos, ch in enumerate(word):
        if ch not in nguyen_am_to_ids:
            continue
        if last_vowel_pos != -1 and pos - last_vowel_pos != 1:
            return False
        last_vowel_pos = pos
    return True

def chuan_hoa_dau_cau_tieng_viet(sentence):
    """
        Normalise a Vietnamese sentence to the old-style tone-mark placement.
        The sentence is lower-cased, split on whitespace, and each word is
        passed through chuan_hoa_dau_tu_tieng_viet.
        :param sentence: text to normalise.
        :return: the normalised words joined by single spaces.
        """
    lowered = sentence.lower()
    return ' '.join(chuan_hoa_dau_tu_tieng_viet(w) for w in lowered.split())


In [40]:
def lower_case(data):
    """Return ``data`` converted to lower case via its ``lower()`` method."""
    lowered = data.lower()
    return lowered

def remove_dub(sentence):
    """Collapse each run of a repeated ASCII letter to its first occurrence.

    Matching ignores case, so ``"Aab"`` becomes ``"Ab"``. Digits, punctuation
    and non-ASCII letters are left alone. Legitimate double letters are
    collapsed as well (``"Elliott"`` becomes ``"Eliot"``).

    :param sentence: text to process.
    :return: the text with repeated letters collapsed.
    """
    return re.sub(r'([A-Z])\1+', r'\1', sentence, flags=re.IGNORECASE)

In [41]:
# Load the small English spaCy pipeline; the visible code uses it to tokenise text and flag emoji.
nlp = en_core_web_sm.load()
# NOTE(review): this instance is not referenced again in the visible code - the pipe is added by name on the next line.
emoji = Emoji(nlp)
# Add the 'emoji' component at the front of the pipeline; remove_emoji reads token._.is_emoji.
nlp.add_pipe('emoji', first=True)

def remove_emoji(text):
    """Return ``text`` with every emoji token replaced by a single space.

    The text is run through the module-level ``nlp`` pipeline. Tokens flagged
    by ``token._.is_emoji`` become ``' '``; every other token is emitted with
    its original trailing whitespace.

    :param text: text to clean.
    :return: the text without emoji tokens.
    """
    pieces = []
    for token in nlp(text):
        pieces.append(' ' if token._.is_emoji else token.text_with_ws)
    return ''.join(pieces)

In [42]:
# Load the teencode lookup table: a tab-separated file read into two columns
# that are named `teencode` and `map` here.
teencode_data = pd.read_csv('/content/drive/MyDrive/BA documents/DBA/Data/teencode.txt',names=['teencode','map'],sep='\t',)
# Parallel lists: teencode_list[i] is replaced by map_list[i] in searchTeencode.
teencode_list = teencode_data['teencode'].to_list()
map_list = teencode_data['map'].to_list()

In [43]:
def searchTeencode(word):
    """Look up ``word`` in the teencode table and return its replacement.

    On a hit the module-level ``teencode_count`` is incremented.

    :param word: a single token.
    :return: the entry of ``map_list`` at the position where ``word`` occurs
        in ``teencode_list``, or ``None`` when the token is not in the table.
    """
    global teencode_count
    try:
        index = teencode_list.index(word)
    except ValueError:
        # list.index raises ValueError for a missing item: not a teencode word.
        # Only this error is handled so that real bugs are no longer hidden.
        return None
    teencode_count += 1
    return map_list[index]

# Module-level counters; teencode_count is incremented by searchTeencode.
stopword_count = 0
teencode_count = 0


def Teencode(sentence):
    """Replace teencode tokens in ``sentence`` with their mapped form.

    The sentence is tokenised with ``word_tokenize``; every token for which
    ``searchTeencode`` returns a replacement is swapped for it.

    :param sentence: text to process.
    :return: the tokens, after replacement, joined by single spaces.
    """
    # Collapse any word starting with "ok" (e.g. "okay", "oki") to plain "ok".
    sentence = re.sub(r'\bok\w*\b', 'ok', sentence)
    # Tokenise once. The original also made a format='text' call whose result
    # was overwritten immediately, i.e. a wasted tokenisation per document.
    tokens = word_tokenize(sentence)

    for idx, token in enumerate(tokens):
        replacement = searchTeencode(token)
        if replacement is not None:
            tokens[idx] = replacement

    return " ".join(tokens)


In [44]:

def text_preprocess(document):
    """Run the full cleaning pipeline on one raw document.

    Steps, in order: Unicode normalisation, special-character removal, emoji
    removal, lower-casing, collapsing of repeated letters, tone-mark placement
    normalisation, teencode replacement, word segmentation, removal of any
    remaining unwanted characters, and whitespace collapsing.

    :param document: raw text.
    :return: the cleaned text, stripped of leading and trailing whitespace.
    """
    # Normalise Unicode FIRST. The special-character filter below keeps only
    # \w, \s and a few listed characters; free-standing combining tone marks
    # are not \w, so decomposed text would lose its tone marks before they
    # could be composed if this step ran later.
    document = convert_unicode(document)
    # Remove special characters.
    document = remove_special_characters(document)
    # Remove emoji.
    document = remove_emoji(document)
    # Lower-case the text.
    document = document.lower()
    # Collapse deliberately elongated letters.
    document = remove_dub(document)
    # Normalise Vietnamese tone-mark placement.
    document = chuan_hoa_dau_cau_tieng_viet(document)
    # Replace teencode tokens.
    document = Teencode(document)
    # Word segmentation.
    document = ViTokenizer.tokenize(document)
    # Drop any remaining unwanted characters.
    document = re.sub(r'[^\s\w√°√†·∫£√£·∫°ƒÉ·∫Ø·∫±·∫≥·∫µ·∫∑√¢·∫•·∫ß·∫©·∫´·∫≠√©√®·∫ª·∫Ω·∫π√™·∫ø·ªÅ·ªÉ·ªÖ·ªá√≥√≤·ªè√µ·ªç√¥·ªë·ªì·ªï·ªó·ªô∆°·ªõ·ªù·ªü·ª°·ª£√≠√¨·ªâƒ©·ªã√∫√π·ªß≈©·ª•∆∞·ª©·ª´·ª≠·ªØ·ª±√Ω·ª≥·ª∑·ªπ·ªµƒë_]',' ',document)
    # Collapse runs of whitespace and trim the ends.
    document = re.sub(r'\s+', ' ', document).strip()
    return document

In [45]:
document = 'm√°y ƒë·∫πp xu·∫•t s·∫Øc, mua onl m·ªói t·ªôi tiki h√¥k b√°n k√®m c∆∞·ªùng l·ª±c v√† ·ªëp hjhj =))))) m√°y ƒë·∫πp l·ª©m n√™n mua nka ü•∫üòªüòΩ'
document = text_preprocess(document)
print(document)

m√°y ƒë·∫πp xu·∫•t_s·∫Øc mua online m·ªói t·ªôi tiki kh√¥ng b√°n k√®m c∆∞·ªùng_l·ª±c v√† ·ªëp hi_hi m√°y ƒë·∫πp l·ª©m n√™n mua nha


In [46]:
# Run the cleaning pipeline on every document and store the result in a new `processed` column.
data['processed'] = data['content'].apply(text_preprocess)

In [47]:
# Hand-picked stopword list consulted by remove_stopwords.
# NOTE(review): this rebinds the name `stopwords` imported from nltk.corpus at the top of the file.
stopwords = ['c√≥', 'th√¨', 'n√™n', 'v√†', 'l√†', 'n√†y', 'b·ªã', 'c≈©ng', 'cho', 'ƒë√£', 'c·ªßa', 'm√†', 'l√™n', 'nh∆∞', 'v·ªõi', 's·∫Ω', 'l·∫°i', 'c√≤n', 'lu√¥n', 'r·ªìi', 'nha', 'nh√©', '·∫°', '√†', '∆°i', 'm√¨nh', 'v·∫´n', 'v·ªÅ', 'v√¨', '*', 'ki·ªÉu', 'th·∫•y', '√°', 'l·∫Øm']

In [48]:
def word_segmentation(x):
    """Tokenise ``x`` with ``word_tokenize`` and return its result unchanged."""
    return word_tokenize(x)

def stopword_find(x):
    """Count how often each element of ``x`` occurs.

    :param x: iterable of hashable items, e.g. a token list.
    :return: a ``collections.Counter`` mapping each item to its frequency.
    """
    counts = Counter()
    counts.update(x)
    return counts

def remove_stopwords(line):
    """Return the tokens of ``line`` that are not in the module-level ``stopwords``.

    :param line: iterable of tokens.
    :return: a new list with the remaining tokens in their original order.
    """
    return [word for word in line if word not in stopwords]
def detokenize(tokens):
    """Join ``tokens`` into one string, separated by single spaces.

    :param tokens: iterable of strings.
    :return: the joined text (empty string for no tokens).
    """
    separator = ' '
    return separator.join(tokens)

In [49]:
# Tokenise the cleaned text into a list of tokens per document.
data["tokenized_text"]=data["processed"].apply(word_segmentation)
# Drop the tokens that appear in the stopword list.
data["tokenized_text"]=data["tokenized_text"].apply(remove_stopwords)
# Overwrite `processed` with the remaining tokens joined by single spaces.
data['processed'] = data['tokenized_text'].apply(detokenize)

In [50]:
data['processed']

0        v·ªãt m·ªëc b√¨nh ƒë·ªãnh v·ªãt m·ªëc gi·ªëng v·ªãt h∆∞·ªõng tr·ª©n...
1        eliot management corporation eliot management ...
2        parma mculochi parma mculochi m·ªôt lo√†i c√° bi·ªÉn...
3        d∆∞∆°ng quang th√†nh d∆∞∆°ng quang th√†nh sinh ng√†y ...
4        sergey vadimovich stepashin sergei vadimovich ...
                               ...                        
10001    h·ªì th·ªã thanh h·ªìng h·ªì th·ªã thanh h·ªìng sinh 25 th...
10002    v√≤ng chung_k·∫øt gi·∫£i b√≥ng_ƒë√° trong nh√† v√¥_ƒë·ªãch ...
10003    sarmizegetusa regia sarmizegetusa regia l√†sarm...
10004    chi√™n ng·∫≠p d·∫ßu chi√™n ng·∫≠p d·∫ßu m·ªôt ph∆∞∆°ng_ph√°p ...
10005    takabuti takabuti m·ªôt ph·ª•_n·ªØ k·∫øt_h√¥n ƒë·∫°t ƒë·∫øn ƒë...
Name: processed, Length: 10006, dtype: object

In [None]:
# Keep only the rows that have cleaned text. The frame has no
# `transcript_processed` column (its columns are file, content, processed and
# tokenized_text), so the filter must use `processed`; the old name raised KeyError.
data = data[data['processed'].notnull()]

In [51]:
data.head()

Unnamed: 0,file,content,processed,tokenized_text
0,ViÃ£t MoÃÇÃÅc BiÃÄnh ƒêiÃ£nh.txt,V·ªãt M·ªëc B√¨nh ƒê·ªãnh\n\nV·ªãt M·ªëc l√† gi·ªëng v·ªãt h∆∞·ªõn...,v·ªãt m·ªëc b√¨nh ƒë·ªãnh v·ªãt m·ªëc gi·ªëng v·ªãt h∆∞·ªõng tr·ª©n...,"[v·ªãt, m·ªëc, b√¨nh ƒë·ªãnh, v·ªãt, m·ªëc, gi·ªëng, v·ªãt, h∆∞..."
1,Elliott Management Corporation.txt,Elliott Management Corporation\n\nElliott Mana...,eliot management corporation eliot management ...,"[eliot, management, corporation, eliot, manage..."
2,Parma mccullochi.txt,Parma mccullochi\n\nParma mccullochi l√† m·ªôt lo...,parma mculochi parma mculochi m·ªôt lo√†i c√° bi·ªÉn...,"[parma, mculochi, parma, mculochi, m·ªôt, lo√†i, ..."
3,DuÃõoÃõng Quang ThaÃÄnh.txt,D∆∞∆°ng Quang Th√†nh\n\nD∆∞∆°ng Quang Th√†nh (sinh n...,d∆∞∆°ng quang th√†nh d∆∞∆°ng quang th√†nh sinh ng√†y ...,"[d∆∞∆°ng quang, th√†nh, d∆∞∆°ng quang, th√†nh, sinh,..."
4,Sergey Vadimovich Stepashin.txt,Sergey Vadimovich Stepashin\n\nSergei Vadimovi...,sergey vadimovich stepashin sergei vadimovich ...,"[sergey, vadimovich, stepashin, sergei, vadimo..."


In [52]:
# Write the cleaned frame to CSV without the index column; 'utf-8-sig' prefixes the file with a BOM.
data.to_csv('/content/drive/MyDrive/speech/RAG/clean_csv.csv', index=False, encoding = 'utf-8-sig')