In [1]:
import pandas as pd
import requests
import re
import os

# **Use `reviewkhachsan500.csv` to determine the required columns**

In [3]:
# Load one export so that its column names can be inspected.
sample_csv_path = "../data_tiktok/reviewkhachsan500.csv"
df = pd.read_csv(sample_csv_path)

In [4]:
# Display the column names as a plain Python list.
list(df.columns)

['authorMeta/digg',
 'authorMeta/fans',
 'authorMeta/heart',
 'webVideoUrl',
 'isAd',
 'isSponsored',
 'playCount',
 'diggCount',
 'commentCount',
 'shareCount',
 'videoMeta/duration',
 'videoMeta/subtitleLinks/1/downloadLink',
 'text']

# **Process all files**



In [5]:
# Folder that holds the CSV exports to merge.
folder_path = '../data_tiktok/'

# Columns kept from every export; all other columns are discarded.
columns_to_keep = [
    'authorMeta/digg',
    'authorMeta/fans',
    'authorMeta/heart',
    'webVideoUrl',
    'isAd',
    'isSponsored',
    'playCount',
    'diggCount',
    'commentCount',
    'shareCount',
    'videoMeta/duration',
    'videoMeta/subtitleLinks/1/downloadLink',
    'text'
]
subtitle_column = 'videoMeta/subtitleLinks/1/downloadLink'

# os.listdir returns entries in arbitrary order. Sort them so the merged frame
# (and therefore any later "keep first" de-duplication) is the same on every run.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

processed_data = []

for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)
    file_df = pd.read_csv(file_path)

    # Skip exports that lack any required column instead of failing with a
    # KeyError halfway through the folder.
    missing_columns = [col for col in columns_to_keep if col not in file_df.columns]
    if missing_columns:
        print(f"Skipped file: {csv_file} (missing columns: {missing_columns})")
        continue

    # Keep only rows that have a subtitle link, restricted to the required columns.
    has_subtitle = file_df[subtitle_column].notnull()
    file_df = file_df.loc[has_subtitle, columns_to_keep]
    print(f"Loaded file: {csv_file}")
    processed_data.append(file_df)

if processed_data:
    final_df = pd.concat(processed_data, ignore_index=True)
    print(final_df.head(2))
else:
    # Always define final_df so the following cells do not fail with a
    # NameError when the folder is empty or no file qualifies.
    final_df = pd.DataFrame(columns=columns_to_keep)
    print("No valid CSV files found in the folder.")

Loaded file: homestay_latest.csv
Loaded file: homestay_oldest.csv
Loaded file: homestay_popular.csv
Loaded file: hotelcantho210.csv
Loaded file: hoteldaklak100.csv
Loaded file: hoteldalat265.csv
Loaded file: hoteldanang159.csv
Loaded file: hotelgiare209.csv
Loaded file: hotelhagiang230.csv
Loaded file: hotelhaiphong214.csv
Loaded file: hotelhalong152.csv
Loaded file: hotelhanoi234.csv
Loaded file: hotelhoian234.csv
Loaded file: hotelhue234.csv
Loaded file: hotelluxury207.csv
Loaded file: hotelnhatrang262.csv
Loaded file: hotelninhthuan260.csv
Loaded file: hotelphuquoc234.csv
Loaded file: hotelphuquy170.csv
Loaded file: hotelquan1-184.csv
Loaded file: hotelquangbinh238.csv
Loaded file: hotelsaigon190.csv
Loaded file: hoteltuyenquang210.csv
Loaded file: hotelvungtau158.csv
Loaded file: khachsan162.csv
Loaded file: khachsan180lastest.csv
Loaded file: resort1.csv
Loaded file: resort2.csv
Loaded file: resort3.csv
Loaded file: reviewhomestay_latest.csv
Loaded file: reviewhomestay_oldest.csv


In [6]:
# Keep only the first row for each distinct video URL.
dedup_key = ['webVideoUrl']
final_df = final_df.drop_duplicates(subset=dedup_key, keep='first')

In [7]:
# Print the row count, column dtypes and non-null counts of final_df.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1281 entries, 0 to 2260
Data columns (total 13 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   authorMeta/digg                         1281 non-null   int64 
 1   authorMeta/fans                         1281 non-null   int64 
 2   authorMeta/heart                        1281 non-null   int64 
 3   webVideoUrl                             1281 non-null   object
 4   isAd                                    1281 non-null   bool  
 5   isSponsored                             1281 non-null   bool  
 6   playCount                               1281 non-null   int64 
 7   diggCount                               1281 non-null   int64 
 8   commentCount                            1281 non-null   int64 
 9   shareCount                              1281 non-null   int64 
 10  videoMeta/duration                      1281 non-null   int64 
 11  videoMeta

# **Change the subtitle link suffix to the `-vie-VN` pattern**

In [8]:
def fix_subtitle_link(url):
    """Point a subtitle download link at its ``-vie-VN`` variant.

    If ``url`` does not already end with ``-vie-VN``, everything after the
    last hyphen is replaced by ``vie-VN``. A string without any hyphen simply
    gets ``-vie-VN`` appended.

    Args:
        url: Subtitle download link. Non-string values (e.g. ``None`` or NaN
            from a missing cell) and empty strings are returned unchanged
            instead of raising or producing the bogus link ``"-vie-VN"``.

    Returns:
        The rewritten link, or ``url`` itself when there is nothing to rewrite.
    """
    # Missing or empty links cannot be rewritten; pass them through untouched.
    if not isinstance(url, str) or not url:
        return url

    if url.endswith('-vie-VN'):
        return url

    # NOTE(review): only the last hyphen-separated segment is replaced, so a
    # two-part suffix such as "-eng-US" would become "-eng-vie-VN" — confirm
    # the suffix format of the incoming links.
    prefix = url.rsplit('-', 1)[0]
    return prefix + '-vie-VN'

# Rewrite every subtitle link in place via fix_subtitle_link.
subtitle_link_col = 'videoMeta/subtitleLinks/1/downloadLink'
final_df[subtitle_link_col] = final_df[subtitle_link_col].apply(fix_subtitle_link)

In [9]:
# Sequential ids "video01", "video02", ... (zero-padded to at least two digits).
video_ids = ['video' + str(i).zfill(2) for i in range(1, len(final_df) + 1)]

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail when run a second time; overwrite the column in that case.
if 'id' in final_df.columns:
    final_df['id'] = video_ids
else:
    final_df.insert(0, 'id', video_ids)


# **Extract the video script from the subtitle files**

In [11]:
# A WebVTT timestamp; the hours part is optional ("00:01.000" or "00:00:01.000").
_VTT_TIMESTAMP = r'(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}'
# A cue timing line, optionally followed by cue settings such as "align:start".
_VTT_CUE_TIMING_RE = re.compile(
    '^' + _VTT_TIMESTAMP + r'[ \t]+-->[ \t]+' + _VTT_TIMESTAMP + r'(?:[ \t].*)?$'
)


def _strip_vtt_markup(text):
    """Return the caption lines of a WebVTT document joined by newlines."""
    cleaned_lines = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # The header is "WEBVTT", optionally followed by whitespace and a description.
        if line == "WEBVTT" or line.startswith(("WEBVTT ", "WEBVTT\t")):
            continue
        if _VTT_CUE_TIMING_RE.match(line):
            continue
        cleaned_lines.append(line)
    return "\n".join(cleaned_lines)


def clean_vtt_from_url(url, timeout=30):
    """Download a WebVTT subtitle file and return only its caption text.

    Args:
        url: Address of the subtitle file.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The caption lines joined by newlines, with the ``WEBVTT`` header, cue
        timing lines and blank lines removed. Returns the sentinel string
        ``"__NOT_FOUND__"`` when the server reports ``record-not-found``,
        answers with an HTTP error status, or the request fails for any reason.
    """
    try:
        response = requests.get(url, timeout=timeout)
        content_type = response.headers.get("Content-Type", "")

        if "application/json" in content_type:
            try:
                json_data = response.json()
            except ValueError:
                # Body is not valid JSON despite the header; treat it as text below.
                json_data = None

            error = json_data.get("error") if isinstance(json_data, dict) else None
            if isinstance(error, dict) and error.get("type") == "record-not-found":
                return "__NOT_FOUND__"

        # Any other error response carries an error page, not subtitles; do not
        # store its body as the video script.
        if response.status_code >= 400:
            print(f"Error processing URL {url}: HTTP {response.status_code}")
            return "__NOT_FOUND__"

        # utf-8-sig drops a leading byte-order mark if the file has one.
        text = response.content.decode('utf-8-sig', errors='replace')
        return _strip_vtt_markup(text)

    except Exception as e:
        # Best effort: a single failing download must not abort the whole run.
        print(f"Error processing URL {url}: {e}")
        return "__NOT_FOUND__"

# Download and clean the subtitles of every video. Results are collected in a
# list and assigned in one step afterwards: the frame is not modified while it
# is being iterated, and the 'script_video' column exists even when no
# subtitle could be fetched.
indexes_to_drop = []
script_texts = []

for index, video_id, subtitle_url in zip(
    final_df.index,
    final_df['id'],
    final_df['videoMeta/subtitleLinks/1/downloadLink'],
):
    cleaned_text = clean_vtt_from_url(subtitle_url)

    # "__NOT_FOUND__" is the sentinel clean_vtt_from_url returns on failure.
    if cleaned_text == "__NOT_FOUND__":
        print(f"No subtitle found for video_id: {video_id}, will drop this row.")
        indexes_to_drop.append(index)
        script_texts.append(None)  # placeholder; this row is dropped below
        continue

    script_texts.append(cleaned_text)
    print(f"Processed script for video_id: {video_id}")

final_df['script_video'] = script_texts

# Remove the videos without subtitles and renumber the index from 0.
final_df = final_df.drop(indexes_to_drop).reset_index(drop=True)

Processed script for video_id: video01
Processed script for video_id: video02
Processed script for video_id: video03
Processed script for video_id: video04
Processed script for video_id: video05
Processed script for video_id: video06
Processed script for video_id: video07
Processed script for video_id: video08
Processed script for video_id: video09
Processed script for video_id: video10
No subtitle found for video_id: video11, will drop this row.
Processed script for video_id: video12
Processed script for video_id: video13
Processed script for video_id: video14
Processed script for video_id: video15
No subtitle found for video_id: video16, will drop this row.
Processed script for video_id: video17
Processed script for video_id: video18
Processed script for video_id: video19
Processed script for video_id: video20
Processed script for video_id: video21
Processed script for video_id: video22
Processed script for video_id: video23
Processed script for video_id: video24
No subtitle found fo

In [12]:
# Regenerate the ids as "video01", "video02", ... from the current row count.
row_count = len(final_df)
final_df['id'] = [f"video{position:02d}" for position in range(1, row_count + 1)]

In [14]:
# NOTE(review): `index=False` is not passed, so the DataFrame index is written
# as an extra unnamed first column — confirm downstream readers expect that.
output_csv_path = "fulldata10271.csv"
final_df.to_csv(output_csv_path)