In [1]:
import csv
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [3]:
# Chromium browser and WebDriver paths (relative to the notebook's working directory)
chromium_binary_path = 'chrome-win64\\chrome.exe'
chromium_driver_path = 'chromedriver-win64\\chromedriver.exe'

# Selenium WebDriver configuration shared by every cell that launches a browser
service = Service(executable_path=chromium_driver_path)
options = webdriver.ChromeOptions()
options.binary_location = chromium_binary_path

# Command-line switches passed to the browser
options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')
options.add_argument('--ignore-certificate-errors')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument('--disable-features=AutoUpdate')
options.add_argument('--allow-running-insecure-content')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36")

# 'excludeSwitches' has to be set in ONE call: add_experimental_option keeps a
# single value per option name, so a second call with the same name replaces
# the first list (previously 'enable-logging' was lost this way).
options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
options.add_experimental_option('useAutomationExtension', False)

In [None]:
# Name of the CSV file used by the hashtag-collection step
input_file_name = 'tiktok_data_head.csv'

# Search terms to collect: "khach san " followed by each destination, in order
hashtags = [
    "khach san " + place
    for place in (
        "ha long", "ha noi", "hoi an", "ho chi minh",
        "da nang", "sapa", "ninh binh", "da lat",
        "phu quoc", "hue", "mui ne", "nha trang",
        "quy nhon", "can tho", "phong nha", "con dao",
        "cao bang", "mai chau", "tay ninh", "vung tau",
        "ha giang", "phan thiet",
    )
]

# Function to scroll the page to load more videos
def scroll_page(driver, scroll_count=1, pause_seconds=10):
    """Scroll to the bottom of the page repeatedly so that more content can load.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium WebDriver
            in this notebook).
        scroll_count: Maximum number of scroll attempts.
        pause_seconds: Seconds to wait after each scroll before the page height
            is measured again. Defaults to 10, the previously hard-coded value.

    Scrolling stops early as soon as a scroll leaves ``document.body.scrollHeight``
    unchanged, i.e. nothing new was appended to the page during the pause.
    """
    height_script = "return document.body.scrollHeight"
    last_height = driver.execute_script(height_script)
    for _ in range(scroll_count):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            # Height did not grow: no further content was loaded
            break
        last_height = new_height

# Open the CSV file for writing, including the header
with open(input_file_name, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # Header row. Note: the searched hashtag itself is NOT stored, only the
    # hashtags found in each video's caption.
    writer.writerow(['Video URL', 'Title', 'Hashtags'])

    # Loop through each hashtag
    for hashtag in hashtags:
        print(f"Collecting data for #{hashtag}...")

        # A fresh browser is launched for every hashtag page
        driver = webdriver.Chrome(service=service, options=options)
        skipped = 0
        try:
            driver.get(f"https://www.tiktok.com/tag/{hashtag}?lang=vi-VN")
            time.sleep(10)  # Wait for the page to load

            # Scroll the page to load more videos
            scroll_page(driver, scroll_count=13)  # Adjust scroll count as needed

            # Find all video links on the hashtag page
            elements = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/video/"]')

            for element in elements:
                try:
                    video_url = element.get_attribute('href')
                    # The caption is read from the thumbnail image's alt text
                    full_title = element.find_element(By.CSS_SELECTOR, 'img').get_attribute('alt')
                    # Title = text before the first '#'. maxsplit is passed by
                    # keyword; the positional form is deprecated since Python 3.13.
                    title = re.split(r'#', full_title, maxsplit=1)[0].strip()
                    hashtags_list = ' '.join(re.findall(r"#\w+", full_title))
                    writer.writerow([video_url, title, hashtags_list])
                except Exception:
                    # Best-effort: a link without a usable thumbnail/caption is
                    # skipped, but counted so the loss is visible.
                    skipped += 1
                    continue
        finally:
            # Always close the browser, even if loading or scrolling failed
            driver.quit()

        if skipped:
            print(f"Skipped {skipped} video(s) that could not be parsed for #{hashtag}")
        print(f'Finished collecting #{hashtag}')

print(f"Finished collecting all hashtags! Data saved to {input_file_name}")

Collecting data for #khach san ha long...
Finished collecting #khach san ha long
Collecting data for #khach san ha noi...
Finished collecting #khach san ha noi
Collecting data for #khach san hoi an...
Finished collecting #khach san hoi an
Finished collecting all hashtags! Data saved to tiktok_data_head.csv


In [41]:
# Read the CSV file and display the data
# (pandas is imported once, in the notebook's import cell)
df = pd.read_csv("tiktok_data_head.csv")
df

Unnamed: 0,Video URL,Title,Hashtags
0,https://www.tiktok.com/@halongtattantat/video/...,Jade 3* khách sạn gần biển nhất Hạ long,#hạlongtấttầntật #hạlong #hạlongbay #hotel #kh...
1,https://www.tiktok.com/@nangdauquangninh/video...,• Khách sạn A La Carte Ha Long Bay giá commit,#khachsanalacarte #alacartehalong #khachsanhal...
2,https://www.tiktok.com/@qunh.hoa.h.long/video/...,"Trả lời @Phương 🍀 Chiếc khách sạn giá rẻ, dịc...",#halong #khachsan #khachsanhalong #reviewhalon...
3,https://www.tiktok.com/@xuananhhalong/video/72...,Khách sạn cách biển chỉ 150m Hạ Long,#khachsanhalong #dulichhalong #ReviewHaLong #v...
4,https://www.tiktok.com/@dulichha.long/video/73...,Chỉ 3️⃣0️⃣0️⃣🐠/ Người Có ngay khách sạn 3 sao...,#khachsan3sao #khachsanhalong #chil #nấmhalong...
...,...,...,...
144,https://www.tiktok.com/@xiuthuhai/video/736642...,Trả lời @Kielive chỉ với hơn 300 🐠 khách sạn ở...,#viral #betixiuriviu #fyp #danang
145,https://www.tiktok.com/@le.petit.villa.hoian/v...,"Khách Sạn Hội An chỉ 370k/2 người/ngày đêm, fr...",#hoian #khachsanhoian #lepetitvillahoian
146,https://www.tiktok.com/@locatravel.vn/video/74...,Chiếc Villa sân vườn ngay cạnh Phố Cổ cực xinh...,#locatravel #car24h #dulichhoian2024 #reviewho...
147,https://www.tiktok.com/@huongtalk/video/747720...,Khách sạn view đồng lúa 10-điểm-trọn-vẹn ở Hội An,#huongtalk #huongtalkreview #huongodau #reubou...


In [6]:
# Input CSV file name
input_file_name = 'tiktok_data.csv'

# Output column -> CSS selector of the counter element on a video page.
# Dict order defines the order in which the columns are added to each row.
metric_selectors = {
    'Likes': "strong[data-e2e='like-count']",
    'Comments': "strong[data-e2e='comment-count']",
    'Shares': "strong[data-e2e='share-count']",
    # NOTE(review): 'undefined-count' is the selector the original code used
    # for saves; confirm it still matches the live page markup.
    'Saves': "strong[data-e2e='undefined-count']",
}

# Read video rows from the CSV file
video_data = []
try:
    with open(input_file_name, 'r', encoding='utf-8') as csvfile:
        video_data = list(csv.DictReader(csvfile))
except FileNotFoundError:
    print(f"Error: File {input_file_name} not found")
    # exit() is an interactive-shell helper and is not guaranteed to halt the
    # cell; raising SystemExit stops here so the file is never (re)written below.
    raise SystemExit(1)

# Initialize the browser
driver = webdriver.Chrome(service=service, options=options)

try:
    # Browse each video URL to collect interaction data
    for row in video_data:
        url = row['Video URL']
        print(f"Collecting data from: {url}")
        driver.get(url)

        try:
            # Wait (up to 30 s each) for every counter element and read its text
            counts = {}
            for column, selector in metric_selectors.items():
                element = WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                counts[column] = element.text
        except Exception as e:
            print(f"Error collecting data from {url}: {e}")
            # All-or-nothing: if any counter is missing, mark every metric
            counts = dict.fromkeys(metric_selectors, 'Not found')

        # Store the collected counts on the row
        row.update(counts)
finally:
    # Close the browser even if a page load fails part-way through
    driver.quit()
print("Finished collecting interaction data!")

# Write updated data back to the same CSV file
with open(input_file_name, 'w', newline='', encoding='utf-8') as csvfile:
    # NOTE(review): the fallback header (used only when the input has no rows)
    # lists 'Views' and 'Engagement Rate', which this cell never fills in.
    fieldnames = video_data[0].keys() if video_data else ['Video URL', 'Title', 'Hashtags', 'Views', 'Likes', 'Comments', 'Shares', 'Saves', 'Engagement Rate']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(video_data)

print(f"Interaction data has been added to {input_file_name}")

Collecting data from: https://www.tiktok.com/@homnaodidi/video/7453084715533257992
Collecting data from: https://www.tiktok.com/@peanut_homestay/video/7398151412611321104
Collecting data from: https://www.tiktok.com/@kid_2509/video/7412277687949085959
Collecting data from: https://www.tiktok.com/@justfly.vn/video/7276324739923053842
Collecting data from: https://www.tiktok.com/@loveboxstaycation/video/7479356078933675282
Collecting data from: https://www.tiktok.com/@emhome.dn/video/7433402262027767056
Collecting data from: https://www.tiktok.com/@our.homestay/video/7323229083154861330
Collecting data from: https://www.tiktok.com/@_lianhomestay/video/7492390579830131986
Collecting data from: https://www.tiktok.com/@bemycoffee/video/7105617325952486683
Collecting data from: https://www.tiktok.com/@cadehehehe/video/7431187124054936840
Collecting data from: https://www.tiktok.com/@labonghouse_homestay/video/7384438262212889863
Collecting data from: https://www.tiktok.com/@_chanhome.homesta

In [7]:
# Read the CSV file and display the data
# (pandas is imported once, in the notebook's import cell)
df = pd.read_csv("tiktok_data.csv")
df

Unnamed: 0,Video URL,Title,Hashtags,Likes,Comments,Shares,Saves
0,https://www.tiktok.com/@homnaodidi/video/74530...,Staycation trong chính thành phố mà mình đang ...,#homnaodidi #hanoi #chillhome #homestay #homes...,3977,80,1903,2651
1,https://www.tiktok.com/@peanut_homestay/video/...,Homestay lãng mạng Sài Gòn chỉ từ 34k/h chia t...,#homestaysaigon #henho #peanuthomestay,5315,125,1640,2029
2,https://www.tiktok.com/@kid_2509/video/7412277...,Một chiếc homestay xinh xắn yên bình ẩn mình d...,#homestay #ninhbinh #dulich,114K,1513,17.5K,21.6K
3,https://www.tiktok.com/@justfly.vn/video/72763...,Top 5 homestay sống ảo siêu đỉnh tại Sóc Sơn,#justfly #checkin #songao #review #homestay #d...,1525,159,1969,680
4,https://www.tiktok.com/@loveboxstaycation/vide...,Hẹn hò riêng tư tại Lovebox,#homestay #loveboxhomestay #homestaysaigon #he...,11.1K,433,3237,3065
5,https://www.tiktok.com/@emhome.dn/video/743340...,"có một tình yêu nhẹ nhàng, một buổi hẹn hò bìn...",#homestaydanang #netflixandchill #reviewhomest...,5209,372,2946,2217
6,https://www.tiktok.com/@our.homestay/video/732...,"Địa điểm hẹn hò bí mật của tui và người iu, nơ...",#reviewhomestay #fypシ #viralvideo #xuhuong #xh...,52.4K,2586,11.3K,17.8K
7,https://www.tiktok.com/@_lianhomestay/video/74...,Tôy không cần gì hơn huhu,#reviewhomestayhanoi #lianhomestay #couple #yt...,1755,98,548,785
8,https://www.tiktok.com/@bemycoffee/video/71056...,Homestay ở Ninh Bình 🌸,#homestay #ninhbinh #foryou #fyp,40K,840,4612,6930
9,https://www.tiktok.com/@cadehehehe/video/74311...,tìm chỗ riêng tư để khóc? Đó là thứ mà Cade có,#homestay #homestaysaigon #cadesaigon #cade #c...,15K,114,1390,2443


In [None]:
# Input CSV file name
input_file_name = 'tiktok_data.csv'


def _dismiss_modal(driver):
    """Close the overlay popup by clicking a point outside its modal content."""
    modal_content = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "modal-content"))
    )
    # Position and size of the modal, used to keep the click outside of it
    location = modal_content.location
    size = modal_content.size
    window_size = driver.get_window_size()

    # Default click point: near the top-left corner of the window
    x_click = 10
    y_click = 10
    inside_modal = (
        location['x'] <= x_click <= location['x'] + size['width']
        and location['y'] <= y_click <= location['y'] + size['height']
    )
    if inside_modal:
        # The default point is covered by the modal: use the top-right corner
        x_click = window_size['width'] - 10

    actions = webdriver.ActionChains(driver)
    actions.move_by_offset(x_click, y_click).click().perform()
    actions.reset_actions()
    time.sleep(1)


def _extract_transcript(driver, url):
    """Submit ``url`` on the already-loaded transcript page and return the text.

    Returns 'Subtitles Not Available' when the page shows that message within
    10 seconds; otherwise returns the transcript pieces joined by spaces.
    Selenium errors (for example wait timeouts) propagate to the caller.
    """
    _dismiss_modal(driver)

    # Find input box and type the URL
    insert_link_input = WebDriverWait(driver, 30).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, 'textarea[placeholder="Enter Video Url"]'))
    )
    insert_link_input.click()
    insert_link_input.send_keys(url)

    # Click START button
    start_button = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "START")]'))
    )
    start_button.click()

    # Remove the checkbox's readonly attribute and click it via JavaScript
    checkbox = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type="checkbox"].mr-1'))
    )
    driver.execute_script("arguments[0].removeAttribute('readonly'); arguments[0].click();", checkbox)

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//p[contains(text(), 'Subtitles Not Available')]"))
        )
    except Exception:
        # The "not available" message did not show up: read the transcript.
        # (Exception rather than a bare except, so Ctrl-C still interrupts.)
        text_elements = WebDriverWait(driver, 30).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".text.hover\\:text-gray-200.text-xs.text-justify"))
        )
        return " ".join(element.text for element in text_elements)
    return 'Subtitles Not Available'


# Open the CSV and process
updated_data = []
driver = None  # created only after the input file has been validated
try:
    with open(input_file_name, 'r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        # Copy the header so the reader's own list is not mutated; an empty
        # file has no header (None), which is treated as "no columns".
        fieldnames = list(reader.fieldnames or [])

        if 'Video URL' not in fieldnames:
            print("Error: CSV file does not contain a 'Video URL' column.")
            # No browser exists yet, so there is nothing to quit here; raising
            # SystemExit stops the cell (exit() is not guaranteed to).
            raise SystemExit(1)

        if 'Transcript' not in fieldnames:
            fieldnames.append('Transcript')

        driver = webdriver.Chrome(service=service, options=options)

        for row in reader:
            url = row['Video URL'] + "?lang=vi-VN"
            print(f"Processing video: {url}")

            # The transcript page is reloaded for every video
            driver.get('https://script.tokaudit.io/')
            try:
                row['Transcript'] = _extract_transcript(driver, url)
            except Exception as e:
                # Keep the original fallback value, but report the failure so
                # errors are distinguishable from genuinely missing subtitles.
                print(f"Error processing {url}: {type(e).__name__}")
                row['Transcript'] = 'Subtitles Not Available'

            updated_data.append(row)
            time.sleep(3)

    # Write back to CSV (or rename if you want to keep the original)
    with open(input_file_name, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(updated_data)

    print(f"Transcript has been updated in file: {input_file_name}")

except FileNotFoundError:
    print(f"File not found: {input_file_name}")
except Exception as e:
    print(f"General error: {e}")
finally:
    # Only quit a browser that this cell actually started
    if driver is not None:
        driver.quit()
    print("Process completed.")

Processing video: https://www.tiktok.com/@homnaodidi/video/7453084715533257992?lang=vi-VN
Processing video: https://www.tiktok.com/@peanut_homestay/video/7398151412611321104?lang=vi-VN
Processing video: https://www.tiktok.com/@kid_2509/video/7412277687949085959?lang=vi-VN
Processing video: https://www.tiktok.com/@justfly.vn/video/7276324739923053842?lang=vi-VN
Processing video: https://www.tiktok.com/@loveboxstaycation/video/7479356078933675282?lang=vi-VN
Processing video: https://www.tiktok.com/@emhome.dn/video/7433402262027767056?lang=vi-VN
Processing video: https://www.tiktok.com/@our.homestay/video/7323229083154861330?lang=vi-VN
Processing video: https://www.tiktok.com/@_lianhomestay/video/7492390579830131986?lang=vi-VN
Processing video: https://www.tiktok.com/@bemycoffee/video/7105617325952486683?lang=vi-VN
Processing video: https://www.tiktok.com/@cadehehehe/video/7431187124054936840?lang=vi-VN
Processing video: https://www.tiktok.com/@labonghouse_homestay/video/73844382622128898

In [12]:
# Read the CSV file and display the data
# (pandas is imported once, in the notebook's import cell)
df = pd.read_csv("tiktok_data.csv")
df

Unnamed: 0,Video URL,Title,Hashtags,Likes,Comments,Shares,Saves,Transcript
0,https://www.tiktok.com/@homnaodidi/video/74530...,Staycation trong chính thành phố mà mình đang ...,#homnaodidi #hanoi #chillhome #homestay #homes...,3977,80,1903,2651,staycation trong chính thành phố mà mình đang ...
1,https://www.tiktok.com/@peanut_homestay/video/...,Homestay lãng mạng Sài Gòn chỉ từ 34k/h chia t...,#homestaysaigon #henho #peanuthomestay,5315,125,1640,2029,Subtitles Not Available
2,https://www.tiktok.com/@kid_2509/video/7412277...,Một chiếc homestay xinh xắn yên bình ẩn mình d...,#homestay #ninhbinh #dulich,114K,1513,17.5K,21.6K,Subtitles Not Available
3,https://www.tiktok.com/@justfly.vn/video/72763...,Top 5 homestay sống ảo siêu đỉnh tại Sóc Sơn,#justfly #checkin #songao #review #homestay #d...,1525,159,1969,680,hê lô hê lô hôm nay hãy cùng Chubb Life điểm q...
4,https://www.tiktok.com/@loveboxstaycation/vide...,Hẹn hò riêng tư tại Lovebox,#homestay #loveboxhomestay #homestaysaigon #he...,11.1K,433,3237,3065,Subtitles Not Available
5,https://www.tiktok.com/@emhome.dn/video/743340...,"có một tình yêu nhẹ nhàng, một buổi hẹn hò bìn...",#homestaydanang #netflixandchill #reviewhomest...,5209,372,2946,2217,Subtitles Not Available
6,https://www.tiktok.com/@our.homestay/video/732...,"Địa điểm hẹn hò bí mật của tui và người iu, nơ...",#reviewhomestay #fypシ #viralvideo #xuhuong #xh...,52.4K,2586,11.3K,17.8K,Subtitles Not Available
7,https://www.tiktok.com/@_lianhomestay/video/74...,Tôy không cần gì hơn huhu,#reviewhomestayhanoi #lianhomestay #couple #yt...,1755,98,548,785,eo ô kê vãi ê xinh nha ê nha ê ô kê la đấy các...
8,https://www.tiktok.com/@bemycoffee/video/71056...,Homestay ở Ninh Bình 🌸,#homestay #ninhbinh #foryou #fyp,40K,840,4612,6930,Subtitles Not Available
9,https://www.tiktok.com/@cadehehehe/video/74311...,tìm chỗ riêng tư để khóc? Đó là thứ mà Cade có,#homestay #homestaysaigon #cadesaigon #cade #c...,15K,114,1390,2443,Subtitles Not Available
