In [23]:
import pandas as pd
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# Assuming the CSV file is named 'tweets.csv' and it has a column named 'tweet-body-text'
# Since the actual file is not available, this is a demonstration of how the code would look

try:
    tweets_df = pd.read_csv('tweets.csv', encoding='latin-1')
except UnicodeDecodeError as e:
    print("Error reading file:", e)

In [24]:
# nltk.download('stopwords')
# Get English stopwords from NLTK
stop_words = set(stopwords.words('english'))

# Assuming 'tweets_df' is your DataFrame after reading the CSV
tweets = tweets_df['tweet-body-text'].tolist()
all_tweets = ' '.join(tweets)
clean_tweets = re.sub(r'[^\w\s]', '', all_tweets.lower())  # Remove punctuation
clean_tweets = re.sub(r'\d+', '', clean_tweets)  # Remove numbers
words = clean_tweets.split()

# Filter out stop words
filtered_words = [word for word in words if word not in stop_words]

# Count the frequency of each word
word_counts = Counter(filtered_words)

# Get the top 10 most common words
top_30 = word_counts.most_common(50)

top_30

[('mat', 1555),
 ('movie', 1540),
 ('kilau', 1509),
 ('tu', 455),
 ('tak', 439),
 ('ni', 408),
 ('aku', 388),
 ('tengok', 302),
 ('nak', 256),
 ('yang', 230),
 ('yg', 227),
 ('dia', 223),
 ('je', 216),
 ('pun', 215),
 ('la', 197),
 ('dah', 180),
 ('ada', 171),
 ('tgk', 164),
 ('netflix', 162),
 ('tapi', 155),
 ('melayu', 153),
 ('best', 142),
 ('filem', 140),
 ('ke', 135),
 ('orang', 132),
 ('kalau', 125),
 ('buat', 121),
 ('movies', 121),
 ('lagi', 120),
 ('like', 120),
 ('kat', 118),
 ('cerita', 117),
 ('boleh', 114),
 ('dan', 111),
 ('org', 108),
 ('apa', 106),
 ('lah', 106),
 ('pasal', 106),
 ('kau', 100),
 ('mcm', 97),
 ('suka', 91),
 ('macam', 90),
 ('lain', 87),
 ('dari', 85),
 ('watch', 82),
 ('bukan', 82),
 ('sejarah', 80),
 ('itõs', 79),
 ('ramai', 79),
 ('kita', 78)]

# Translate tweet

In [61]:
df.head(5)

Unnamed: 0,tweet-avatar-link href,tweet-avatar src,tweet-header-name,tweet-header-handle,tweet-time,tweet-time href,tweet-translate-after,tweet-reply-to,tweet-reply-to 2,tweet-reply-to href,...,tweet-interact-reply,tweet-interact-retweet,tweet-interact-favorite,tweet-interact-more-menu-follow,tweet-interact-more-menu-block,tweet-interact-more-menu-mute-user,tweet-self-thread-button,tweet-self-thread-button href,emoji src,Unnamed: 21
0,https://twitter.com/zackiel1528645,https://pbs.twimg.com/profile_images/167884799...,zackiel,@zackiel1528645,2d,https://twitter.com/zackiel1528645/status/1722...,View translation,Replying to,@zamirmohyedin,https://twitter.com/zamirmohyedin,...,1,0,0,Follow @zackiel1528645,Block @zackiel1528645,Mute @zackiel1528645,Show thread,https://twitter.com/zamirmohyedin/status/17211...,,
1,https://twitter.com/Kalli_Marii,https://pbs.twimg.com/profile_images/153259792...,Kallimari,@Kalli_Marii,5d,https://twitter.com/Kalli_Marii/status/1721168...,View translation,,,,...,0,0,1,Follow @Kalli_Marii,Block @Kalli_Marii,Mute @Kalli_Marii,Show thread,https://twitter.com/Kalli_Marii/status/1721166...,https://cdn.jsdelivr.net/gh/twitter/twemoji@14...,
2,https://twitter.com/naadds,https://pbs.twimg.com/profile_images/114570151...,nadiah,@naadds,5d,https://twitter.com/naadds/status/172145389339...,,,,,...,0,0,0,Follow @naadds,Block @naadds,Mute @naadds,,,,
3,https://twitter.com/neexxzz,https://pbs.twimg.com/profile_images/170071843...,Nizz,@neexxzz,5d,https://twitter.com/neexxzz/status/17215025265...,,,,,...,2,0,3,Follow @neexxzz,Block @neexxzz,Mute @neexxzz,,,,
4,https://twitter.com/Kodeykodey1,https://pbs.twimg.com/profile_images/150656071...,Alterafro,@Kodeykodey1,November 4,https://twitter.com/Kodeykodey1/status/1720627...,View translation,Replying to,@thepatriotsasia,https://twitter.com/thepatriotsasia,...,0,0,2,Follow @Kodeykodey1,Block @Kodeykodey1,Mute @Kodeykodey1,Show thread,https://twitter.com/thepatriotsasia/status/172...,,


In [17]:
'tweet-body-translated'from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd

# Connect to existing Chrome session
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "localhost:9222")
driver = webdriver.Chrome(options=chrome_options)

# Start from a specific row (set this variable as needed)
start_from_row = 0

# Read the Excel file
df = pd.read_excel(f'tweets.xlsx')

# Initialize the 'tweet-translated' column with False
df['tweet-translated'] = False

# Set up a WebDriverWait instance for handling waits
wait = WebDriverWait(driver, 10)

# Iterate through DataFrame rows
for index, row in df.iterrows():
    if index < start_from_row:
        continue  # Skip to the starting row
    try:
        # Check if 'tweet-translate-after' is a string and contains 'View translation'
        if isinstance(row['tweet-translate-after'], str) and 'View translation' in row['tweet-translate-after']:
            # Navigate to the tweet URL
            driver.get(row['tweet-time href'])

            # Wait for the page to load sufficiently
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'tweet-body-main')))

            # Click the 'View translation' button
            translate_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'tweet-translate')))
            translate_button.click()

            try:
                # Wait for the translation to appear with a timeout
                translated_text_element = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'tweet-translated-text')), message="Translation not found within timeout")
                translated_text = translated_text_element.text
                
                # Write the translated tweet to 'tweet-body-translated' row
                df.at[index, 'tweet-body-translated'] = translated_text
                
                df.at[index, 'tweet-translated'] = True  # Indicate that this tweet has been translated
            except TimeoutException:
                print(f"Translation not found for tweet at row {index}")
        else:
            df.at[index, 'tweet-body-translated'] = df.at[index, 'tweet-body-text']
            
    except Exception as e:
        print(f"Error at row {index}: {e}")
        # Save the DataFrame state on error
        df.to_excel(f'error_at_row_{index}.xlsx', index=False)
        # Continue to the next row on error

# Close the driver
driver.quit()

# Save the DataFrame to a new Excel file
df.to_excel('translated_tweets.xlsx', index=False)


Error at row 1057: Message: 
Stacktrace:
0   chromedriver                        0x0000000102952004 chromedriver + 4169732
1   chromedriver                        0x0000000102949ff8 chromedriver + 4136952
2   chromedriver                        0x000000010259f500 chromedriver + 292096
3   chromedriver                        0x00000001025e47a0 chromedriver + 575392
4   chromedriver                        0x000000010261f818 chromedriver + 817176
5   chromedriver                        0x00000001025d85e8 chromedriver + 525800
6   chromedriver                        0x00000001025d94b8 chromedriver + 529592
7   chromedriver                        0x0000000102918334 chromedriver + 3932980
8   chromedriver                        0x000000010291c970 chromedriver + 3950960
9   chromedriver                        0x0000000102900774 chromedriver + 3835764
10  chromedriver                        0x000000010291d478 chromedriver + 3953784
11  chromedriver                        0x00000001028f2ab4 chr

In [23]:
# df.to_excel(f'update_tweets.xlsx', index=False)
df.iloc[0]["tweet-body-text"]

'Ajak rakyat boikot bakal movie2 boxoffice yg akan keluar tidak lama lagi. Jangan lupa industri movie juga penyumbang terbesar mereka apa lagi film Wonder woman terang2 pelakun bangsa yahudi. Cuba lihat film di negara Mat Kilau boxoffice di negara kita sahaja dah berapa kutipan.'