In [25]:
import pandas as pd
import os
import copy
import re
import ast

from tqdm.notebook import tqdm
tqdm.pandas()

import urllib.request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import random
import itertools
import nltk
# nltk.download('wordnet')

import string
import enchant
import swifter

In [26]:
DATE = '07052023'
tweets_file_name = 'all_tweets_from_last_7_days_{}_with_sentiment_emotion.csv'.format(DATE)

# Read data

In [27]:
# Read the data
posts = pd.read_csv(tweets_file_name, nrows=None)

In [28]:
if 'tokens' in posts.columns:
    tokens_list = [ast.literal_eval(pp) for pp in posts['tokens'].values.tolist()]
    posts["tokens"] = tokens_list

In [29]:
posts["id"] = posts["id"].astype(str)
posts["author_id"] = posts["author_id"].astype(str)

# Extract URLs from text

In [30]:
URL_REGEX = r'(https?://\S+)'

In [31]:
def extract_url_from_text(text):
    urls = re.findall(URL_REGEX, text)
    return urls

In [32]:
# Example
text = """https://t.co/IAIl5xbttn What does ChatGPT return about human values? Exploring value bias in ChatGPT using a descriptive value theory. (arXiv:2304.03612v1 [https://t.co/HW5RVw5sac]) #NLProc"""

extract_url_from_text(text)

['https://t.co/IAIl5xbttn', 'https://t.co/HW5RVw5sac])']

In [33]:
posts["urls"] = posts["post"].progress_apply(lambda x: extract_url_from_text(x))

  0%|          | 0/295741 [00:00<?, ?it/s]

In [34]:
print("Non-empty extractec tiny URLs: ", posts[posts["urls"].map(len) > 0].shape[0])

Non-empty extractec tiny URLs:  197275


# Extract images from URL HTML 

In [35]:
import time 
 
import pandas as pd 
from selenium import webdriver 
from selenium.webdriver import Chrome 
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By 
from webdriver_manager.chrome import ChromeDriverManager

In [36]:
# start by defining the options 
options = webdriver.ChromeOptions() 
options.headless = True # it's more scalable to work in headless mode 
# normally, selenium waits for all resources to download 
# we don't need it as the page also populated with the running javascript code. 
options.page_load_strategy = 'none' 
# this returns the path web driver downloaded 
chrome_path = ChromeDriverManager().install() 
chrome_service = Service(chrome_path) 
# pass the defined options and service objects to initialize the web driver 
driver = Chrome(options=options, service=chrome_service) 
driver.implicitly_wait(5)

  options.headless = True # it's more scalable to work in headless mode


In [37]:
def extract_imgs_from_url(driver):
    img_links = []
    imgs = driver.find_elements(By.TAG_NAME, "img")
    for img in imgs:
        img_src = img.get_attribute("src")
        if '/media/' in img_src:
            img_links.append(img_src)
    return img_links

In [38]:
def scrape_from_url(url):
    time.sleep(random.uniform(0.5,1.1))
    try:
        driver.get(url)
        img_links = extract_imgs_from_url(driver)
        return img_links
    except Exception as e:
        #return ["ERROR :{}".format(e)]
        return []
    
def scrape_from_urls(urls):
    all_links = []
    if not urls:
        return all_links
    for url in urls:
        img_links = scrape_from_url(url)
        all_links.append(img_links)
    img_links_merged = list(itertools.chain.from_iterable(all_links))
    return img_links_merged

In [39]:
# Examples
url = "https://twitter.com/georgtanner/status/1647549133652361220/photo/1"
url = "https://twitter.com/JustIdeals/status/1634078983640981504"
imgs = scrape_from_url(url)

In [None]:
posts["imgs_urls"] = posts["urls"].progress_apply(lambda x: scrape_from_urls(x))

  0%|          | 0/295741 [00:00<?, ?it/s]

In [None]:
print("Non-empty image URLs: ", posts[posts["imgs_urls"].map(len) > 0].shape[0])

# Image to Text using pytesseract

In [92]:
import pytesseract
from PIL import Image
import io
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [93]:
print(pytesseract.get_languages(config='')) 

['eng', 'osd']


In [94]:
def img_to_text(img_url, lang="eng", timeout=10):
    time.sleep(random.uniform(0.2, 0.5))
    try:
        response = requests.get(img_url)
        img = Image.open(io.BytesIO(response.content))
        text = pytesseract.image_to_string(img, lang=lang, timeout=timeout)
        return text
    except Exception as e:
        # return "ERROR :{}".format(e)
        return ""

def imgs_to_texts(urls, lang="eng", timeout=10):
    all_texts = []
    for url in urls:
        text = img_to_text(url, lang=lang, timeout=timeout)
        all_texts.append(text)
    all_texts = list(set(all_texts))
    return all_texts

In [95]:
eng_dict = enchant.Dict("en_US")

SENTENCES_TO_IGNORE = ["Send a message..."]
CHATGPT_SCREENSHOT_PATTERN = r"ChatGPT [A-z][a-z][a-z] [1-9][0-9] Version"
TIME_PATTERN_REGEX = ""
def clean_img_text(img_text):
    source = 'Other'
    img_text = img_text.replace(r'\n', ' ').replace(r'\r', '').strip()
    img_text = re.sub(r'http\S+', '', img_text, flags=re.MULTILINE)
    img_text = re.sub(r'(\d+):(\d+)', '', img_text)
    
    chatgpt_patterns = re.search(CHATGPT_SCREENSHOT_PATTERN, img_text, flags=re.MULTILINE)
    if chatgpt_patterns:
        source = "ChatGPT"
        print(chatgpt_patterns[0])
        img_text = img_text.split(chatgpt_patterns[0])[0]
        
    img_text = ' '.join(word for word in img_text.split() if len(word.strip().translate(str.maketrans('', '', string.punctuation))) > 0 and eng_dict.check(word.strip().translate(str.maketrans('', '', string.punctuation))))

    for sent in SENTENCES_TO_IGNORE:
        if sent in img_text:
            img_text = img_text.replace(sent, '')
    return img_text, source

def clean_imgs_texts(img_texts):
    all_texts = []
    all_sources = []
    for text in img_texts:
        clean_text, source = clean_img_text(text)
        all_texts.append(clean_text)
        all_sources.append(source)
    return all_texts, all_sources

In [96]:
# Example
img_url1 = "https://pbs.twimg.com/media/Ft1FM8QaEAAXobl?format=jpg&name=medium"
img_url2 = "https://pbs.twimg.com/media/Fq1qLHtaUAAf0hM?format=jpg&name=large"
img_to_text(img_url2, lang="eng", timeout=10)
ex_te = imgs_to_texts([img_url1, img_url2])
ex_te

['10:25 oll S 14)\n\n@ chat.openai.com G\n\nRisks with Al Algorithms. +\n\nIt is true that algorithms can sometimes\nproduce correct results for the wrong\nreasons. This can happen if the data used to\ntrain the algorithm is biased or incomplete, or\nif the algorithm is not designed to account for\n\nall relevant factors.\n\nFor example, an algorithm that predicts\nwhether a loan applicant is likely to default\nmay be trained on data that is biased against\ncertain groups of people, such as minorities\nor low-income individuals. Even if the\nalgorithm produces accurate predictions\nbased on this data, it may be discriminating\nagainst these groups and perpetuating unfair\n\nlending practices.\n\nTo mitigate this risk, it is important to\ncarefully design algorithms and ensure that\n\n. . v\nthey are trained on representative and\n\n7 9\n\nChatGPT Feb 13 Version. Free Research Preview. Our goal is to make\nAl systems more natural and safe to interact with. Your feedback will\n\nhelp us 

In [97]:
posts["imgs_texts"] = posts["imgs_urls"].progress_apply(lambda x: imgs_to_texts(x))

  0%|          | 0/100 [00:00<?, ?it/s]

In [98]:
posts["imgs_clean_texts"], posts["imgs_text_source"] = zip(*posts["imgs_texts"].progress_apply(lambda x: clean_imgs_texts(x)))

  0%|          | 0/100 [00:00<?, ?it/s]

In [104]:
print("Non-empty image texts: ", posts[posts["imgs_clean_texts"].map(len) > 0].shape[0])

Non-empty image texts:  25


In [113]:
posts[posts["imgs_clean_texts"].map(len) > 0].head()

Unnamed: 0,author_id,created_at,id,post,edit_history_tweet_ids,retweet_count,reply_count,like_count,quote_count,impression_count,...,clean_post_for_sentiment,sentiment_score,sentiment,emotion,emotion_score,urls,imgs_urls,imgs_texts,imgs_clean_texts,imgs_text_source
0,227768169,2023-02-09T18:53:48.000Z,1623757070825660419,You can now text @TSA to ask them questions! P...,['1623757070825660419'],0,0,0,0,13,...,You can now text to ask them questions Please...,0.939657,Neutral,neutral,0.916366,[https://t.co/BnEUnkynP7],[https://pbs.twimg.com/media/FoizvygXoAEbo12?f...,"[=\n\n=\nASKTSA\n\nv er [~)\n@AskTSA Text ""TRA...","[v er Text ""TRAVEL"" to (275872)]",[Other]
7,42633253,2023-02-09T18:05:21.000Z,1623744878030073857,"Please join ACEC Georgia IT Forum Chairman, Ro...",['1623744878030073857'],0,0,0,0,17,...,Please join ACEC Georgia IT Forum Chairman Rob...,0.789183,Neutral,neutral,0.856153,"[https://t.co/pAG3JsnDFo, https://t.co/CfyCHDG...",[https://pbs.twimg.com/media/FoizW7PWIAIsAX7?f...,"[ACEC Georgia\n\nIT FORUM\n\nFebruary 22, 2023...","[Georgia IT FORUM February 22, 2023 PM PM EDT]",[Other]
11,3184112874,2023-02-09T17:49:10.000Z,1623740805650468864,Question:\nCan #OpenAI #ChatGPT give me LEGAL ...,['1623740805650468864'],0,0,0,0,34,...,Question\nCan hashtag open ai hashtag chat gpt...,0.618685,Neutral,fear,0.938373,[https://t.co/Ny2jsVbG1b],[https://pbs.twimg.com/media/FwBvrdzacAAqsE4?f...,[],[],[Other]
12,931470139,2023-02-09T17:45:05.000Z,1623739777978626049,Unpopular opinion: Microsoft integrating ChatG...,['1623739777978626049'],8,26,92,3,8307,...,Unpopular opinion Microsoft integrating ChatGP...,0.928594,Negative,neutral,0.470574,[https://t.co/SVnVwg1ys8],[https://pbs.twimg.com/media/FoiuWUBWIAEfTS4?f...,[EARTHWEB\n\nSEARCH ENGINE MARKET SHARE\n\nGoo...,[SEARCH ENGINE MARKET SHARE Google Yahoo 92.42...,"[Other, Other]"
15,4557770926,2023-02-09T17:38:28.000Z,1623738115545518081,Asked #ChatGPT opinion on job opportunities wh...,['1623738115545518081'],0,0,0,0,29,...,Asked hashtag chat gpt opinion on job oppounit...,0.96906,Neutral,neutral,0.563248,[https://t.co/2KPKcWNvAX],[https://pbs.twimg.com/media/FoisuSZakAA6Pns?f...,[chat.openal.com/chat.\n\n‘Ge an 20 Fee Reseac...,[an 20 Fee Out goa make Al gate mete MO. de [G...,[Other]


# Save data to file

In [None]:
save_file_path = tweets_file_name.split[".csv"][0] + "_img_texts" + ".csv"
posts.to_csv(save_file_path, index=False)