In [18]:
import pandas as pd
import os
import copy
import re
import ast

from tqdm.notebook import tqdm
tqdm.pandas()

import urllib.request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import random
import itertools
import nltk
# nltk.download('wordnet')
import string
import enchant
import swifter
import tqdm

In [20]:
DATE = '07052023'
tweets_file_name = 'all_tweets_from_last_7_days_{}_with_sentiment_emotion'.format(DATE)

# Read data

In [218]:
# Read the data
posts = pd.read_csv("{}.csv".format(tweets_file_name), nrows=None)

In [219]:
if 'tokens' in posts.columns:
    tokens_list = [ast.literal_eval(pp) for pp in posts['tokens'].values.tolist()]
    posts["tokens"] = tokens_list

In [220]:
posts["id"] = posts["id"].astype(str)
posts["author_id"] = posts["author_id"].astype(str)

In [221]:
# get the tweets with the most engagement and check the scrapping on them
posts = posts.sort_values(by=["engagement_count"], ascending=True)[0:10_000]

# Extract URLs from text

In [223]:
URL_REGEX = r'(https?://\S+)'

In [224]:
def extract_url_from_text(text):
    urls = re.findall(URL_REGEX, text)
    return urls

In [227]:
posts["urls"] = posts["post"].progress_apply(lambda x: extract_url_from_text(x))

  0%|          | 0/10000 [00:00<?, ?it/s]

In [228]:
print("Non-empty extractec tiny URLs: ", posts[posts["urls"].map(len) > 0].shape[0])

Non-empty extractec tiny URLs:  5662


# Extract images from URL HTML 

In [229]:
import time 
 
import pandas as pd 
from selenium import webdriver 
from selenium.webdriver import Chrome 
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By 
from webdriver_manager.chrome import ChromeDriverManager

In [230]:
# start by defining the options 
options = webdriver.ChromeOptions() 
options.headless = True # it's more scalable to work in headless mode 
# normally, selenium waits for all resources to download 
# we don't need it as the page also populated with the running javascript code. 
options.page_load_strategy = 'none' 
# this returns the path web driver downloaded 
chrome_path = ChromeDriverManager().install() 
chrome_service = Service(chrome_path) 
# pass the defined options and service objects to initialize the web driver 
driver = Chrome(options=options, service=chrome_service) 
driver.implicitly_wait(5)

  options.headless = True # it's more scalable to work in headless mode


In [231]:
def extract_imgs_from_url(driver):
    img_links = []
    try:
        imgs = driver.find_elements(By.TAG_NAME, "img")
        for img in imgs:
            img_src = img.get_attribute("src")
            if '/media/' in img_src:
                img_links.append(img_src)
        return img_links
    except Exception as e:
        return [f"ERROR: {e}"]

In [232]:
def scrape_from_url(url):
    time.sleep(random.uniform(0.6,1.1))
    driver = Chrome(options=options, service=chrome_service) 
    driver.get(url)
    time.sleep(6)
    img_links = extract_imgs_from_url(driver)
    driver.close()
    return img_links
    
def scrape_from_urls(urls):
    all_links = []
    if not urls:
        return all_links
    for url in urls:
        img_links = scrape_from_url(url)
        all_links.append(img_links)
    img_links_merged = list(itertools.chain.from_iterable(all_links))
    return img_links_merged

In [233]:
posts["imgs_urls"] = posts["urls"].progress_apply(lambda x: scrape_from_urls(x))

  0%|          | 0/10000 [00:00<?, ?it/s]

In [234]:
print("Non-empty image URLs: ", posts[posts["imgs_urls"].map(len) > 0].shape[0])

Non-empty image URLs:  4130


# Image to Text using pytesseract

In [138]:
import pytesseract
from PIL import Image
import io
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [139]:
print(pytesseract.get_languages(config='')) 

['eng', 'osd']


In [140]:
def img_to_text(img_url, lang="eng", timeout=10):
    time.sleep(random.uniform(0.2, 0.5))
    try:
        response = requests.get(img_url)
        img = Image.open(io.BytesIO(response.content))
        text = pytesseract.image_to_string(img, lang=lang, timeout=timeout)
        return text
    except Exception as e:
        # return "ERROR :{}".format(e)
        return ""

def imgs_to_texts(urls, lang="eng", timeout=10):
    all_texts = []
    for url in urls:
        text = img_to_text(url, lang=lang, timeout=timeout)
        all_texts.append(text)
    all_texts = list(set(all_texts))
    return all_texts

In [187]:
eng_dict = enchant.Dict("en_US")

SENTENCES_TO_IGNORE = ["Send a message..."]
CHATGPT_SCREENSHOT_PATTERN = r"ChatGPT [A-z][a-z][a-z] [1-9][0-9] Version"
TIME_PATTERN_REGEX = ""
def clean_img_text(img_text):
    source = 'Other'
    img_text = img_text.replace(r'\n', ' ').replace(r'\r', '').strip()
    img_text = re.sub(r'http\S+', '', img_text, flags=re.MULTILINE)
    img_text = re.sub(r'(\d+):(\d+)', '', img_text)
    
    chatgpt_patterns = re.search(CHATGPT_SCREENSHOT_PATTERN, img_text, flags=re.MULTILINE)
    if chatgpt_patterns:
        source = "ChatGPT"
        print(chatgpt_patterns[0])
        img_text = img_text.split(chatgpt_patterns[0])[0]
    
    translating = str.maketrans('', '', string.punctuation)
    new_img_text = img_text.translate(translating)
    img_text = ' '.join(word for word in new_img_text.split() if len(word.strip()) > 0 and eng_dict.check(word.strip()))

    for sent in SENTENCES_TO_IGNORE:
        if sent in img_text:
            img_text = img_text.replace(sent, '')
    return img_text, source

def clean_imgs_texts(img_texts):
    all_texts = []
    all_sources = []
    for text in img_texts:
        clean_text, source = clean_img_text(text)
        all_texts.append(clean_text)
        all_sources.append(source)
    return all_texts, all_sources

In [188]:
posts["imgs_texts"] = posts["imgs_urls"].progress_apply(lambda x: imgs_to_texts(x))

  0%|          | 0/1 [00:00<?, ?it/s]

In [189]:
posts["imgs_clean_texts"], posts["imgs_text_source"] = zip(*posts["imgs_texts"].progress_apply(lambda x: clean_imgs_texts(x)))

  0%|          | 0/1 [00:00<?, ?it/s]

In [190]:
print("Non-empty image texts: ", posts[posts["imgs_clean_texts"].map(len) > 0].shape[0])

Non-empty image texts:  1


In [206]:
def get_only_chatgpt_texts(x):
    chtgpt_txts = []
    for  text, source in zip(x[0], x[1]):
        if source == "ChatGPT":
            chtgpt_txts.append(text)
    return " ".join(chtgpt_txts)
 
posts["imgs_clean_texts_chatgpt"] = posts[["imgs_clean_texts", "imgs_text_source"]].progress_apply(lambda x: get_only_chatgpt_texts(x), axis=1)

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
posts[posts["imgs_clean_texts"].map(len) > 0].head()

In [None]:
posts[posts["imgs_clean_texts_chatgpt"].map(len) > 0].head()

# Save data to file

In [38]:
save_file_path = tweets_file_name.split(".csv")[0] + "_img_texts_10_000_most_engaged" + ".csv"
posts.to_csv(save_file_path, index=False)

# Merge intermediate files

In [22]:
# Merge all intermediate files
files_dir = 'Scrapping'
num_files = 0
posts_all_df = pd.DataFrame()
for i in range(20):
    if i < 20:
        f_path = os.path.join(files_dir, "{}_{}_{}_{}.csv".format(tweets_file_name, "img_texts", i*10_000, (i+1)*10_000))
    if os.path.exists(f_path):
        num_files = i + 1
        print("Reading file {}".format(f_path))
    else:
        continue
    interm_df = pd.read_csv(f_path)
    posts_all_df = pd.concat([posts_all_df, interm_df], axis=0)
    
for i in range(10):
    f_path = os.path.join(files_dir, "{}_{}_{}_{}.csv".format(tweets_file_name, "img_texts", 100_000 + (i) * 20_000, 100_000 + (i+1)*20_000))
    if os.path.exists(f_path):
        num_files = i + 1
        print("Reading file {}".format(f_path))
    else:
        continue
    interm_df = pd.read_csv(f_path)
    posts_all_df = pd.concat([posts_all_df, interm_df], axis=0)
    
posts_all_df = posts_all_df.reset_index(drop=True)
print("Total files merged: {}".format(num_files))
print("Total tweets merged: {}".format(posts_all_df.shape[0]))

Reading file Scrapping\all_tweets_from_last_7_days_07052023_with_sentiment_emotion_img_texts_0_10000.csv
Reading file Scrapping\all_tweets_from_last_7_days_07052023_with_sentiment_emotion_img_texts_10000_20000.csv
Reading file Scrapping\all_tweets_from_last_7_days_07052023_with_sentiment_emotion_img_texts_20000_30000.csv
Reading file Scrapping\all_tweets_from_last_7_days_07052023_with_sentiment_emotion_img_texts_30000_40000.csv
Reading file Scrapping\all_tweets_from_last_7_days_07052023_with_sentiment_emotion_img_texts_40000_50000.csv
Reading file Scrapping\all_tweets_from_last_7_days_07052023_with_sentiment_emotion_img_texts_50000_60000.csv
Reading file Scrapping\all_tweets_from_last_7_days_07052023_with_sentiment_emotion_img_texts_60000_70000.csv
Reading file Scrapping\all_tweets_from_last_7_days_07052023_with_sentiment_emotion_img_texts_70000_80000.csv
Reading file Scrapping\all_tweets_from_last_7_days_07052023_with_sentiment_emotion_img_texts_80000_90000.csv
Reading file Scrapping\

  interm_df = pd.read_csv(f_path)


Total files merged: 9
Total tweets merged: 275741


In [23]:
posts_all_df["id"] = posts_all_df["id"].astype(str)

In [24]:
# remove duplicates
posts_all_df = posts_all_df.drop_duplicates(subset=["id"])
print("Total tweets merged with de-deduplication: {}".format(posts_all_df.shape[0]))

Total tweets merged with de-deduplication: 275741


In [29]:
from spacy.lang.en import stop_words
import little_mallet_wrapper

from spacy.lang.en import stop_words

extended_stop_wrods = list(stop_words.STOP_WORDS)
extended_stop_wrods.remove('no')
extended_stop_wrods.remove('not')
if "doesnt" in extended_stop_wrods:
    extended_stop_wrods.remove("doesnt")
if "doesn't" in extended_stop_wrods:
    extended_stop_wrods.remove("doesn't")
if "dont" in extended_stop_wrods:
    extended_stop_wrods.remove("dont")
if "don't" in extended_stop_wrods:
    extended_stop_wrods.remove("don't")
if "aren't" in extended_stop_wrods:
    extended_stop_wrods.remove("aren't")
if "arent" in extended_stop_wrods:
    extended_stop_wrods.remove("arent")
if "give" in extended_stop_wrods:
    extended_stop_wrods.remove("give")
if "up" in extended_stop_wrods:
    extended_stop_wrods.remove("up")
if "call" in extended_stop_wrods:
    extended_stop_wrods.remove("call")
if "down" in extended_stop_wrods:
    extended_stop_wrods.remove("down")
STOP_WORDS = set(extended_stop_wrods)

def get_url_patern():
    return re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-za-z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')


def get_hashtags_pattern():
    return re.compile(r'#\w\*')


def get_single_letter_words_pattern():
    return re.compile(r'(?<![\w-])\w(?![\w-])')


def get_blank_spaces_pattern():
    return re.compile(r'\s{2,}|\t')


def get_twitter_reserved_words_pattern():
    return re.compile(r'(RT|rt|FAV|fav|VIA|via)')


def get_mentions_pattern():
    return re.compile(r'@\w\*')


def clean_sentence(sentence, remove_stop_words=False, remove_short_words=False, remove_numbers=True):
    sentence = re.sub(pattern=get_url_patern(), repl="", string=sentence)
    sentence = re.sub(pattern=get_mentions_pattern(), repl="", string=sentence)
    sentence = re.sub(pattern=get_hashtags_pattern(), repl="", string=sentence)
    sentence = re.sub(pattern=get_twitter_reserved_words_pattern(), repl='', string=sentence)
    sentence = re.sub(r'http\S+', "", sentence)  # remove http links
    sentence = re.sub(r'bit.ly/\S+', "", sentence)  # remove bitly links
    sentence = sentence.strip('[link]')  # remove [links]
    sentence = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-*]+)', "", sentence)  # remove retweet
    sentence = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', "", sentence)  # remove tweeted at
    sentence = sentence.encode('ascii', 'ignore').decode('ascii')

    clean_sentence = little_mallet_wrapper.process_string(sentence,
                                                          numbers='remove' if remove_numbers else None,
                                                          remove_stop_words=remove_stop_words,
                                                          remove_short_words=remove_short_words)
    return clean_sentence

import unicodedata
# Our spaCy model:
import en_core_web_trf
nlp = en_core_web_trf.load()
from nltk.tokenize.toktok import ToktokTokenizer

CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

def remove_stopwords(text, tokenizer, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in STOP_WORDS]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in STOP_WORDS]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def remove_url_tokens(text):
    text = re.sub(pattern=get_twitter_reserved_words_pattern(), repl='', string=text)
    text = re.sub(pattern=get_url_patern(), repl="", string=text)
    text = re.sub(r'http\S+', "", text)  # remove http links
    text = re.sub(r'bit.ly/\S+', "", text)  # remove bitly links
    text = text.strip('[link]')  # remove [links]
    return text

def remove_short_tokens(text, remove_len=2):
    return ' '.join([w for w in text.split(" ") if (len(w) > remove_len) or (w in stop_words.STOP_WORDS)])

def normalize_sentence(doc, tokenizer=ToktokTokenizer(), remove_urls=True, remove_short_words=True, contraction_expansion=True, remove_mentions=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    # normalize each document in the corpus
    if remove_urls:
        doc = remove_url_tokens(doc)
    if remove_mentions:
        doc = re.sub(pattern=get_mentions_pattern(), repl="", string=doc)
        doc = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-*]+)', "", doc)  # remove retweet
        doc = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', "", doc)  # remove tweeted at
        doc = re.sub(pattern=get_twitter_reserved_words_pattern(), repl='', string=doc)
        doc = doc.replace("&amp", "")
        doc = doc.replace("'s", "")
    # remove accented characters
    if accented_char_removal:
        doc = remove_accented_chars(doc)
    # expand contractions    
    if contraction_expansion:
        doc = expand_contractions(doc)
    # lowercase the text    
    if text_lower_case:
        doc = doc.lower()
    # remove extra newlines
    doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
    # lemmatize text
    if text_lemmatization:
        doc = lemmatize_text(doc)
    # remove special characters and\or digits    
    if special_char_removal:
        # insert spaces between special characters to isolate them    
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        doc = remove_special_characters(doc, remove_digits=remove_digits)  
    # remove extra whitespace
    doc = re.sub(' +', ' ', doc)
    # remove stopwords
    if stopword_removal:
        doc = remove_stopwords(doc, tokenizer, is_lower_case=text_lower_case)  
    if remove_short_words:
        doc = remove_short_tokens(doc)
    return doc

In [30]:
posts_all_df["imgs_clean_texts_chatgpt_cleaned"] = posts_all_df["imgs_clean_texts_chatgpt"].progress_apply(lambda x: normalize_sentence(x, tokenizer=ToktokTokenizer(),
                                                                                                                                        remove_urls=True,
                                                                                                                                        remove_short_words=True,
                                                                                                                                        contraction_expansion=True,
                                                                                                                                        remove_mentions=True,
                                                                                                                                        accented_char_removal=True,
                                                                                                                                        text_lower_case=True,
                                                                                                                                        text_lemmatization=True,
                                                                                                                                        special_char_removal=True,
                                                                                                                                        stopword_removal=True,
                                                                                                                                        remove_digits=True)
                                                                                                           if not pd.isnull(x) else x)

  0%|          | 0/275741 [00:00<?, ?it/s]

In [None]:
non_null_cleand_gpt_texts = posts_all_df[~posts_all_df["imgs_clean_texts_chatgpt_cleaned"].isnull()]

In [32]:
non_null_cleand_gpt_texts.shape

(42, 26)

In [33]:
save_file_path = tweets_file_name + "_img_texts" + ".csv"
posts_all_df.to_csv(save_file_path, index=False)

# Print Statistics

In [34]:
file_path = tweets_file_name + "_img_texts" + ".csv"
posts = pd.read_csv(file_path)

  posts = pd.read_csv(file_path)


In [35]:
import numpy as np

def convert_to_nan(x):
    x = [ix for ix in x if len(ix)> 0]
    if len(x) == 0:
        return None
    if "ERROR" in x and len(x) == 1:
        return None
    if "" in x and len(x) == 1:
        return None
    return x
posts["imgs_texts"] = posts["imgs_texts"].apply(lambda x: convert_to_nan(ast.literal_eval(x)))
posts["imgs_clean_texts"] = posts["imgs_clean_texts"].apply(lambda x: convert_to_nan(ast.literal_eval(x)))
posts["imgs_text_source"] = posts["imgs_text_source"].apply(lambda x: convert_to_nan(ast.literal_eval(x)))

In [None]:
posts[~posts["imgs_clean_texts_chatgpt_cleaned"].isnull()].head()

In [36]:
total_posts = posts.shape[0]
print("Total number of tweets: ", total_posts)

urls = posts[posts["urls"].map(len) > 0]
print("Tweets with extracted tiny URLs: {} ({}%)".format(urls.shape[0], round(100 * urls.shape[0] / total_posts, 2)))
imgs_urls = posts[posts["imgs_urls"].map(len) > 0]
print("Tweets with scrapped URLs: {} ({}%)".format(imgs_urls.shape[0], round(100 * imgs_urls.shape[0] / total_posts, 2)))
imgs_clean_texts = posts[~posts["imgs_clean_texts"].isnull()]
print("Tweets with extracted URLs texts: {} ({}%)".format(imgs_clean_texts.shape[0], round(100 * imgs_clean_texts.shape[0] / total_posts, 2)))

print("=============================")

# ChatGPT related images
is_chatPGT_propmpt_in_tweet = posts["imgs_text_source"].apply(lambda x: "ChatGPT" in x if x is not None else False)
chatPGT_content_df = posts[is_chatPGT_propmpt_in_tweet]
chatgpt_prompts_imgs_df_count = is_chatPGT_propmpt_in_tweet.sum()
chatPGT_content = chatPGT_content_df[~chatPGT_content_df["imgs_clean_texts"].isnull()]
print("Extracted images with ChatGPT promts: {} ({}%)".format(chatgpt_prompts_imgs_df_count, round(100 * chatgpt_prompts_imgs_df_count / total_posts, 2)))
print("After cleaning the ChatGPT content: {} ({}%)".format(chatPGT_content.shape[0], round(100 * chatPGT_content.shape[0] / total_posts, 2)))

print("=============================")

# Unknown content images
is_other_content_in_tweet = posts["imgs_text_source"].apply(lambda x: "Other" in x if x is not None else False)
non_empty_other_content_df = posts[is_other_content_in_tweet]
other_unknown_imgs_df_count = is_other_content_in_tweet.sum()
non_empty_other_content = non_empty_other_content_df[~non_empty_other_content_df["imgs_clean_texts"].isnull()]
print("Extracted images with other unknown content: {} ({}%)".format(other_unknown_imgs_df_count, round(100 * other_unknown_imgs_df_count / total_posts, 2)))
print("After cleaning the other uknown content: {} ({}%)".format(non_empty_other_content.shape[0], round(100 * non_empty_other_content.shape[0] / total_posts, 2)))

Total number of tweets:  275741
Tweets with extracted tiny URLs: 275741 (100.0%)
Tweets with scrapped URLs: 275741 (100.0%)
Tweets with extracted URLs texts: 3678 (1.33%)
Extracted images with ChatGPT promts: 42 (0.02%)
After cleaning the ChatGPT content: 42 (0.02%)
Extracted images with other unknown content: 30305 (10.99%)
After cleaning the other uknown content: 3657 (1.33%)


## Word frequency statistics

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en import stop_words

extended_stop_wrods = list(stop_words.STOP_WORDS)
extended_stop_wrods.append("doesnt")
extended_stop_wrods.append("dont")
extended_stop_wrods.append("not")
STOP_WORDS = list(set(extended_stop_wrods))

In [20]:
non_empty_content = posts[is_other_content_in_tweet]
non_empty_content = non_empty_content[~non_empty_content["imgs_clean_texts"].isnull()]
non_empty_content["imgs_clean_texts"] = non_empty_content["imgs_clean_texts"].apply(lambda x: " ".join(x))

In [21]:
from string import digits
# remove numbers from clean text
non_empty_content["imgs_clean_texts"] = non_empty_content["imgs_clean_texts"].apply(lambda x: x.translate(str.maketrans('', '', digits)))

In [22]:
count_model = CountVectorizer(stop_words=STOP_WORDS)
count_vector = count_model.fit_transform(non_empty_content["imgs_clean_texts"])
count_array = count_vector.toarray()

words_set = count_model.get_feature_names_out()

chatgpt_prompts_df_counts = pd.DataFrame(count_array, columns=words_set)
chatgpt_prompts_df_counts.loc['Total']= chatgpt_prompts_df_counts.sum(numeric_only=True, axis=0)



In [26]:
chatgpt_prompts_df_counts.loc["Total", :].T.reset_index().sort_values("Total", ascending=False)

Unnamed: 0,index,Total
53,pee,12
68,secret,10
13,chat,5
51,patient,4
20,date,4
...,...,...
34,indicating,1
32,good,1
31,giving,1
30,free,1
