In [5]:
from datasets import load_dataset
import re

In [6]:
##Load the Exorde social media dataset; the identifier is named so it is easy to swap
DATASET_ID = "Exorde/exorde-social-media-december-2024-week1"
ds = load_dataset(DATASET_ID)
ds  # last expression: display the available splits, columns and row count

DatasetDict({
    train: Dataset({
        features: ['date', 'original_text', 'url', 'author_hash', 'language', 'primary_theme', 'english_keywords', 'sentiment', 'main_emotion', 'secondary_themes'],
        num_rows: 65542211
    })
})

In [7]:
##Function to normalise a post: lowercases it and removes hyperlinks, @mentions, #hashtags,
##non-ASCII characters (emojis included), digits/punctuation and redundant whitespace
def clean_text(text):
    """Return a lowercase version of ``text`` containing only a-z and single spaces.

    Args:
        text: The raw post. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing text.

    Returns:
        The cleaned string, or ``""`` when the input is missing or nothing
        survives cleaning.
    """
    # Missing values must come back empty: str(None) / str(nan) would otherwise
    # become the literal words "none" / "nan" and look like real content to
    # callers that test for an empty result.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Replace URLs, @mentions, #hashtags and runs of non-ASCII characters with a
    # space so that the words on either side are not glued together.
    text = re.sub(r"http\S+|@\w+|#\w+|[^\x00-\x7F]+", " ", text)
    # Drop everything that is not a lowercase letter or whitespace (digits, punctuation).
    text = re.sub(r"[^a-z\s]", "", text)
    # Collapse whitespace runs to one space and trim both ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [14]:
##Build the English sample for one platform (X by default).
##Keep rows tagged 'en' whose URL starts with PLATFORM_URL_PREFIX and whose text is still
##non-empty after cleaning, then draw a seeded random sample of at most SAMPLE_SIZE rows.
##Replace PLATFORM_URL_PREFIX with any other social media platform domain to generate the
##same dataset for that platform.
PLATFORM_URL_PREFIX = 'https://x.com/'
SAMPLE_SIZE = 5000
SHUFFLE_SEED = 42


def is_usable_post(example):
    """Return True for an English post from the target platform with non-empty cleaned text."""
    if example['language'] != 'en':
        return False
    # A null URL means "not this platform"; calling .startswith on None would raise.
    if not (example['url'] or '').startswith(PLATFORM_URL_PREFIX):
        return False
    # Cheap checks run first so the regex cleaning only happens for candidate rows.
    return clean_text(example.get('original_text', '')) != ''


en_ds = ds['train'].filter(is_usable_post)
# Cap the sample at the number of rows that survived the filter.
sample_en_ds = en_ds.shuffle(seed=SHUFFLE_SEED).select(range(min(SAMPLE_SIZE, len(en_ds))))
# map() merges the returned dict into each row, so only the new column is returned.
sample_en_ds = sample_en_ds.map(lambda example: {'cleaned_text': clean_text(example.get('original_text', ''))})
sample_en_df = sample_en_ds.to_pandas()
sample_en_df

Map: 100%|██████████| 5000/5000 [00:04<00:00, 1004.34 examples/s]


Unnamed: 0,date,original_text,url,author_hash,language,primary_theme,english_keywords,sentiment,main_emotion,secondary_themes,cleaned_text
0,2024-12-03T18:13:09.000Z,They make us realize we have other options . T...,https://x.com/DarrylRoss72/status/186400999686...,f14ec6948ee11dedce31054f4127a4c24c42b7ec,en,People,"saturday, delivering, make, realize, options d...",-0.18,realization,"[2, 7]",they make us realize we have other options the...
1,2024-12-07T05:07:31.000Z,This is how Grok2 AI represents this scenario....,https://x.com/5L4Y3R_0F_D00M/status/1865261836...,,en,Technology,"varahavtar, grok2, badass, missed, scenario, y...",-0.05,neutral,"[3, 7]",this is how grok ai represents this scenario y...
2,2024-12-04T04:00:06.000Z,I a pretty sure it may have been because they ...,https://x.com/Chrishall05/status/1864157706342...,,en,Sports,"undefeated, pretty",-0.04,approval,"[8, 3, 7]",i a pretty sure it may have been because they ...
3,2024-12-06T23:02:22.000Z,Check out R! It's the most underrated programm...,https://x.com/nrachabathuni/status/18651699426...,913df5d294a176a797b8b4f5b4ad5a72927a751e,en,Technology,"data analysis, statistics, datascience, underr...",0.18,neutral,"[3, 7]",check out r its the most underrated programmin...
4,2024-12-02T23:35:56.000Z,I came to this car just knowing I was about to...,https://x.com/_jayywalker/status/1863728840939...,25e3b6f98cbdea5f4715aec413139574f84b806f,en,Environment,"i’m, car, home smh, home, cold, knowing, smh, ...",-0.62,neutral,"[10, 3, 7]",i came to this car just knowing i was about to...
...,...,...,...,...,...,...,...,...,...,...,...
4995,2024-12-01T15:41:01.000Z,i think $20 is enough for my cat's health main...,https://x.com/0xdreawins/status/18632469342046...,,en,Health,"cat, cat health, health maintenance, parvo, he...",-0.07,sadness,"[3, 4, 7]",i think is enough for my cats health maintenan...
4996,2024-12-04T00:04:58.000Z,Sure your mom's ass is getting pretty messy,https://x.com/Astonedgoodboi/status/1864098533...,,en,Environment,"mom ass, ass, pretty, mom, messy, pretty messy",-0.13,annoyance,"[10, 3, 7]",sure your moms ass is getting pretty messy
4997,2024-12-01T16:34:08.000Z,Joo Bayden no fodii waɗude njillu jaango Altin...,https://x.com/RFI_Ff/status/1863260301799625048,7a9ec4a656efb945ff4eacc8b28cbe6da7ba62f1,en,Cryptocurrency,"fodii waɗude, jaango, fodii, bayden, jaango al...",-0.16,neutral,"[9, 3, 7]",joo bayden no fodii wa ude njillu jaango altin...
4998,2024-12-02T06:47:46.000Z,ARE THERE ANY WINGS U DONT MIND GIVING,https://x.com/rec0v3r3d/status/186347512716745...,,en,People,"DONT, there, GIVING, wings, dont, ANY, DONT MI...",0.01,neutral,"[2, 3, 7]",are there any wings u dont mind giving


In [15]:
##Write the sampled frame to a .csv file, leaving out the pandas index column
OUTPUT_CSV = 'x_en_dataset.csv'
sample_en_df.to_csv(OUTPUT_CSV, index=False)