##  Mastodon toot collection

### Importing needed libraries

In [6]:
from mastodon import Mastodon
from datetime import datetime, timedelta, timezone
import pandas as pd
import re

### Mastadon Data Collection

In [7]:
# Create Mastodon app to connect to API

Mastodon.create_app(
    'your_app_name',
    api_base_url = 'https://mastodon.social',
    to_file = 'client_crediential_file_name.secret'
)

('g503DPxqF6-rTAD6IV9IG-rkKejSS8GgJKKGVWNhWuc',
 'GbZWQOE2DIgPP9C6uwEZGkP4pDtC7Gcs03cV_NdrLJ4')

In [8]:
# Connect to API using OAuth

API = Mastodon(client_id = 'client_crediential_file_name.secret',)
url = API.auth_request_url(scopes=["read", "write", "follow", "push"])
print("Please visit this URL to authorize the app:", url)

Please visit this URL to authorize the app: https://mastodon.social/oauth/authorize?client_id=g503DPxqF6-rTAD6IV9IG-rkKejSS8GgJKKGVWNhWuc&response_type=code&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=read+write+follow+push&force_login=False&state=None&lang=None


In [9]:
# Enter token from OAuth

code = input("Enter the code from the browser: ")

In [10]:
# Login to API

API.log_in(
    code=code,
    to_file = 'user_crediential_file_name.secret'
)
print("Successfully logged in. User access token saved in 'apple_user.secret'.")

Successfully logged in. User access token saved in 'apple_user.secret'.


In [11]:
# Initialize query terms, lists containing toots and offset value for searching

q0 = "ChatGPT"
q1 = "chatgpt"
q2 = "artifical intelligence"
q3 = "Artifical Intelligence"
mast_toots = []
ai_toots = []
off_set = 0

In [None]:
# Collecting Chatgpt toots

for i in [q0, q1]: # Fetch toots with both Chatgpt queries
    
    off_set = 0 # Refresh offset for the next query
    
    # Loop to fetch older toots using offsets
    while True:

        # Fetch toots with the search API using offset
        search_results = API.search_v2(q=i, result_type="statuses", offset=off_set)
        statuses = search_results["statuses"]

        # Exit loop if no more toots were found
        if not statuses:
            print("Found all toots")
            break 

        # Add new toots to the collection
        mast_toots.extend(statuses)

        # Find the oldest toot
        oldest_toot = min(statuses, key=lambda x: x["created_at"])

        # End loop once we fetched 1 week worth of toots or if we reach 1000 toots with a longer time frame
        if  timedelta(days=7) <= datetime.now(tz=timezone.utc) - oldest_toot["created_at"] and len(mast_toots) > 1000:
            print("Reached 1 week")
            break

        # Update offset to fetch the next batch of results
        off_set += len(statuses)

    # Print results
    print(f"Total toots collected: {len(mast_toots)}")
    if mast_toots:
        print(f"Earliest toot: {mast_toots[-1]['created_at']}")
        print(f"Latest toot: {mast_toots[0]['created_at']}")

Reached 1 week
Total toots collected: 2125
Earliest toot: 2024-12-14 08:29:19+00:00
Latest toot: 2024-12-21 09:26:36.227000+00:00
Reached 1 week
Total toots collected: 4250
Earliest toot: 2024-12-14 08:29:19+00:00
Latest toot: 2024-12-21 09:26:36.227000+00:00


In [14]:
# Collecting AI toots

for i in [q2, q3]: # Fetch toots with both AI queries

    off_set = 0 # Refresh offset for the next query
    
    # Loop to fetch older toots using offsets
    while True:
        # Fetch toots with the search API using offset
        search_results = API.search_v2(q=i, result_type="statuses", offset=off_set)
        statuses = search_results["statuses"]

        # Exit loop if no more toots are returned
        if not statuses:
            print("found all toots")
            break

        # Add new toots to the collection
        ai_toots.extend(statuses)

        # Excluded time frame restraint due to the small number of toots

        # Update offset to fetch the next batch of results
        off_set += len(statuses)

    # Print results
    print(f"Total toots collected: {len(ai_toots)}")
    if ai_toots:
        print(f"Earliest toot: {ai_toots[-1]['created_at']}")
        print(f"Latest toot: {ai_toots[0]['created_at']}")

found all toots
Total toots collected: 236
Earliest toot: 2018-08-21 06:30:29+00:00
Latest toot: 2024-12-19 20:35:25+00:00
found all toots
Total toots collected: 423
Earliest toot: 2018-08-21 06:30:29+00:00
Latest toot: 2024-12-19 20:35:25+00:00


### Create Dataframes for the collected toots

In [23]:
# Specific attributes extracted from the fetched toots
needed_attributes = [
    'id', 'created_at', 'content', 'language', 'replies_count', 
    'reblogs_count', 'favourites_count', 'visibility', 'tags'
]

cleaned_mast_toots = []
cleaned_ai_toots = []

for toot in mast_toots:
    cleaned_toot = {key: toot[key] for key in needed_attributes if key in toot}
    cleaned_mast_toots.append(cleaned_toot)

for toot in ai_toots:
    cleaned_toot = {key: toot[key] for key in needed_attributes if key in toot}
    cleaned_ai_toots.append(cleaned_toot)
    

In [24]:
# Removing html text from collected toots
def strip_html_tags(html_text):
    plain_text = re.sub(r'<.*?>', '', html_text)
    return plain_text

for toot in cleaned_mast_toots:
    if 'content' in toot and toot['content']:  # Check if 'content' exists and is not None
        toot['content'] = strip_html_tags(toot['content'])

for toot in cleaned_ai_toots:
    if 'content' in toot and toot['content']:  # Check if 'content' exists and is not None
        toot['content'] = strip_html_tags(toot['content'])

In [25]:
# Create dataframes for each group of toots
mast_df = pd.DataFrame(cleaned_mast_toots)
ai_df = pd.DataFrame(cleaned_ai_toots)

In [26]:
mast_df.head()

Unnamed: 0,id,created_at,content,language,replies_count,reblogs_count,favourites_count,visibility,tags
0,113690096187773544,2024-12-21 09:26:36.227000+00:00,Music can thrive in the age of AIThe birth of ...,en,0,0,0,public,[]
1,113690081293255708,2024-12-21 09:22:48+00:00,ChatGPT integriert auf PC und Mac weitere Apps...,de,0,0,0,public,[]
2,113690067666967378,2024-12-21 09:19:18+00:00,オチが色々と考えさせられる。--月額3万円の最強ChatGPTに「電源で音質が変わるか」と聞...,ja,0,0,0,public,[]
3,113690066527344469,2024-12-21 09:19:03+00:00,https://www.phileweb.com/review/column/202412/...,ja,0,0,0,public,[]
4,113690015360566765,2024-12-21 09:06:02+00:00,"🇪🇸 Top artículos: viernes, 20 de diciembre de ...",es,0,0,0,public,[]


In [27]:
ai_df.head()

Unnamed: 0,id,created_at,content,language,replies_count,reblogs_count,favourites_count,visibility,tags
0,113681401668833860,2024-12-19 20:35:25+00:00,#digipresadventcalendar #day19U-matic was deve...,en,0,4,0,public,"[{'name': 'digipresadventcalendar', 'url': 'ht..."
1,113639550502224256,2024-12-12 11:12:01.740000+00:00,Ainsi s'ouvrirent les portes de l'enfer.\nwww....,fr,0,0,0,public,[]
2,113639537305656138,2024-12-11 23:50:20+00:00,What if AI can actually help with your burnout...,en,0,0,0,public,[]
3,113637263387670537,2024-12-12 01:30:31.801000+00:00,@fbi @washingtonpost haha dude Biden is really...,en,0,0,0,public,[]
4,113629601136983194,2024-12-10 16:56:54+00:00,Oracle Just Shared Some Amazing Insight With A...,en,0,0,0,public,[]


In [40]:
# Create csv files for each group of toots
mast_df.to_csv('mast_toots.csv', index=False)
ai_df.to_csv('ai_toots.csv', index=False)