In [2]:
from mastodon import Mastodon

# Initialize the Mastodon client; only the instance base URL is supplied here.
mastodon_instance_url = "https://mastodon.social/"
mastodon = Mastodon(api_base_url=mastodon_instance_url)

In [3]:
from utils import *

In [4]:
# JSON parser used to read the topic dictionary file
import json

# Load the nested {topic: {hashtag: ...}} mapping stored as JSON text
with open('topic_dictionary.txt', 'r') as handle:
    topic_hashtags = json.load(handle)

# Walk every topic and replace each hashtag's value with whatever
# extract_mastodon returns for it (third argument 50 -- presumably the number
# of posts to fetch; confirm against utils).
for topic, hashtags in topic_hashtags.items():
    print('-----------',topic,"-----------")
    for hashtag in hashtags.keys():
        print('                         ',hashtag)
        hashtags[hashtag] = extract_mastodon(mastodon, hashtag, 50)

----------- Politics -----------
                          NationalPolitics
                          Elections2023
                          ForeignAffairs
                          LawsAndRegulations
                          LocalGovernance
----------- Technology -----------
                          Computing
                          TechNews
                          Gadgets
                          Programming
                          ArtificialIntelligence
----------- Science -----------
                          Biology
                          QuantumPhysics
                          OrganicChemistry
                          Astronomy
                          MedicalResearch
----------- Entertainment -----------
                          Movies
                          Music
                          VideoGames
                          Theatre
                          PopCulture
----------- News -----------
                          TopHeadlines
                      

In [5]:
import pandas as pd

# Flatten the nested {topic: {hashtag: [entry, ...]}} dictionary into one row
# per entry. Each entry is read through the keys "content", "related_topics",
# "author" and "created_at".
max_related = 5  # number of "hashtag_related N" columns written per row

records = []
for topic, hashtags in topic_hashtags.items():
    for hashtag, entries in hashtags.items():
        for entry in entries:
            # Pad (with None) or truncate to exactly `max_related` values so an
            # entry with fewer related hashtags no longer raises IndexError.
            related = list(entry["related_topics"] or [])[:max_related]
            related += [None] * (max_related - len(related))

            # Keys are inserted in the same order as the original columns.
            record = {
                "topic": topic,
                "hashtag": hashtag,
                "content": entry["content"],
            }
            for position, tag in enumerate(related, start=1):
                record[f"hashtag_related {position}"] = tag
            record["author"] = entry["author"]
            record["create_at"] = entry["created_at"]
            records.append(record)

# Build the DataFrame once from the list of records and persist it as CSV
df = pd.DataFrame(records)
df.to_csv("15topics.csv", index=False)
df

Unnamed: 0,topic,hashtag,content,hashtag_related 1,hashtag_related 2,hashtag_related 3,hashtag_related 4,hashtag_related 5,author,create_at
0,Politics,NationalPolitics,European socialists suspend Robert Fico’s Smer...,,,,,,politico_eu_bot,2023-10-12 16:34:46.456000+00:00
1,Politics,NationalPolitics,"Now, a member of Minnesota's Congressional del...",#Business,#Hudson,#UAWStrike,,,strike,2023-10-06 11:52:06+00:00
2,Politics,NationalPolitics,Markus Söder’s crumbling empire https://www.po...,,,,,,politico_eu_bot,2023-10-06 02:18:11.689000+00:00
3,Politics,NationalPolitics,"Poland, Hungary, Slovakia impose own Ukraine g...",,,,,,politico_eu_bot,2023-09-16 09:34:38.494000+00:00
4,Politics,NationalPolitics,Vienna seeks to calm Selmayr ‘blood money’ fur...,,,,,,politico_eu_bot,2023-09-10 17:33:43.008000+00:00
...,...,...,...,...,...,...,...,...,...,...
2090,Lifestyle,BalancedLife,"Wishing you a day filled with balance, positiv...",#WorkPlayTravel,#GoodDayAhead,,,,workplaytravel,2023-11-01 19:55:15.454000+00:00
2091,Lifestyle,BalancedLife,Embrace calm with Ashwagandha 🌿✨ by Moodbeli i...,#PlantiaWellness,#AshwagandhaMagic,#StressRelief,#ImmuneBoost,,Plantia,2023-10-19 19:40:02.490000+00:00
2092,Lifestyle,BalancedLife,Sheesh - was I ever that young?? Please check ...,#Novel,#deathspaleflag,#psychologicalthriller,#medicine,#neurosurgeon,GaryRSimonds,2023-06-28 15:32:31.069000+00:00
2093,Lifestyle,BalancedLife,# StressManagement # BalancedLife # Mindfulnes...,#StressManagement,#MindfulnessTips,#SelfCare,,,Clusterado,2023-06-19 12:29:29.076000+00:00


In [6]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

# Load the pretrained "gpt2" tokenizer and a GPT-2 model with a
# sequence-classification head. The head weight (`score.weight`) is not part of
# the "gpt2" checkpoint and is newly initialized (see the warning this cell
# prints), so the model has to be trained before its predictions are meaningful.
# NOTE(review): no label count is passed here, so the head has the library's
# default size -- confirm it is resized to the number of topics before training.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2ForSequenceClassification.from_pretrained("gpt2")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Example data preparation (replace with your own data loading and processing)
# Import `re` explicitly instead of depending on `from utils import *` to
# provide it; the notebook has no other import of this module.
import re

texts = df["content"].tolist()
# Strip URLs (anything starting with "http" up to the next whitespace)
texts = [re.sub(r"http\S+", "", text) for text in texts]

# Map each topic name to an integer class id, in topic_hashtags key order
labels = df["topic"].tolist()
topic_to_num = {topic :index for index,topic in enumerate(topic_hashtags.keys())}
labels_num = [topic_to_num[label] for label in labels]

# Quick sanity check of the first cleaned text and its class id
print(texts[0])
print(labels_num[0])


European socialists suspend Robert Fico’s Smer party and its ally Hlas    
0


In [None]:
# Tokenize the cleaned texts into one padded tensor of token ids
import torch

# Register a '[PAD]' padding token and record its id on the model config
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.config.pad_token_id = tokenizer.pad_token_id

# Encode all texts at once; only the token ids are kept from the encoding
encoding = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
input_ids = encoding["input_ids"]

# Convert the integer class ids to a tensor (rebinds `labels_num`)
labels_num = torch.tensor(labels_num)
print(input_ids.shape, labels_num.shape)

In [9]:
# Train/test split: 90% of the examples for training, the remainder for testing
import torch
from torch.utils.data import TensorDataset, random_split
dataset = TensorDataset(input_ids, labels_num)
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
# Fixed-seed generator so the same examples land in each subset on every run;
# without it the split (and therefore the reported metrics) changes per run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Data loaders: randomly ordered batches for training, fixed order for testing
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 128
train_dataloader = DataLoader(train_dataset,sampler = RandomSampler(train_dataset),batch_size = batch_size)
test_dataloader = DataLoader(test_dataset,sampler = SequentialSampler(test_dataset),batch_size = batch_size)


In [10]:
# One output class per topic id produced in the data-preparation cell
num_classes = len(topic_to_num)

# The tokenizer gained a '[PAD]' token, so the embedding matrix must grow with it
model.resize_token_embeddings(len(tokenizer))

# GPT2ForSequenceClassification's classification head is the `score` layer
# (the load warning lists 'score.weight'); assigning `model.classifier` only
# adds an unused attribute and leaves the default-sized head in place.
# Replace `score` and keep the label count in sync on the model and its config,
# since the built-in loss reshapes logits using `num_labels`.
model.num_labels = num_classes
model.config.num_labels = num_classes
model.score = torch.nn.Linear(model.config.hidden_size, num_classes, bias=False)


# `criterion` is kept for callers that compute the loss manually; the model
# already returns a loss when `labels` are passed to it.
criterion = torch.nn.CrossEntropyLoss()
# NOTE(review): lr=1e-3 is high for fine-tuning a pretrained transformer --
# consider a value around 1e-5 to 5e-5 if the loss diverges.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [None]:
# Training loop: a single pass over the training batches
model.train()
# Batch tensors get their own names so the notebook-level `input_ids` (the full
# tokenized dataset) is not overwritten, which would break re-running the
# dataset cell.
for batch_input_ids, batch_labels in train_dataloader:
    optimizer.zero_grad()
    # Passing labels makes the model compute the classification loss itself
    outputs = model(input_ids=batch_input_ids, labels=batch_labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    # Report the per-batch loss only; full tensors are no longer dumped
    print(loss.item())

In [None]:
# Testing loop: collect predictions together with the labels of the very same
# examples. The test subset comes from a random split, so slicing
# `labels_num[train_size:]` would not line up with the predictions.
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch_input_ids, batch_labels in test_dataloader:
        # No labels are passed, so only the logits are needed from the output
        logits = model(batch_input_ids).logits
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
        true_labels.extend(batch_labels.tolist())

# Evaluation: per-class precision / recall / F1 on the held-out examples
from sklearn.metrics import classification_report
print(classification_report(true_labels, predictions))


In [None]:
# Persist the model first, then its tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("fine_tuned_gpt2_model")


In [None]:
# Reload the saved tokenizer and classification model from the output directory
loaded_tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_gpt2_model")
loaded_model = GPT2ForSequenceClassification.from_pretrained("fine_tuned_gpt2_model")
