Build cosine-similarity matrices from each post's `secondary_tags` and `post_body` fields, then use them to detect communities of similar posts.

In [None]:
pip install cdlib

In [30]:
from pathlib import Path
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
from cdlib import algorithms
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [264]:
# Folder (named after the search date) that holds one CSV per searched tag.
search_date = "2024-07-15"
folder_path = Path(search_date)

# Path.iterdir() yields entries in arbitrary order, so sort the names to keep
# list positions stable across runs and machines.
# Note: the listing also contains Communities.csv once a later cell has
# written it into this same folder.
file_names = sorted(f.name for f in folder_path.iterdir() if f.is_file())

print(file_names)

['Artists_of_tumblr.csv', 'Batman.csv', 'Cats_of_Tumblr.csv', 'Communities.csv', 'CrowdStrike.csv', 'Dungeon_Meshi.csv', 'House_of_the_Dragon.csv', 'Joe_Biden.csv', 'Jujutsu_Kaisen.csv', 'Palestine.csv', 'Supernatural.csv']


In [290]:
# Select the tag file by name instead of by position: positions in
# file_names shift whenever a file (for example Communities.csv) is added to
# or removed from the folder.
tag_file = "Supernatural.csv"
df = pd.read_csv(folder_path / tag_file)
print(df.head(10))

   note_count   primary_tag  \
0         147  Supernatural   
1          62  Supernatural   
2           5  Supernatural   
3         251  Supernatural   
4          10  Supernatural   
5          20  Supernatural   
6           3  Supernatural   
7           8  Supernatural   
8          36  Supernatural   
9          23  Supernatural   

                                      secondary_tags  \
0  ['dean winchester'  'spn'  'deanwinchesterseri...   
1  ['spn'  'dean winchester'  'cowboy dean winche...   
2  ['lucifer spn'  'sam winchester'  'season 13 r...   
3  ['spnedit'  'supernaturaledit'  'samwinchester...   
4  ['spn incorrect quotes'  'supernatural incorre...   
5    ['jensen ackles'  'dean winchester'  'spnedit']   
6  ['fartface rambles'  'spn'  'fanfiction'  'twi...   
7  ['supernatural rewatch'  'season 13'  'sam win...   
8  ['spn'  'gabriel spn'  'sabriel'  'sam winches...   
9  ['sam winchester'  'jared padalecki'  'spnedit...   

                     blog_name  \
0      

In [292]:
# Show the column labels of the loaded frame.
df.columns

Index(['note_count', 'primary_tag', 'secondary_tags', 'blog_name', 'post_body',
       'date'],
      dtype='object')

In [294]:
def compute_cosine_similarity(input_tag_or_text):
    """Return the pairwise cosine similarity of a collection of text documents.

    Each document is turned into a word-count vector with ``CountVectorizer``
    and every document is compared against every other one.

    Parameters
    ----------
    input_tag_or_text : sliceable collection of str
        One text document per entry (for example a list of tag strings or
        post bodies).

    Returns
    -------
    The result of ``cosine_similarity`` on the count vectors, where entry
    ``[i, j]`` compares document ``i`` with document ``j``.
    """
    # Show the first ten documents so the input can be eyeballed.
    preview = input_tag_or_text[:10]
    print(preview)

    word_counts = CountVectorizer(analyzer='word').fit_transform(input_tag_or_text)
    return cosine_similarity(word_counts)

In [296]:
# Characters removed from the stringified tag list (e.g. "['a'  'b']") so that
# only the tag words themselves reach the vectorizer.
list_markup = str.maketrans("", "", "[]'")

# Missing values are replaced by an empty document: a NaN would otherwise
# raise inside the string handling / the vectorizer.
tag_documents = [
    str(tags).translate(list_markup)
    for tags in df['secondary_tags'].fillna("")
]
body_documents = df['post_body'].fillna("").astype(str).tolist()

# One similarity matrix per text field; entry [i, j] compares post i with post j.
cosine_sim_matrix_tags = compute_cosine_similarity(tag_documents)
cosine_sim_matrix_post_body = compute_cosine_similarity(body_documents)

['dean winchester  spn  deanwinchesterseries  spnedit  deanedit  spn 7x23  survival of the fittest', 'spn  dean winchester  cowboy dean winchester  destiel  castiel  sam winchester  cowboy dean  edit  spnedit  supernatural edit  edits  editors on tumblr  spn edit  jensen ackles', 'lucifer spn  sam winchester  season 13 really had me in a chokehold  i’m screaming  i’m crying  and i can’t stop', 'spnedit  supernaturaledit  samwinchesteredit  sam winchester  *m  i swear to all gods i had a whole gifset planned abt spn autopsy rooms but  i dont know what happened  s5  hands  sam', 'spn incorrect quotes  supernatural incorrect quotes  supernatural fandom  spnfandom  spn', 'jensen ackles  dean winchester  spnedit', 'fartface rambles  spn  fanfiction  twist and shout  fuck you fuck you fuck you  fuck your landlord', 'supernatural rewatch  season 13  sam winchester  lucifer  spn  lucifer spn  apocalypse world  i’m gonna throw up  i’m screaming  and sobbing', 'spn  gabriel spn  sabriel  sam win

In [298]:
G = nx.Graph()

# Nodes are keyed by the DataFrame index label of each post, so that later
# lookups such as df.loc[node] resolve to the right row.
node_labels = df.index.tolist()
for label, row in df.iterrows():
    G.add_node(label, blog_name=row['blog_name'], tags=row['secondary_tags'], post_body=row['post_body'])

# Two posts are connected when their tag similarity exceeds the threshold.
similarity_threshold = 0.7  # Adjust threshold as needed
for i, source in enumerate(node_labels):
    # Only look right of the diagonal so every pair is visited once; the
    # comparison is done on the whole row at once instead of element by element.
    above_threshold = (cosine_sim_matrix_tags[i, i + 1:] > similarity_threshold).nonzero()[0]
    for offset in above_threshold:
        j = i + 1 + int(offset)
        # Edge weight is the mean of the tag and post-body similarities.
        weight = (cosine_sim_matrix_tags[i, j] + cosine_sim_matrix_post_body[i, j]) / 2
        # Map matrix positions back to index labels so edges use the same keys
        # as the nodes added above.
        G.add_edge(source, node_labels[j], weight=float(weight))
            

In [300]:
# Partition the post graph with the Louvain algorithm.
# NOTE(review): no seed is passed here, so the partition may differ between
# runs — confirm whether reproducibility matters for this analysis.
communities = algorithms.louvain(G)

# Label the detected communities "Community 1", "Community 2", ... and record
# the blog name of every post (graph node) assigned to each of them.
community_dict = {
    f"Community {number}": df.loc[list(members), 'blog_name'].tolist()
    for number, members in enumerate(communities.communities, start=1)
}

In [302]:
# Show the tag this file was collected for. Look the column up by name and
# take the first row, so this works regardless of column order and for
# frames that contain a single post.
print(df['primary_tag'].iloc[0])

Supernatural


In [304]:
# Shared CSV (one per search date) that collects the communities of every tag.
communities_file = folder_path / "Communities.csv"

# Tag of the currently loaded file; read by column name rather than by
# position so it does not depend on column order or on having two rows.
primary_tag = df['primary_tag'].iloc[0]

# Write the header once, with the same encoding that is used for the rows.
if not communities_file.exists():
    communities_file.write_text("tag,community,blog_names\n", encoding='utf-8')
    print("Communities file created and headers written.")

# NOTE: rows are appended, so re-running this cell for the same tag adds the
# same communities again.
with communities_file.open("a", encoding='utf-8') as file:
    for community, profiles in community_dict.items():
        # str() keeps the join from failing on non-string blog names (e.g. NaN).
        blog_names = ' '.join(map(str, profiles))
        file.write(f"{primary_tag},{community},{blog_names}\n")

In [306]:
# Output folder for the tag clouds: <search_date>/<primary tag>/tag_cloud.
# The tag is read by column name so it does not depend on column order or on
# the frame having at least two rows.
tag_folder_name = folder_path / df['primary_tag'].iloc[0].replace(" ", "_") / 'tag_cloud'
tag_folder_name.mkdir(parents=True, exist_ok=True)

In [308]:
# Upper bound on the number of communities that get a tag cloud.
max_tag_clouds = 20

# Characters of the stringified tag list that are not part of a tag.
tag_list_markup = str.maketrans("", "", "[]'")

# community_dict was built from communities.communities in order, so zipping
# the two pairs every community label with the posts (graph nodes) in it.
labelled_communities = list(zip(community_dict, communities.communities))[:max_tag_clouds]

for community, nodes in labelled_communities:
    # Count the tag words of exactly the posts in this community. Going
    # through blog names instead would pull in every post of a blog and count
    # them once per community post of that blog.
    tag_counts = Counter()
    for raw_tags in df.loc[list(nodes), 'secondary_tags'].dropna():
        tag_counts.update(str(raw_tags).translate(tag_list_markup).split())

    # WordCloud cannot render an empty frequency table.
    if not tag_counts:
        print(f"Skipping word cloud {community}: no tags")
        continue

    # Generate the tag cloud from the tag counts
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(tag_counts)

    # Save the tag cloud as <tag_folder_name>/tagcloud_Community_<n>.png.
    # The file name is built outside the f-string expression: reusing the same
    # quote character inside an f-string is a syntax error before Python 3.12.
    print(f"Creating word cloud {community}")
    file_stem = community.replace(" ", "_")
    wordcloud.to_file(f"{tag_folder_name}/tagcloud_{file_stem}.png")
    

Creating word cloud Community 1
Creating word cloud Community 2
Creating word cloud Community 3
Creating word cloud Community 4
Creating word cloud Community 5
Creating word cloud Community 6
Creating word cloud Community 7
Creating word cloud Community 8
Creating word cloud Community 9
Creating word cloud Community 10
Creating word cloud Community 11
Creating word cloud Community 12
Creating word cloud Community 13
Creating word cloud Community 14
Creating word cloud Community 15
Creating word cloud Community 16
Creating word cloud Community 17
Creating word cloud Community 18
Creating word cloud Community 19
Creating word cloud Community 20


In [310]:
# Output folder for the word clouds: <search_date>/<primary tag>/word_cloud.
# The tag is read by column name so it does not depend on column order or on
# the frame having at least two rows.
word_folder_name = folder_path / df['primary_tag'].iloc[0].replace(" ", "_") / 'word_cloud'
word_folder_name.mkdir(parents=True, exist_ok=True)

In [312]:
stop_words = ENGLISH_STOP_WORDS  # You can also use your own custom list

# Upper bound on the number of communities that get a word cloud.
max_word_clouds = 20

# community_dict was built from communities.communities in order, so zipping
# the two pairs every community label with the posts (graph nodes) in it.
labelled_communities = list(zip(community_dict, communities.communities))[:max_word_clouds]

for community, nodes in labelled_communities:
    # Count the non-stop-words of exactly the posts in this community. Going
    # through blog names instead would pull in every post of a blog and count
    # them once per community post of that blog.
    word_counts = Counter()
    # dropna(): posts without a body are stored as NaN and have no words.
    for body in df.loc[list(nodes), 'post_body'].dropna():
        word_counts.update(
            word for word in str(body).split() if word.lower() not in stop_words
        )

    # WordCloud cannot render an empty frequency table.
    if not word_counts:
        print(f"Skipping word cloud {community}: no words")
        continue

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)

    # Save as <word_folder_name>/wordcloud_Community_<n>.png. The file name is
    # built outside the f-string expression: reusing the same quote character
    # inside an f-string is a syntax error before Python 3.12.
    print(f"Creating word cloud {community}")
    file_stem = community.replace(" ", "_")
    wordcloud.to_file(f"{word_folder_name}/wordcloud_{file_stem}.png")

Creating word cloud Community 1
Creating word cloud Community 2
Creating word cloud Community 3
Creating word cloud Community 4
Creating word cloud Community 5
Creating word cloud Community 6
Creating word cloud Community 7
Creating word cloud Community 8
Creating word cloud Community 9
Creating word cloud Community 10
Creating word cloud Community 11
Creating word cloud Community 12
Creating word cloud Community 13
Creating word cloud Community 14
Creating word cloud Community 15
Creating word cloud Community 16
Creating word cloud Community 17
Creating word cloud Community 18
Creating word cloud Community 19
Creating word cloud Community 20
