In [89]:
# [Cell 1] - Import required packages and setup
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Union, Optional
import logging
from pathlib import Path
# Configure logging to display in notebook
logging.basicConfig(level=logging.INFO, format='%(message)s')


In [90]:
# [Cell 2] - Load the data
with open('/Users/hongxuzhou/rt_data/Related/cleaned_deathpositive_2024-11-27_05-42-41-750.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
print(f"Successfully loaded data with {len(data)} items")



Successfully loaded data with 50 items


In [91]:
# [Cell 3] - Initial data inspection
print("Structure of first item:")
first_item = data[0]
print("\nMain keys:")
for key in first_item.keys():
    print(f"- {key}")

# Display the type and length of main components
print("\nComponent details:")
if 'related' in first_item:
    print(f"Related hashtags: {len(first_item['related'])} items")
if 'topPosts' in first_item:
    print(f"Top posts: {len(first_item['topPosts'])} posts")



Structure of first item:

Main keys:
- name
- topPosts
- related

Component details:
Related hashtags: 61 items
Top posts: 31 posts


In [92]:
# [Cell 4] - Examine related hashtags format
print("\nSample of related hashtags (first 5):")
if first_item.get('related'):
    for hashtag_info in first_item['related'][:5]:
        print(f"#{hashtag_info['hash'][1:]}: {hashtag_info['info']}")




Sample of related hashtags (first 5):
#death: 11.9 m
#cemetery: 3.29 m
#graveyard: 2.35 m
#macabre: 1.72 m
#oddities: 1.56 m


In [93]:
# [Cell 5] - Data completeness check
print("\nData completeness check:")
has_related = sum(1 for item in data if 'related' in item and item['related'])
has_topposts = sum(1 for item in data if 'topPosts' in item and item['topPosts'])

print(f"Items with related hashtags: {has_related}/{len(data)}")
print(f"Items with top posts: {has_topposts}/{len(data)}")




Data completeness check:
Items with related hashtags: 1/50
Items with top posts: 49/50


In [94]:
# [Cell 6] - Examine number formats in related hashtags
print("\nUnique number formats in 'info' field:")
if first_item.get('related'):
    formats = set()
    for item in first_item['related']:
        formats.add(item['info'])
    print("Sample formats:", list(formats)[:5])


Unique number formats in 'info' field:
Sample formats: ['129.17 k', '51.45 k', '32.48 k', '118.16 k', '60.15 k']


In [95]:
# [Cell 7] - Define normalization functions
def normalize_count(value: str) -> Optional[float]:
    """
    Convert string counts like '129.17 k' to numerical values
    Returns float or None if invalid
    """
    try:
        value = value.lower().strip()
        if 'k' in value:
            # Remove 'k' and convert to thousands
            num = float(value.replace('k', '').strip())
            return num * 1000
        if 'm' in value:
            # Remove 'm' and convert to millions
            num = float(value.replace('m', '').strip())
            return num * 1000000
        return float(value)
    except (ValueError, AttributeError) as e:
        logging.warning(f"Could not normalize value: {value}, Error: {str(e)}")
        return None



In [96]:
# [Cell 8] - Test normalization function
test_values = ['129.17 k', '51.45 k', '11.9 m', '32.48 k', '118.16 k']
print("Testing normalization:")
for value in test_values:
    normalized = normalize_count(value)
    print(f"Original: {value:10} -> Normalized: {normalized:,.2f}")



Testing normalization:
Original: 129.17 k   -> Normalized: 129,170.00
Original: 51.45 k    -> Normalized: 51,450.00
Original: 11.9 m     -> Normalized: 11,900,000.00
Original: 32.48 k    -> Normalized: 32,480.00
Original: 118.16 k   -> Normalized: 118,160.00


In [97]:
# [Cell 9] - Create normalized hashtag dictionary
def create_hashtag_dict(item: Dict) -> Dict[str, float]:
    """
    Create dictionary of hashtag:count pairs from related field
    """
    hashtag_counts = {}
    if item.get('related'):
        for hashtag_info in item['related']:
            tag = hashtag_info['hash'].lstrip('#')  # Remove # if present
            count = normalize_count(hashtag_info['info'])
            if count is not None:
                hashtag_counts[tag.lower()] = count
    return hashtag_counts


In [98]:

# [Cell 10] - Apply normalization to first item
first_hashtag_counts = create_hashtag_dict(first_item)
print("\nNormalized hashtag counts (first 5):")
for tag, count in list(first_hashtag_counts.items())[:5]:
    print(f"#{tag}: {count:,.0f}")


Normalized hashtag counts (first 5):
#death: 11,900,000
#cemetery: 3,290,000
#graveyard: 2,350,000
#macabre: 1,720,000
#oddities: 1,560,000


In [99]:
# [Cell 11] - Process full dataset
def process_dataset(data: List[Dict]) -> List[Dict[str, Dict[str, float]]]:
    """
    Process entire dataset, creating normalized hashtag counts for each item
    Returns list of processed items with hashtag counts
    """
    processed_items = []
    
    for item in data:
        # Try related hashtags first
        hashtag_counts = create_hashtag_dict(item)
        
        # If no related hashtags, log it
        if not hashtag_counts:
            logging.info(f"No related hashtags found for item {len(processed_items)}")
            # Fallback mechanism would go here
            
        processed_items.append({
            'main_hashtag': item.get('name', 'unknown'),
            'hashtag_counts': hashtag_counts
        })
    
    return processed_items



In [100]:
# [Cell 12] - Run processing and check results
processed_data = process_dataset(data)

# Display summary
print(f"\nProcessed {len(processed_data)} items")
print("\nSample of processed items:")
for item in processed_data[:2]:  # Show first 2 items
    print(f"\nMain hashtag: {item['main_hashtag']}")
    print(f"Number of related hashtags: {len(item['hashtag_counts'])}")
    if item['hashtag_counts']:
        print("Top 3 related hashtags by count:")
        top_3 = sorted(item['hashtag_counts'].items(), 
                      key=lambda x: x[1], 
                      reverse=True)[:3]
        for tag, count in top_3:
            print(f"  #{tag}: {count:,.0f}")

No related hashtags found for item 1
No related hashtags found for item 2
No related hashtags found for item 3
No related hashtags found for item 4
No related hashtags found for item 5
No related hashtags found for item 6
No related hashtags found for item 7
No related hashtags found for item 8
No related hashtags found for item 9
No related hashtags found for item 10
No related hashtags found for item 11
No related hashtags found for item 12
No related hashtags found for item 13
No related hashtags found for item 14
No related hashtags found for item 15
No related hashtags found for item 16
No related hashtags found for item 17
No related hashtags found for item 18
No related hashtags found for item 19
No related hashtags found for item 20
No related hashtags found for item 21
No related hashtags found for item 22
No related hashtags found for item 23
No related hashtags found for item 24
No related hashtags found for item 25
No related hashtags found for item 26
No related hashtags f


Processed 50 items

Sample of processed items:

Main hashtag: deathpositive
Number of related hashtags: 61
Top 3 related hashtags by count:
  #death: 11,900,000
  #cemetery: 3,290,000
  #graveyard: 2,350,000

Main hashtag: deathpositivepup
Number of related hashtags: 0


In [101]:
# [Cell 13] - Implement fallback mechanism
def extract_hashtags_from_posts(item: Dict) -> Dict[str, float]:
    """
    Extract and count hashtags from topPosts as fallback
    Returns dictionary of hashtag counts normalized to be comparable with 'related' data
    """
    hashtag_counts = {}
    
    if not item.get('topPosts'):
        logging.warning("No topPosts available for fallback counting")
        return hashtag_counts
        
    # Count hashtags from all posts
    total_posts = len(item['topPosts'])
    for post in item['topPosts']:
        if post.get('hashtags'):
            for tag in post['hashtags']:
                tag = tag.lower()  # Normalize tag case
                hashtag_counts[tag] = hashtag_counts.get(tag, 0) + 1
    
    # Remove the main hashtag if it exists in counts
    main_tag = item.get('name', '').lower()
    if main_tag in hashtag_counts:
        del hashtag_counts[main_tag]
    
    return hashtag_counts



In [102]:
# [Cell 14] - Updated processing function with fallback
def process_dataset_with_fallback(data: List[Dict]) -> List[Dict]:
    """
    Process dataset using related hashtags when available, falling back to topPosts when necessary
    """
    processed_items = []
    
    for item in data:
        # Try related hashtags first
        hashtag_counts = create_hashtag_dict(item)
        
        # Fallback to topPosts if no related hashtags
        if not hashtag_counts:
            logging.info(f"Using fallback for item {len(processed_items)}")
            hashtag_counts = extract_hashtags_from_posts(item)
        
        processed_items.append({
            'main_hashtag': item.get('name', 'unknown'),
            'hashtag_counts': hashtag_counts,
            'source': 'related' if create_hashtag_dict(item) else 'topPosts'
        })
    
    return processed_items



In [103]:
# [Cell 14] - Updated processing function with fallback
def process_dataset_with_fallback(data: List[Dict]) -> List[Dict]:
    """
    Process dataset using related hashtags when available, falling back to topPosts when necessary
    """
    processed_items = []
    
    for item in data:
        # Try related hashtags first
        hashtag_counts = create_hashtag_dict(item)
        
        # Fallback to topPosts if no related hashtags
        if not hashtag_counts:
            logging.info(f"Using fallback for item {len(processed_items)}")
            hashtag_counts = extract_hashtags_from_posts(item)
        
        processed_items.append({
            'main_hashtag': item.get('name', 'unknown'),
            'hashtag_counts': hashtag_counts,
            'source': 'related' if create_hashtag_dict(item) else 'topPosts'
        })
    
    return processed_items



In [104]:
# [Cell 15] - Test updated processing
processed_data_with_fallback = process_dataset_with_fallback(data)

# Display summary of results
sources = {'related': 0, 'topPosts': 0}
for item in processed_data_with_fallback:
    sources[item['source']] += 1

print("\nProcessing Summary:")
print(f"Total items processed: {len(processed_data_with_fallback)}")
print(f"Items using related data: {sources['related']}")
print(f"Items using topPosts fallback: {sources['topPosts']}")

# Show sample results from each source
print("\nSample results:")
for item in processed_data_with_fallback[:2]:
    print(f"\nMain hashtag: {item['main_hashtag']}")
    print(f"Data source: {item['source']}")
    print(f"Number of related hashtags: {len(item['hashtag_counts'])}")
    if item['hashtag_counts']:
        print("Top 3 related hashtags:")
        top_3 = sorted(item['hashtag_counts'].items(), 
                      key=lambda x: x[1], 
                      reverse=True)[:3]
        for tag, count in top_3:
            print(f"  #{tag}: {count}")

Using fallback for item 1
Using fallback for item 2
Using fallback for item 3
Using fallback for item 4
Using fallback for item 5
Using fallback for item 6
Using fallback for item 7
Using fallback for item 8
Using fallback for item 9
Using fallback for item 10
Using fallback for item 11
Using fallback for item 12
Using fallback for item 13
Using fallback for item 14
Using fallback for item 15
Using fallback for item 16
Using fallback for item 17
Using fallback for item 18
Using fallback for item 19
Using fallback for item 20
Using fallback for item 21
Using fallback for item 22
Using fallback for item 23
Using fallback for item 24
Using fallback for item 25
Using fallback for item 26
Using fallback for item 27
Using fallback for item 28
Using fallback for item 29
Using fallback for item 30
Using fallback for item 31
Using fallback for item 32
Using fallback for item 33
Using fallback for item 34
Using fallback for item 35
Using fallback for item 36
Using fallback for item 37
Using fall


Processing Summary:
Total items processed: 50
Items using related data: 1
Items using topPosts fallback: 49

Sample results:

Main hashtag: deathpositive
Data source: related
Number of related hashtags: 61
Top 3 related hashtags:
  #death: 11900000.0
  #cemetery: 3290000.0
  #graveyard: 2350000.0

Main hashtag: deathpositivepup
Data source: topPosts
Number of related hashtags: 162
Top 3 related hashtags:
  #kermitthedog: 11
  #deathpositive: 7
  #grieftherapykermit: 6


In [105]:
# [Cell 16] - Add normalization for comparing across sources
def normalize_counts_for_comparison(hashtag_counts: Dict[str, float], 
                                 source: str) -> Dict[str, float]:
    """
    Normalize counts to be comparable between sources
    """
    if not hashtag_counts:
        return {}
        
    if source == 'related':
        # Already normalized by Instagram's counting
        return hashtag_counts
    elif source == 'topPosts':
        # Convert to relative frequencies
        total_hashtags = sum(hashtag_counts.values())
        return {
            tag: (count / total_hashtags) 
            for tag, count in hashtag_counts.items()
        }



In [106]:
# [Cell 17] - Update processing function
def process_dataset_normalized(data: List[Dict]) -> List[Dict]:
    """
    Process dataset with normalized counts for both sources
    """
    processed_items = []
    
    for item in data:
        # Try related hashtags first
        hashtag_counts = create_hashtag_dict(item)
        source = 'related'
        
        # Fallback to topPosts if no related hashtags
        if not hashtag_counts:
            hashtag_counts = extract_hashtags_from_posts(item)
            source = 'topPosts'
        
        # Normalize counts based on source
        normalized_counts = normalize_counts_for_comparison(hashtag_counts, source)
        
        processed_items.append({
            'main_hashtag': item.get('name', 'unknown'),
            'hashtag_counts': normalized_counts,
            'source': source,
            'total_hashtags': len(normalized_counts)
        })
    
    return processed_items



In [107]:
# [Cell 18] - Test normalized processing
normalized_data = process_dataset_normalized(data)

# Display sample results
print("\nSample results with normalized counts:")
for item in normalized_data[:2]:
    print(f"\nMain hashtag: {item['main_hashtag']}")
    print(f"Data source: {item['source']}")
    print(f"Number of hashtags: {item['total_hashtags']}")
    if item['hashtag_counts']:
        print("Top 3 related hashtags:")
        top_3 = sorted(item['hashtag_counts'].items(), 
                      key=lambda x: x[1], 
                      reverse=True)[:3]
        for tag, count in top_3:
            print(f"  #{tag}: {count:.6f}")

No topPosts available for fallback counting



Sample results with normalized counts:

Main hashtag: deathpositive
Data source: related
Number of hashtags: 61
Top 3 related hashtags:
  #death: 11900000.000000
  #cemetery: 3290000.000000
  #graveyard: 2350000.000000

Main hashtag: deathpositivepup
Data source: topPosts
Number of hashtags: 162
Top 3 related hashtags:
  #kermitthedog: 0.040892
  #deathpositive: 0.026022
  #grieftherapykermit: 0.022305


In [108]:
# [Cell 19] - Standardize all measurements to relative frequencies
def convert_to_relative_frequencies(hashtag_counts: Dict[str, float]) -> Dict[str, float]:
    """
    Convert any hashtag counts to relative frequencies
    """
    if not hashtag_counts:
        return {}
    
    # Calculate total for normalization
    total = sum(hashtag_counts.values())
    
    # Convert to relative frequencies
    return {
        tag: count/total 
        for tag, count in hashtag_counts.items()
    }

def process_dataset_standardized(data: List[Dict]) -> List[Dict]:
    """
    Process dataset using standardized relative frequencies for all items
    """
    processed_items = []
    
    for item in data:
        # Try related hashtags first
        hashtag_counts = create_hashtag_dict(item)
        source = 'related'
        
        # Fallback to topPosts if no related hashtags
        if not hashtag_counts:
            hashtag_counts = extract_hashtags_from_posts(item)
            source = 'topPosts'
        
        # Convert to relative frequencies regardless of source
        freq_counts = convert_to_relative_frequencies(hashtag_counts)
        
        processed_items.append({
            'main_hashtag': item.get('name', 'unknown'),
            'hashtag_frequencies': freq_counts,
            'source': source,
            'total_hashtags': len(freq_counts)
        })
    
    return processed_items

# [Cell 20] - Test standardized processing
standardized_data = process_dataset_standardized(data)

# Display sample results
print("\nSample results with standardized frequencies:")
for item in standardized_data[:2]:
    print(f"\nMain hashtag: {item['main_hashtag']}")
    print(f"Data source: {item['source']}")
    print(f"Number of hashtags: {item['total_hashtags']}")
    if item['hashtag_frequencies']:
        print("Top 3 related hashtags:")
        top_3 = sorted(item['hashtag_frequencies'].items(), 
                      key=lambda x: x[1], 
                      reverse=True)[:3]
        for tag, freq in top_3:
            print(f"  #{tag}: {freq:.6f}")

# [Cell 21] - Basic statistics about frequencies
print("\nFrequency Statistics:")
all_frequencies = []
for item in standardized_data:
    if item['hashtag_frequencies']:
        all_frequencies.extend(item['hashtag_frequencies'].values())

if all_frequencies:
    print(f"Mean frequency: {np.mean(all_frequencies):.6f}")
    print(f"Median frequency: {np.median(all_frequencies):.6f}")
    print(f"Min frequency: {min(all_frequencies):.6f}")
    print(f"Max frequency: {max(all_frequencies):.6f}")

No topPosts available for fallback counting



Sample results with standardized frequencies:

Main hashtag: deathpositive
Data source: related
Number of hashtags: 61
Top 3 related hashtags:
  #death: 0.346047
  #cemetery: 0.095672
  #graveyard: 0.068337

Main hashtag: deathpositivepup
Data source: topPosts
Number of hashtags: 162
Top 3 related hashtags:
  #kermitthedog: 0.040892
  #deathpositive: 0.026022
  #grieftherapykermit: 0.022305

Frequency Statistics:
Mean frequency: 0.011551
Median frequency: 0.004310
Min frequency: 0.000880
Max frequency: 1.000000


In [109]:
# [Cell 25] - Fixed PMI calculation with debugging
def calculate_pmi_fixed(standardized_data: List[Dict]) -> Dict[str, Dict[str, float]]:
    """
    Calculate PMI with improved error handling and debugging
    """
    # Calculate individual hashtag probabilities
    total_occurrences = 0
    hashtag_counts = {}
    
    # First pass: collect all hashtags and their frequencies
    for item in standardized_data:
        main_tag = item['main_hashtag'].lower()  # Normalize case
        # Add main hashtag to counts
        hashtag_counts[main_tag] = hashtag_counts.get(main_tag, 0) + 1
        
        # Add related hashtags to counts
        for tag, freq in item['hashtag_frequencies'].items():
            tag = tag.lower()  # Normalize case
            hashtag_counts[tag] = hashtag_counts.get(tag, 0) + freq
            total_occurrences += freq
    
    # Calculate probabilities
    hashtag_probs = {
        tag: (count/total_occurrences) 
        for tag, count in hashtag_counts.items()
    }
    
    # Debug info
    print(f"Total unique hashtags found: {len(hashtag_counts)}")
    print(f"Total occurrences: {total_occurrences}")
    
    # Calculate PMI scores
    pmi_scores = {}
    missed_tags = set()  # Track any tags we miss
    
    for item in standardized_data:
        main_tag = item['main_hashtag'].lower()
        if main_tag not in pmi_scores:
            pmi_scores[main_tag] = {}
        
        for tag, freq in item['hashtag_frequencies'].items():
            tag = tag.lower()
            
            # Skip self-collocations
            if tag == main_tag:
                continue
            
            # Verify we have probabilities for both tags
            if main_tag not in hashtag_probs or tag not in hashtag_probs:
                missed_tags.add(tag if tag not in hashtag_probs else main_tag)
                continue
            
            # Calculate joint probability
            joint_prob = freq / total_occurrences
            
            # Calculate PMI if we have non-zero probabilities
            if joint_prob > 0 and hashtag_probs[main_tag] > 0 and hashtag_probs[tag] > 0:
                pmi = np.log2(joint_prob / (hashtag_probs[main_tag] * hashtag_probs[tag]))
                pmi_scores[main_tag][tag] = pmi
    
    if missed_tags:
        print(f"\nWarning: Could not calculate PMI for {len(missed_tags)} tags")
        print("Sample of missed tags:", list(missed_tags)[:5])
    
    return pmi_scores

# [Cell 26] - Test fixed PMI calculation
print("Calculating PMI scores...")
pmi_results = calculate_pmi_fixed(standardized_data)

# Display results
print("\nPMI Results Summary:")
print(f"Number of main hashtags with collocations: {len(pmi_results)}")

# Show sample of results
print("\nSample of PMI scores:")
for main_tag, collocations in list(pmi_results.items())[:2]:  # Show first 2 items
    if collocations:
        print(f"\nMain hashtag: #{main_tag}")
        print("Top 5 collocations:")
        top_5 = sorted(collocations.items(), 
                      key=lambda x: x[1], 
                      reverse=True)[:5]
        for tag, pmi in top_5:
            print(f"  #{tag}: {pmi:.4f}")

Calculating PMI scores...
Total unique hashtags found: 3125
Total occurrences: 49.00000000000082

PMI Results Summary:
Number of main hashtags with collocations: 50

Sample of PMI scores:

Main hashtag: #deathpositive
Top 5 collocations:
  #project_necropolis: 3.9420
  #haunting: 3.9420
  #graveyard_dead: 3.9420
  #graveyard_freaks: 3.9420
  #grave_affair: 3.9420

Main hashtag: #deathpositivepup
Top 5 collocations:
  #theresalwaysdogs: 5.5245
  #dogs: 5.5245
  #dogsofinstagram: 5.5245
  #doglover: 5.5245
  #deathconcious: 5.5245


In [110]:
# [Cell 29] - Adjusted PMI analysis with debugging
def analyze_collocations_debug(standardized_data: List[Dict], 
                             min_freq: float = 0.001) -> Dict[str, Dict[str, dict]]:
    """
    Calculate PMI with debugging information and adjusted threshold
    """
    # Initialize counts
    pair_counts = {}
    hashtag_counts = {}
    total_occurrences = 0
    
    # First pass: collect frequencies
    print("Collecting frequencies...")
    for item in standardized_data:
        main_tag = item['main_hashtag'].lower()
        if main_tag not in pair_counts:
            pair_counts[main_tag] = {}
            
        # Count co-occurrences
        for tag, freq in item['hashtag_frequencies'].items():
            tag = tag.lower()
            if tag == main_tag:
                continue
                
            hashtag_counts[tag] = hashtag_counts.get(tag, 0) + freq
            hashtag_counts[main_tag] = hashtag_counts.get(main_tag, 0) + freq
            
            if tag not in pair_counts[main_tag]:
                pair_counts[main_tag][tag] = 0
            pair_counts[main_tag][tag] += freq
            total_occurrences += freq
    
    print(f"\nTotal occurrences: {total_occurrences:.4f}")
    print(f"Number of unique hashtags: {len(hashtag_counts)}")
    
    # Calculate enhanced statistics
    results = {}
    filtered_pairs = 0
    total_pairs = 0
    
    for main_tag in pair_counts:
        results[main_tag] = {}
        
        for tag, pair_count in pair_counts[main_tag].items():
            total_pairs += 1
            # Debug frequency filtering
            if pair_count/total_occurrences < min_freq:
                filtered_pairs += 1
                continue
                
            # Calculate probabilities
            p_xy = pair_count/total_occurrences
            p_x = hashtag_counts[main_tag]/total_occurrences
            p_y = hashtag_counts[tag]/total_occurrences
            
            # Calculate PMI
            pmi = np.log2(p_xy/(p_x * p_y))
            
            results[main_tag][tag] = {
                'pmi': pmi,
                'count': pair_count,
                'joint_prob': p_xy,
                'tag_prob': p_y
            }
    
    print(f"\nFiltering statistics:")
    print(f"Total hashtag pairs examined: {total_pairs}")
    print(f"Pairs filtered out (freq < {min_freq}): {filtered_pairs}")
    print(f"Pairs retained: {total_pairs - filtered_pairs}")
    
    return results

# [Cell 30] - Run adjusted analysis
print("Running adjusted collocation analysis...")
collocation_results = analyze_collocations_debug(standardized_data, min_freq=0.001)

# Display detailed results
print("\nDetailed Collocation Analysis:")
for main_tag, collocations in list(collocation_results.items())[:2]:  # Show first 2
    if not collocations:  # Skip if no collocations
        continue
        
    print(f"\nMain hashtag: #{main_tag}")
    print("Top 5 collocations by PMI:")
    
    # Sort by PMI score
    sorted_collocations = sorted(
        collocations.items(),
        key=lambda x: x[1]['pmi'],
        reverse=True
    )[:5]
    
    for tag, stats in sorted_collocations:
        print(f"\n  #{tag}:")
        print(f"    PMI: {stats['pmi']:.4f}")
        print(f"    Count: {stats['count']:.4f}")
        print(f"    Joint Probability: {stats['joint_prob']:.4f}")
        print(f"    Tag Probability: {stats['tag_prob']:.4f}")

Running adjusted collocation analysis...
Collecting frequencies...

Total occurrences: 49.0000
Number of unique hashtags: 3124

Filtering statistics:
Total hashtag pairs examined: 4242
Pairs filtered out (freq < 0.001): 4076
Pairs retained: 166

Detailed Collocation Analysis:

Main hashtag: #deathpositive
Top 5 collocations by PMI:

  #macabre:
    PMI: 3.2436
    Count: 0.0500
    Joint Probability: 0.0010
    Tag Probability: 0.0017

  #graveyard:
    PMI: 2.9278
    Count: 0.0683
    Joint Probability: 0.0014
    Tag Probability: 0.0028

  #cemetery:
    PMI: 2.6105
    Count: 0.0957
    Joint Probability: 0.0020
    Tag Probability: 0.0049

  #death:
    PMI: 2.3281
    Count: 0.3460
    Joint Probability: 0.0071
    Tag Probability: 0.0216


In [111]:
# [Cell 31] - Complete PMI analysis for all hashtags
def analyze_all_collocations(standardized_data: List[Dict], 
                           min_freq: float = 0.001) -> Dict[str, Dict[str, dict]]:
    """
    Calculate PMI for all hashtags with improved tracking
    """
    # Initialize counts
    pair_counts = {}
    hashtag_counts = {}
    total_occurrences = 0
    processed_tags = set()
    
    print("Processing data...")
    # First pass: collect all hashtags and frequencies
    for item in standardized_data:
        main_tag = item['main_hashtag'].lower()
        processed_tags.add(main_tag)
        
        if main_tag not in pair_counts:
            pair_counts[main_tag] = {}
            
        # Count co-occurrences
        for tag, freq in item['hashtag_frequencies'].items():
            tag = tag.lower()
            if tag == main_tag:
                continue
                
            hashtag_counts[tag] = hashtag_counts.get(tag, 0) + freq
            hashtag_counts[main_tag] = hashtag_counts.get(main_tag, 0) + freq
            
            if tag not in pair_counts[main_tag]:
                pair_counts[main_tag][tag] = 0
            pair_counts[main_tag][tag] += freq
            total_occurrences += freq
    
    print(f"\nData Summary:")
    print(f"Total main hashtags processed: {len(processed_tags)}")
    print(f"Total unique hashtags found: {len(hashtag_counts)}")
    print(f"Total occurrences: {total_occurrences:.4f}")
    
    # Calculate PMI for each main hashtag
    results = {}
    filtered_counts = {tag: 0 for tag in processed_tags}
    retained_counts = {tag: 0 for tag in processed_tags}
    
    for main_tag in processed_tags:
        results[main_tag] = {}
        
        if main_tag not in pair_counts:
            print(f"Warning: No pairs found for #{main_tag}")
            continue
            
        for tag, pair_count in pair_counts[main_tag].items():
            # Apply frequency filter
            if pair_count/total_occurrences < min_freq:
                filtered_counts[main_tag] += 1
                continue
                
            # Calculate probabilities
            p_xy = pair_count/total_occurrences
            p_x = hashtag_counts[main_tag]/total_occurrences
            p_y = hashtag_counts[tag]/total_occurrences
            
            # Calculate PMI
            pmi = np.log2(p_xy/(p_x * p_y))
            
            results[main_tag][tag] = {
                'pmi': pmi,
                'count': pair_count,
                'joint_prob': p_xy,
                'tag_prob': p_y
            }
            retained_counts[main_tag] += 1
    
    # Display results for each main hashtag
    print("\nResults by main hashtag:")
    for tag in processed_tags:
        total = filtered_counts[tag] + retained_counts[tag]
        if total > 0:
            print(f"\n#{tag}:")
            print(f"  Pairs filtered: {filtered_counts[tag]}")
            print(f"  Pairs retained: {retained_counts[tag]}")
            print(f"  Total pairs: {total}")
    
    return results

# [Cell 32] - Run complete analysis
print("Running complete collocation analysis...")
full_results = analyze_all_collocations(standardized_data)

# Display top collocations for each hashtag
print("\nTop Collocations for Each Hashtag:")
for main_tag, collocations in full_results.items():
    if not collocations:  # Skip if no collocations
        continue
        
    print(f"\nMain hashtag: #{main_tag}")
    print("Top 5 collocations by PMI:")
    
    # Sort by PMI score
    sorted_collocations = sorted(
        collocations.items(),
        key=lambda x: x[1]['pmi'],
        reverse=True
    )[:5]
    
    for tag, stats in sorted_collocations:
        print(f"\n  #{tag}:")
        print(f"    PMI: {stats['pmi']:.4f}")
        print(f"    Count: {stats['count']:.4f}")
        print(f"    Joint Probability: {stats['joint_prob']:.4f}")

Running complete collocation analysis...
Processing data...

Data Summary:
Total main hashtags processed: 50
Total unique hashtags found: 3124
Total occurrences: 49.0000

Results by main hashtag:

#deathpositivemi:
  Pairs filtered: 18
  Pairs retained: 3
  Total pairs: 21

#deathpositivehamilton:
  Pairs filtered: 101
  Pairs retained: 2
  Total pairs: 103

#deathpositiveartists:
  Pairs filtered: 45
  Pairs retained: 2
  Total pairs: 47

#deathpositivemoment:
  Pairs filtered: 104
  Pairs retained: 3
  Total pairs: 107

#deathpositiveuk:
  Pairs filtered: 33
  Pairs retained: 6
  Total pairs: 39

#deathpositivedog:
  Pairs filtered: 25
  Pairs retained: 3
  Total pairs: 28

#deathpositiveevents:
  Pairs filtered: 29
  Pairs retained: 4
  Total pairs: 33

#deathpositivejewelry:
  Pairs filtered: 56
  Pairs retained: 0
  Total pairs: 56

#deathpositivecinema:
  Pairs filtered: 27
  Pairs retained: 0
  Total pairs: 27

#deathpositivegifts:
  Pairs filtered: 0
  Pairs retained: 9
  Total