In [None]:
import json
import pandas as pd
import ast
import re

# --- Data Cleaning and Validation Functions ---

def extract_codes_deep(codes):
    """
    Recursively extract and flatten code strings from nested inputs.

    Supported shapes:
      * list -> every item is processed recursively;
      * dict -> the value stored under 'code' is kept when it is a string;
      * str  -> a bare code matching ``^[A-Z]{2,}-[A-Z]{2,}$`` is kept; text wrapped
                in ``[...]`` or ``{...}`` is parsed (Python literal first, JSON as a
                fallback) and the parsed value is processed recursively.

    Any other input contributes no codes.

    Args:
        codes: The raw value of an entry's 'codes' field.

    Returns:
        list: Codes in the order they were encountered (duplicates are kept).
    """
    result = []
    if isinstance(codes, list):
        for item in codes:
            result.extend(extract_codes_deep(item))
    elif isinstance(codes, dict):
        code_value = codes.get('code')
        if isinstance(code_value, str):
            result.append(code_value)
    elif isinstance(codes, str):
        stripped_code = codes.strip()
        looks_like_container = (
            (stripped_code.startswith('[') and stripped_code.endswith(']'))
            or (stripped_code.startswith('{') and stripped_code.endswith('}'))
        )
        if looks_like_container:
            parsed_code = None
            try:
                parsed_code = ast.literal_eval(stripped_code)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a Python literal. JSON text using null/true/false lands here,
                # so try the JSON parser before giving up. (A bracketed string can
                # never match the bare-code pattern, so there is nothing else to try.)
                try:
                    parsed_code = json.loads(stripped_code)
                except (ValueError, RecursionError):
                    parsed_code = None
            if parsed_code is not None:
                result.extend(extract_codes_deep(parsed_code))
        elif re.match(r"^[A-Z]{2,}-[A-Z]{2,}$", stripped_code):
            result.append(stripped_code)
    return result

def clean_entry_and_capture_failures(entry, failed_ids_list):
    """
    Validate one raw JSON record and reshape it into a clean dictionary.

    A record is accepted when it is a dict that carries 'codes', 'reason' and
    'hashed_videoId', yields at least one code via extract_codes_deep, has a
    non-blank string reason, and has a truthy hashed_videoId.

    Args:
        entry: The raw record (expected to be a dict).
        failed_ids_list (list): Mutated in place; receives the record's
            hashed_videoId whenever validation fails and that id is truthy.

    Returns:
        dict | None: {'codes', 'reason', 'hashed_videoId'} with de-duplicated,
        sorted codes and a stripped reason, or None when the record is rejected.
    """
    is_mapping = isinstance(entry, dict)
    # Read the id up front so that every rejection path can record it.
    video_id = entry.get('hashed_videoId') if is_mapping else None

    def reject():
        # Records without a usable id are rejected without being logged.
        if video_id:
            failed_ids_list.append(video_id)
        return None

    # 1. Essential keys must all be present.
    required_keys = ('codes', 'reason', 'hashed_videoId')
    if not is_mapping or any(key not in entry for key in required_keys):
        return reject()

    # 2. At least one code must be extractable.
    codes_extracted = extract_codes_deep(entry['codes'])
    if not codes_extracted:
        return reject()

    # 3. The reason must be a non-blank string and the id must be truthy.
    reason = entry.get('reason')
    reason_is_usable = isinstance(reason, str) and bool(reason.strip())
    if not (reason_is_usable and video_id):
        return reject()

    # 4. Assemble the clean record.
    unique_codes = list(set(codes_extracted))
    unique_codes.sort()
    return {
        'codes': unique_codes,
        'reason': reason.strip(),
        'hashed_videoId': video_id
    }

# --- Main Data Loading and Processing Pipeline ---

# 1. Specify file paths
result_file_path = 'your_results.json'
dataset_path = '../your_dataset.csv'

# 2. Initialize lists for cleaned data and failed IDs
cleaned_data = []
failed_video_ids = []
results_df = None
# Bound up front so later cells can test `enriched_data is None` instead of
# hitting a NameError when loading fails.
enriched_data = None

# 3. Load and clean the JSON data
try:
    with open(result_file_path, 'r', encoding='utf-8') as file:
        raw_data = json.load(file)
    print(f"Successfully loaded {result_file_path} with {len(raw_data)} records.")

    # Apply the cleaning function to each entry; rejected entries come back as None
    cleaned_data = [clean_entry_and_capture_failures(e, failed_video_ids) for e in raw_data]
    cleaned_data = [item for item in cleaned_data if item is not None]

    # Entries rejected without a usable id never reach failed_video_ids, so the
    # skipped total is derived from the record counts rather than from that list.
    skipped_count = len(raw_data) - len(cleaned_data)

    # Remove duplicates from failed_video_ids
    failed_video_ids = sorted(set(failed_video_ids))

    if not cleaned_data:
        raise ValueError("No valid data remaining after cleaning. Check JSON file structure.")

    results_df = pd.DataFrame(cleaned_data)
    print("Data cleaning complete.")
    print(f"  - {len(results_df)} valid records processed.")
    print(f"  - {skipped_count} records failed validation and were skipped.")

except FileNotFoundError:
    print(f"Error: The file {result_file_path} was not found. Please check the file path.")
except json.JSONDecodeError:
    print(f"Error: The file {result_file_path} is not a valid JSON. Please check its content.")
except ValueError as e:
    print(e)

# 4. Load external dataset and enrich the cleaned data
if results_df is not None:
    # Fall back to the un-enriched frame unless the merge below succeeds.
    enriched_data = results_df
    try:
        dataset = pd.read_csv(dataset_path)
        print(f"Successfully loaded {dataset_path}.")

        # Both columns are needed: one as the join key, one as the payload.
        missing_columns = [col for col in ('hashed_videoId', 'dateTime') if col not in dataset.columns]
        if not missing_columns:
            # Merge the cleaned data with the dataset to add 'dateTime'
            enriched_data = pd.merge(results_df, dataset[['hashed_videoId', 'dateTime']], on='hashed_videoId', how='left')
            print("Cleaned data enriched with 'dateTime' column.")
            print("\n--- Sample of Final Enriched Data ---")
            print(enriched_data.head())

            # You can now inspect or save the list of failed video IDs
            print(f"\n--- {len(failed_video_ids)} Failed Video IDs Captured ---")
            # print(failed_video_ids) # Uncomment to display the list of failed IDs

            # Optionally, save the failed IDs to a file
            # with open('failed_video_ids.txt', 'w') as f:
            #     for video_id in failed_video_ids:
            #         f.write(f"{video_id}\n")
            # print("Failed video IDs saved to failed_video_ids.txt")
        else:
            print(f"Error: column(s) {missing_columns} not found in the CSV dataset. Cannot enrich data.")

    except FileNotFoundError:
        print(f"Error: The dataset file {dataset_path} was not found. Cannot enrich data.")
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Error: The dataset file {dataset_path} could not be parsed ({e}). Cannot enrich data.")
else:
    print(f"Cannot proceed with data enrichment due to issues with {result_file_path}.")


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Build the frame used for plotting.
# NOTE(review): enrich_with_datetime, result_data and dataset are not defined in
# this cell -- confirm they exist earlier in the notebook (the loading cell
# produces `enriched_data`, which may be what was intended here).
plot_df = enrich_with_datetime(result_data, dataset)

# Parse 'dateTime' into a datetime column and derive a quarterly period column
# (the column is spelled 'Quarture' throughout this cell).
plot_df = plot_df.assign(
    date=pd.to_datetime(plot_df['dateTime']),
    Quarture =lambda x: x['date'].dt.to_period('Q')
)
# Keep only rows at or after the cutoff; .copy() detaches the filtered frame.
# NOTE(review): an earlier comment said "start from 2019Q4", but the bound is the
# string 'Jan2019' (presumably coerced to 2019Q1) -- confirm the intended cutoff.
plot_df = plot_df[plot_df['Quarture'] >= 'Jan2019'].copy()

# Count videos per quarter
grouped_data = plot_df.groupby('Quarture')['hashed_videoId'].count()

# Line chart of the per-quarter counts
fig, ax = plt.subplots(figsize=(16, 8))
grouped_data.plot(kind='line', marker='o', ax=ax, linewidth=2)
ax.set_title('Video Publication Timeline', fontsize=14, pad=20)
ax.set_xlabel('')
ax.set_ylabel('Video Count', fontsize=12)

# One tick per quarter, labelled as abbreviated month over year
ax.set_xticks(grouped_data.index)
ax.set_xticklabels(
    [pd.to_datetime(str(x)).strftime('%b\n%Y') for x in grouped_data.index],
    rotation=0,
    fontsize=10
)

# Write each count slightly above its data point.
# NOTE(review): the x position here is a Timestamp while the line was plotted
# from a PeriodIndex -- verify the labels land on the markers.
for date, count in zip(grouped_data.index, grouped_data.values):
    ax.text(
        pd.to_datetime(str(date)),
        count + 0.5,
        f'{count}',
        ha='center',
        va='bottom',
        fontsize=9
    )

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd

# Parse 'dateTime' into pandas datetimes (overwrites the column in place on enriched_data)
enriched_data['dateTime'] = pd.to_datetime(enriched_data['dateTime'])

# Derive the quarterly period of each row; later cells group on this 'quarter' column
enriched_data['quarter'] = enriched_data['dateTime'].dt.to_period('Q')


In [None]:

# NOTE(review): `merged_data` is not created in this cell; it is expected to be a
# DataFrame with 'element' and 'need' columns -- confirm where it is built.

# 1) Distribution of content tagged with each 'element' (NaN kept as its own bucket)
element_counts = merged_data['element'].value_counts(dropna=False).rename('Count')
element_percentages = (merged_data['element'].value_counts(normalize=True, dropna=False) * 100).round(2).rename('Percentage')

element_stats = pd.concat([element_counts, element_percentages], axis=1)
print("Content distribution by element:")
print(element_stats)

# 2) Distribution of 'need' within each 'element' (counts and row-wise percentages)
# Counts cross-tab
element_need_counts = pd.crosstab(merged_data['element'], merged_data['need'], dropna=False)
print("\nElement x Need (counts):")
print(element_need_counts)

# Row-wise percent within element
element_need_pct = (pd.crosstab(merged_data['element'], merged_data['need'], normalize='index', dropna=False) * 100).round(2)
print("\nElement x Need (row-wise %):")
print(element_need_pct)

# 3) Optional: overall need distribution (useful context)
need_counts = merged_data['need'].value_counts(dropna=False).rename('Count')
need_percentages = (merged_data['need'].value_counts(normalize=True, dropna=False) * 100).round(2).rename('Percentage')
need_stats = pd.concat([need_counts, need_percentages], axis=1)
print("\nOverall need distribution:")
print(need_stats)


# 4) Stacked (long) format for stacked bar charts
element_need_stacked = (
    element_need_counts
    .reset_index()
    .melt(id_vars='element', var_name='Need', value_name='Count')
)
# Add row-wise percentage per element
row_totals = element_need_stacked.groupby('element')['Count'].transform('sum')
element_need_stacked['Percentage'] = (element_need_stacked['Count'] / row_totals * 100).round(2)


# 5) Optional: concise textual summary
total = len(merged_data)  # kept for later cells; not used by the summary below
print("\nSummary:")
for el, row in element_stats.iterrows():
    print(f"- {el}: {int(row['Count'])} videos ({row['Percentage']}%)")
# The closing parenthesis was missing on the next line, which made the whole cell a SyntaxError.
print("\nNeed distribution within each element (%):")
for el, row in element_need_pct.iterrows():
    parts = [f"{need}={row[need]}%" for need in element_need_pct.columns]
    print(f"- {el}: " + ", ".join(parts))


In [None]:
#Jaccard scores for all co-occurring code pairs for each element and quarter

from itertools import combinations
import pandas as pd
import numpy as np

def build_jaccard_scores_over_time(enriched_data, needs_df, id_column='hashed_videoId'):
    """
    Calculates Jaccard scores for all co-occurring code pairs for each element and quarter.

    Within every (element, quarter) group the score of a pair is
    ``videos containing both codes / videos containing either code``.

    Args:
        enriched_data (pd.DataFrame): Must contain `id_column`, 'codes' (an iterable
            of codes per row) and 'quarter'.
        needs_df (pd.DataFrame): Must contain `id_column` and 'element'.
        id_column (str): Join key shared by both frames. Defaults to
            'hashed_videoId', the key the rest of the notebook uses; the previous
            hard-coded 'videoId' is not a column produced by the loading cell.

    Returns:
        list: A list of dictionaries, each containing the element, quarter
        (as a string), code pair ("A & B") and Jaccard score.
    """
    # Merge to get 'element' associated with each video
    merged_data = pd.merge(enriched_data, needs_df[[id_column, 'element']], on=id_column, how='inner')

    all_scores = []

    # Group data by both element and quarter
    for (element, quarter), group in merged_data.groupby(['element', 'quarter']):
        code_counts = {}          # code -> number of videos in the group containing it
        cooccurrence_counts = {}  # (code1, code2) with code1 < code2 -> videos containing both

        for codes_list in group['codes']:
            # De-duplicate within a video and sort so each pair has one canonical order
            unique_codes_in_video = sorted(set(codes_list))
            for code in unique_codes_in_video:
                code_counts[code] = code_counts.get(code, 0) + 1
            for pair in combinations(unique_codes_in_video, 2):
                cooccurrence_counts[pair] = cooccurrence_counts.get(pair, 0) + 1

        # Calculate Jaccard score for each pair that co-occurred
        for (code1, code2), intersection in cooccurrence_counts.items():
            union = code_counts[code1] + code_counts[code2] - intersection
            jaccard_score = intersection / union if union > 0 else 0.0

            all_scores.append({
                'element': element,
                'quarter': str(quarter),
                'code_pair': f"{code1} & {code2}",
                'jaccard': jaccard_score
            })

    return all_scores


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_smoothed_jaccard_heatmap(jaccard_scores, target_element, rolling_window=2):
    """
    Generates a heatmap of smoothed Jaccard scores for a specific element.
    - Applies a trailing rolling average to the scores over time.
    - Rows are code pairs, columns are quarters.

    Args:
        jaccard_scores (list): Dictionaries with 'element', 'quarter',
            'code_pair' and 'jaccard' keys.
        target_element: Element whose rows are plotted.
        rolling_window (int): Number of quarters averaged per cell.

    Returns:
        None. Prints a message and returns early when there is nothing to plot.
    """
    # Convert the list of scores to a DataFrame
    scores_df = pd.DataFrame(jaccard_scores)

    # An empty score list yields a frame without an 'element' column; treat it
    # as "no data" instead of failing with a KeyError on the filter below.
    if 'element' not in scores_df.columns:
        print(f"No data available to plot for element: {target_element}")
        return

    # Filter for the specific element we want to plot
    element_df = scores_df[scores_df['element'] == target_element]

    if element_df.empty:
        print(f"No data available to plot for element: {target_element}")
        return

    # Pivot the data to create the base for the heatmap (code pairs vs. time)
    heatmap_base = element_df.pivot_table(
        index='code_pair',
        columns='quarter',
        values='jaccard'
    ).fillna(0)  # Fill missing pairs in a quarter with 0

    # Ensure quarters are sorted chronologically
    heatmap_base = heatmap_base.reindex(sorted(heatmap_base.columns), axis=1)

    # --- Apply the rolling average to smooth the data across time (the key step) ---
    # rolling(axis=1) is deprecated in recent pandas, so roll over the rows of the
    # transposed frame (quarters) and transpose back.
    smoothed_heatmap_df = heatmap_base.T.rolling(
        window=rolling_window,
        min_periods=1
    ).mean().T

    # --- Plotting ---
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(18, 10))  # Adjust size as needed

    sns.heatmap(
        smoothed_heatmap_df,
        ax=ax,
        cmap='YlOrRd',
        linewidths=.5,
        annot=True,  # Display the Jaccard scores on the cells
        fmt=".2f",  # Format annotations to two decimal places
        cbar_kws={'label': f'Jaccard Score ({rolling_window}-Quarter Rolling Avg)'}
    )

    ax.set_title(f"Smoothed Temporal Trend of Jaccard Scores for: {target_element}", fontsize=16)
    ax.set_xlabel("Time (Quarter)", fontsize=12)
    ax.set_ylabel("Code Pair", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns

# --- Data Processing Function from Previous Step ---
def build_jaccard_cooccurrence_per_element(enriched_data, needs_data):
    """
    Builds Jaccard co-occurrence matrices for codes, grouped by the 'element'
    (e.g., Tourist, Migrant, Pilgrim, Worker).

    Each cell (i, j) with i != j holds
    ``videos containing both codes / videos containing either code``;
    the diagonal is 0.

    Args:
        enriched_data (pd.DataFrame): DataFrame with 'hashed_videoId' and 'codes' columns.
        needs_data (pd.DataFrame): DataFrame with 'hashed_videoId' and 'element' columns.

    Returns:
        dict: A dictionary where keys are elements and values are the Jaccard
        matrices (float DataFrames indexed and labelled by the sorted codes).
        Elements whose videos carry no codes are omitted.
    """
    merged_data = pd.merge(enriched_data, needs_data[['hashed_videoId', 'element']], on='hashed_videoId', how='inner')
    cooccurrence_matrices = {}

    for element, group in merged_data.groupby('element'):
        # One set per video so a code repeated within a video is counted once
        code_sets = [set(codes) for codes in group['codes']]
        all_codes_list = sorted(set().union(*code_sets))

        if not all_codes_list:
            continue

        # Accumulate in plain NumPy arrays and wrap them in a DataFrame at the end.
        # This avoids per-cell .loc writes and avoids writing through
        # DataFrame.values, which is read-only under pandas Copy-on-Write.
        position = {code: idx for idx, code in enumerate(all_codes_list)}
        size = len(all_codes_list)
        code_counts = np.zeros(size)
        pair_counts = np.zeros((size, size))

        for unique_codes in code_sets:
            indices = [position[code] for code in unique_codes]
            for idx in indices:
                code_counts[idx] += 1
            for first, second in combinations(indices, 2):
                pair_counts[first, second] += 1
                pair_counts[second, first] += 1

        # |A or B| = |A| + |B| - |A and B|, computed for every pair at once
        union = code_counts[:, None] + code_counts[None, :] - pair_counts
        jaccard = np.divide(pair_counts, union, out=np.zeros_like(pair_counts), where=union > 0)
        # The diagonal is already 0 (a code is never paired with itself); keep it explicit.
        np.fill_diagonal(jaccard, 0.0)

        cooccurrence_matrices[element] = pd.DataFrame(jaccard, index=all_codes_list, columns=all_codes_list)

    return cooccurrence_matrices

# --- Updated Visualization Function ---
def plot_jaccard_heatmap(matrix, element_name):
    """
    Generates and displays a heatmap for a Jaccard co-occurrence matrix.

    Args:
        matrix (pd.DataFrame): The Jaccard co-occurrence matrix.
        element_name (str): The name of the element (e.g., 'Tourist') for the plot title.

    Returns:
        None. Prints a message and returns early when the matrix is empty or
        has fewer than two columns.
    """
    # Do not plot if the matrix is empty or too small
    if matrix.empty or len(matrix.columns) < 2:
        print(f"Skipping heatmap for '{element_name}' due to insufficient data.")
        return

    plt.figure(figsize=(12, 10))
    sns.heatmap(matrix, annot=True, fmt='.2f', cmap='YlOrRd',
                cbar_kws={'label': 'Jaccard Index'})
    # The separator was mojibake ('â€“', a mis-decoded UTF-8 en dash); written as an
    # escape so the title survives any further re-encoding of the file.
    plt.title(f'Jaccard Index Co-occurrence Matrix \u2013 {element_name}', fontsize=16)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()
    
# --- Example Usage ---

# 1. Inputs: this cell does not load any files. It relies on `enriched_data` and
# `needs_df` already being defined by earlier cells.
# NOTE(review): earlier comments here described reading 'analysis_results.json' and
# 'needs_results.json' as line-delimited JSON (lines=True); no such load happens in
# this cell -- confirm where `needs_df` is created.
try:
    # 2. Generate the co-occurrence matrices per element
    jaccard_matrices_per_element = build_jaccard_cooccurrence_per_element(enriched_data, needs_df)

    # 3. Iterate through the generated matrices and plot a heatmap for each one
    print("Generating heatmaps for each element...")
    for element, matrix in jaccard_matrices_per_element.items():
        plot_jaccard_heatmap(matrix, element)
    print("Done.")

# NOTE(review): nothing in the try body opens a file, so this handler looks like a
# leftover from a version that loaded the JSON files here.
except FileNotFoundError:
    print("Make sure 'analysis_results.json' and 'needs_results.json' are in the correct directory.")
# Any other failure is reported as a printed message and not re-raised.
except Exception as e:
    print(f"An error occurred: {e}")


Maslow Needs Projections by Elements

In [None]:
from itertools import combinations
import pandas as pd
import numpy as np

def build_jaccard_scores_longitudinal(enriched_data, needs_df):
    """
    Compute Jaccard scores of co-occurring code pairs per (element, quarter).

    Args:
        enriched_data (pd.DataFrame): Must contain 'hashed_videoId', 'codes'
            (an iterable of codes per row) and 'quarter'.
        needs_df (pd.DataFrame): Must contain 'hashed_videoId' and 'element'.

    Returns:
        pd.DataFrame: One row per (element, quarter, code pair) with columns
        'element', 'quarter' (as a string), 'code_pair' ("A & B") and 'jaccard'.
        The columns are present even when there are no rows, so callers can
        filter on them safely.
    """
    output_columns = ['element', 'quarter', 'code_pair', 'jaccard']
    # The join key is 'hashed_videoId'; the doubled 'hashed_hashed_videoId' used
    # previously is not a column produced anywhere in this notebook.
    merged_data = pd.merge(enriched_data, needs_df[['hashed_videoId', 'element']], on='hashed_videoId', how='inner')
    records = []
    for (element, quarter), group in merged_data.groupby(['element', 'quarter']):
        code_counts = {}  # code -> videos in the group containing it
        pair_counts = {}  # (code1, code2) with code1 < code2 -> videos containing both
        for codes_list in group['codes']:
            unique_codes = sorted(set(codes_list))
            for code in unique_codes:
                code_counts[code] = code_counts.get(code, 0) + 1
            # Keyed by tuple rather than by the "A & B" label, so codes that
            # themselves contain " & " cannot break a later split.
            for pair in combinations(unique_codes, 2):
                pair_counts[pair] = pair_counts.get(pair, 0) + 1
        for (code1, code2), intersection in pair_counts.items():
            union = code_counts[code1] + code_counts[code2] - intersection
            if union > 0:
                records.append({
                    'element': element,
                    'quarter': str(quarter),
                    'code_pair': f"{code1} & {code2}",
                    'jaccard': intersection / union
                })
    return pd.DataFrame(records, columns=output_columns)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_longitudinal_heatmap_for_distinct_pairs(jaccard_df, element, threshold=0.44, rolling_window=2):
    """
    Plot a quarter-by-quarter heatmap of the most distinct code pairs of one element.

    A code pair is kept when its Jaccard score exceeds `threshold` in at least one
    quarter; all of its scores are then shown (not only those above the threshold),
    smoothed with a trailing rolling mean of `rolling_window` quarters. Quarters in
    which a kept pair has no score are treated as 0 before smoothing.

    Args:
        jaccard_df (pd.DataFrame): Columns 'element', 'quarter', 'code_pair', 'jaccard'.
        element: Element whose rows are plotted.
        threshold (float): Minimum peak score a pair must exceed to be shown.
        rolling_window (int): Window size of the rolling mean, in quarters.

    Returns:
        None. Prints a message and returns early when nothing qualifies.
    """
    element_rows = jaccard_df[jaccard_df['element'] == element]
    if element_rows.empty:
        print(f"No data for {element}")
        return

    # Peak score of every pair decides whether the pair is shown at all
    peak_by_pair = element_rows.groupby('code_pair')['jaccard'].max()
    kept_pairs = peak_by_pair[peak_by_pair > threshold].index.tolist()
    if not kept_pairs:
        print(f"No code pairs exceed threshold for {element}")
        return

    # Wide layout: one row per kept pair, one column per quarter, gaps filled with 0
    selected_rows = element_rows[element_rows['code_pair'].isin(kept_pairs)]
    wide = selected_rows.pivot(index='code_pair', columns='quarter', values='jaccard')
    wide = wide.fillna(0)
    ordered_quarters = sorted(wide.columns)
    wide = wide.reindex(ordered_quarters, axis=1)

    # Roll along time by transposing, smoothing the rows, and transposing back
    by_quarter = wide.T
    smoothed_by_quarter = by_quarter.rolling(window=rolling_window, min_periods=1).mean()
    pivot_smooth = smoothed_by_quarter.T

    # Figure height grows with the number of pairs, capped at 16
    figure_height = min(1 + len(pivot_smooth) * 0.45, 16)
    plt.figure(figsize=(16, figure_height))
    colorbar_options = {'label': f'Jaccard Score (Rolling avg, {rolling_window}q)'}
    sns.heatmap(
        pivot_smooth,
        cmap='YlOrRd',
        annot=True,
        fmt='.2f',
        linewidths=0.5,
        cbar_kws=colorbar_options,
    )
    plt.title(f"Most Distinct Code Co-Occurrence Trends for {element}\n(Code pairs ever >{threshold}, all values shown)", fontsize=15)
    plt.xlabel("Quarter")
    plt.ylabel("Code Pair")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
from itertools import combinations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- 1. Adapted function to generate matrices with a time component ---
def build_jaccard_matrices_over_time(enriched_data, needs_df):
    """
    Builds Jaccard co-occurrence matrices for codes, grouped by element, need, and quarter.

    Each cell (i, j) with i != j holds
    ``videos containing both codes / videos containing either code``;
    the diagonal is 0.

    Args:
        enriched_data (pd.DataFrame): DataFrame with 'hashed_videoId', 'codes', and 'quarter' columns.
        needs_df (pd.DataFrame): DataFrame with 'hashed_videoId', 'element', and 'need' columns.

    Returns:
        dict: A dictionary where keys are tuples of (element, need, quarter) and
              values are the corresponding Jaccard co-occurrence matrices.
              Groups with fewer than two distinct codes are omitted.
    """
    # Merge the dataframes to bring all necessary columns together
    merged_data = pd.merge(enriched_data, needs_df[['hashed_videoId', 'element', 'need']], on='hashed_videoId', how='inner')

    cooccurrence_matrices = {}

    # Group by element, need, AND quarter
    for (element, need, quarter), group in merged_data.groupby(['element', 'need', 'quarter']):
        # One set per video so a code repeated within a video is counted once
        code_sets = [set(codes) for codes in group['codes']]
        all_codes_list = sorted(set().union(*code_sets))

        # A single code cannot form a pair
        if len(all_codes_list) < 2:
            continue

        # Accumulate in plain NumPy arrays and wrap them in a DataFrame at the end.
        # This avoids per-cell .loc writes and avoids writing through
        # DataFrame.values, which is read-only under pandas Copy-on-Write.
        position = {code: idx for idx, code in enumerate(all_codes_list)}
        size = len(all_codes_list)
        code_counts = np.zeros(size)
        pair_counts = np.zeros((size, size))

        for unique_codes in code_sets:
            indices = [position[code] for code in unique_codes]
            for idx in indices:
                code_counts[idx] += 1
            for first, second in combinations(indices, 2):
                pair_counts[first, second] += 1
                pair_counts[second, first] += 1

        # |A or B| = |A| + |B| - |A and B|, computed for every pair at once
        union = code_counts[:, None] + code_counts[None, :] - pair_counts
        jaccard = np.divide(pair_counts, union, out=np.zeros_like(pair_counts), where=union > 0)
        # The diagonal is already 0 (a code is never paired with itself); keep it explicit.
        np.fill_diagonal(jaccard, 0.0)

        cooccurrence_matrices[(element, need, quarter)] = pd.DataFrame(
            jaccard, index=all_codes_list, columns=all_codes_list
        )

    return cooccurrence_matrices

# --- 2. Function to plot the trends (largely the same as before) ---
def plot_jaccard_trends_by_element(cooccurrence_matrices_over_time):
    """
    Plot, for every element, the rolling average Jaccard score per need over time.

    Each matrix is reduced to one number: the sum of its cells divided by
    ``rows * (columns - 1)``, i.e. the number of off-diagonal cells of a square
    matrix. Quarters before '2020-Q1' are dropped, scores are pivoted to
    quarter x need, smoothed with a 3-quarter trailing mean, and drawn on one
    subplot per element.

    Args:
        cooccurrence_matrices_over_time (dict): Maps (element, need, quarter) to a
            Jaccard matrix (pd.DataFrame).

    Returns:
        None. Prints a message and returns early when there is nothing to plot.
    """
    trend_data = []
    for (element, need, quarter), matrix in cooccurrence_matrices_over_time.items():
        if matrix.empty or matrix.shape[0] < 2:
            avg_score = 0
        else:
            num_pairs = matrix.shape[0] * (matrix.shape[1] - 1)
            total_jaccard_sum = matrix.values.sum()
            avg_score = total_jaccard_sum / num_pairs if num_pairs > 0 else 0

        trend_data.append({
            'quarter': quarter,
            'need': need,
            'element': element,
            'average_jaccard': avg_score
        })

    if not trend_data:
        print("No data available to plot.")
        return

    trends_df = pd.DataFrame(trend_data)
    trends_df = trends_df.sort_values('quarter')
    trends_df = trends_df[trends_df['quarter'] >= '2020-Q1']

    elements = sorted(trends_df['element'].unique())
    if not elements:
        # Everything was older than the cutoff
        print("No data available to plot.")
        return

    # Only needs listed here are drawn, in this order
    desired_order = ['Basic', 'Safety', 'Esteem', 'Social', 'Self-actualization']

    # Size the grid from the number of elements (2x2 for four elements, as before)
    # instead of assuming exactly four; squeeze=False keeps `axes` two-dimensional.
    ncols = 2
    nrows = -(-len(elements) // ncols)  # ceiling division
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(18, 6 * nrows),
        sharex=True, sharey=True, squeeze=False
    )

    for i, element in enumerate(elements):
        ax = axes[i // ncols, i % ncols]
        group_df = trends_df[trends_df['element'] == element]
        pivot_df = group_df.pivot_table(
            index='quarter', columns='need', values='average_jaccard', aggfunc='mean'
        )

        # Keep only columns present for this element, in the desired order
        pivot_df = pivot_df[[col for col in desired_order if col in pivot_df.columns]]

        # Dynamic subplot title with element name
        ax.set_title(f'Average Jaccard Score Trends by Need for {element}', fontsize=14)
        ax.set_xlabel('Quarter')
        ax.set_ylabel('Avg. Jaccard Score')

        if pivot_df.empty:
            # None of this element's needs is in desired_order; leave the axes blank
            continue

        rolling_avg_df = pivot_df.rolling(window=3, min_periods=1).mean()
        # Format quarter labels as 'YYYY-Qx'
        rolling_avg_df.index = pd.PeriodIndex(rolling_avg_df.index, freq='Q').strftime('%Y-Q%q')
        rolling_avg_df.plot(ax=ax, marker='o')

        # DataFrame.plot may reset the axis labels, so set them again afterwards
        ax.set_xlabel('Quarter')
        ax.set_ylabel('Avg. Jaccard Score')
        ax.legend(title='Need Category', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True)

    # Hide grid cells left over when the element count does not fill the grid
    for unused_ax in axes.flat[len(elements):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()


# Generate the matrices, now keyed by (element, need, quarter).
# Relies on `enriched_data` and `needs_df` defined in earlier cells.
jaccard_matrices_by_quarter = build_jaccard_matrices_over_time(enriched_data, needs_df)

# Plot the per-element trends from the generated matrices
plot_jaccard_trends_by_element(jaccard_matrices_by_quarter)



#Needs by Element

In [None]:
from itertools import combinations
import pandas as pd
import numpy as np

def build_jaccard_matrices_over_time(enriched_data, needs_df):
    """
    Build one Jaccard co-occurrence matrix of codes per (element, need, quarter).

    Each cell (i, j) with i != j holds
    ``videos containing both codes / videos containing either code``;
    the diagonal is 0.

    Args:
        enriched_data (pd.DataFrame): Must contain 'hashed_videoId', 'codes'
            (an iterable of codes per row) and 'quarter'.
        needs_df (pd.DataFrame): Must contain 'hashed_videoId', 'element' and 'need'.

    Returns:
        dict: Maps (element, need, quarter) to a float DataFrame indexed and
        labelled by the sorted codes of that group. Groups with fewer than two
        distinct codes are omitted.
    """
    merged_data = pd.merge(enriched_data, needs_df[['hashed_videoId', 'element', 'need']], on='hashed_videoId', how='inner')
    cooccurrence_matrices = {}
    for (element, need, quarter), group in merged_data.groupby(['element', 'need', 'quarter']):
        # One set per video so a code repeated within a video is counted once
        code_sets = [set(codes) for codes in group['codes']]
        all_codes_list = sorted(set().union(*code_sets))
        # A single code cannot form a pair
        if len(all_codes_list) < 2:
            continue
        # Accumulate in plain NumPy arrays and wrap them in a DataFrame at the end.
        # This avoids per-cell .loc writes and avoids writing through
        # DataFrame.values, which is read-only under pandas Copy-on-Write.
        position = {code: idx for idx, code in enumerate(all_codes_list)}
        size = len(all_codes_list)
        code_counts = np.zeros(size)
        pair_counts = np.zeros((size, size))
        for unique_codes in code_sets:
            indices = [position[code] for code in unique_codes]
            for idx in indices:
                code_counts[idx] += 1
            for first, second in combinations(indices, 2):
                pair_counts[first, second] += 1
                pair_counts[second, first] += 1
        # |A or B| = |A| + |B| - |A and B|, computed for every pair at once
        union = code_counts[:, None] + code_counts[None, :] - pair_counts
        jaccard = np.divide(pair_counts, union, out=np.zeros_like(pair_counts), where=union > 0)
        # The diagonal is already 0 (a code is never paired with itself); keep it explicit.
        np.fill_diagonal(jaccard, 0.0)
        cooccurrence_matrices[(element, need, quarter)] = pd.DataFrame(
            jaccard, index=all_codes_list, columns=all_codes_list
        )
    return cooccurrence_matrices


In [None]:
import matplotlib.pyplot as plt

def plot_jaccard_trends_for_element(cooccurrence_matrices_over_time, target_element):
    """
    Plot the rolling average Jaccard score per need over time for one element.

    Each matrix of the target element is reduced to one number: the sum of its
    cells divided by ``rows * (columns - 1)``, i.e. the number of off-diagonal
    cells of a square matrix. Quarters before '2020-Q1' are dropped, scores are
    pivoted to quarter x need and smoothed with a 3-quarter trailing mean.

    Args:
        cooccurrence_matrices_over_time (dict): Maps (element, need, quarter) to a
            Jaccard matrix (pd.DataFrame).
        target_element: Element to plot.

    Returns:
        None. Prints a message and returns early when the element has no data.
    """
    no_data_message = f"No data available to plot for {target_element}."

    trend_data = []
    for (element, need, quarter), matrix in cooccurrence_matrices_over_time.items():
        # Filter by element here so the "no data" guard below also covers an
        # element that is absent from an otherwise non-empty dictionary.
        if element != target_element:
            continue
        if matrix.empty or matrix.shape[0] < 2:
            avg_score = 0
        else:
            num_pairs = matrix.shape[0] * (matrix.shape[1] - 1)
            total_jaccard_sum = matrix.values.sum()
            avg_score = total_jaccard_sum / num_pairs if num_pairs > 0 else 0
        trend_data.append({
            'quarter': quarter,
            'need': need,
            'element': element,
            'average_jaccard': avg_score
        })
    if not trend_data:
        print(no_data_message)
        return

    trends_df = pd.DataFrame(trend_data)
    trends_df = trends_df[trends_df['quarter'] >= '2020-Q1']
    trends_df = trends_df.sort_values('quarter')
    if trends_df.empty:
        # Everything for this element was older than the cutoff
        print(no_data_message)
        return

    # Only needs listed here are drawn, in this order
    desired_order = ['Basic', 'Safety', 'Esteem', 'Social', 'Self-actualization']
    pivot_df = trends_df.pivot_table(
        index='quarter', columns='need', values='average_jaccard', aggfunc='mean'
    )
    pivot_df = pivot_df[[col for col in desired_order if col in pivot_df.columns]]
    if pivot_df.empty:
        # None of this element's needs is in desired_order
        print(no_data_message)
        return

    rolling_avg_df = pivot_df.rolling(window=3, min_periods=1).mean()
    # Format quarter labels as 'YYYY-Qx'
    rolling_avg_df.index = pd.PeriodIndex(rolling_avg_df.index, freq='Q').strftime('%Y-Q%q')
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(12, 7))
    rolling_avg_df.plot(ax=ax, marker='o')
    ax.set_title(f'Rolling Average Jaccard Score Trends by Need for {target_element}', fontsize=16)
    ax.set_xlabel('Quarter')
    # One tick per row of the smoothed frame, labelled with the quarter strings
    ax.set_xticks(range(len(rolling_avg_df.index)))
    ax.set_xticklabels(rolling_avg_df.index, rotation=45, ha='right')
    ax.set_ylabel('Average Jaccard Score')
    ax.legend(title='Need Category', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# Build the (element, need, quarter) -> Jaccard matrix dictionary.
# Use your actual data variables for enriched_data and needs_df.
cooccurrence_matrices_over_time = build_jaccard_matrices_over_time(enriched_data, needs_df)


In [None]:
# Distinct element names (first component of each key), sorted
elements = sorted({el for (el, _, _) in cooccurrence_matrices_over_time.keys()})


In [None]:
# Draw one trend chart per element
for element in elements:
    plot_jaccard_trends_for_element(cooccurrence_matrices_over_time, element)


This script defines functions that take the merged DataFrame (one row per video, with its codes, need and dateTime), calculate the average Jaccard score of co-occurring code pairs for each need category in each quarter, smooth those scores with a trailing rolling average, and generate a line chart to visualize the trends.

In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns

# Assume 'merged_data' is the DataFrame you have loaded into your environment.

def calculate_average_jaccard_for_period(data_period):
    """
    Calculates the average Jaccard score ONLY for pairs of codes that
    actually appear together in a given dataframe slice (quarter).

    Args:
        data_period: Object whose ['codes'] item iterates over one collection
            of codes per row (e.g. a DataFrame slice).

    Returns:
        float: Mean of ``rows with both codes / rows with either code`` over all
        pairs that co-occur at least once; 0.0 when fewer than two distinct
        codes are present or no pair co-occurs.
    """
    code_counts = {}  # code -> rows containing it
    pair_counts = {}  # (code1, code2) with code1 < code2 -> rows containing both

    # Single pass over the rows instead of rescanning every row for every
    # candidate pair.
    for codes in data_period['codes']:
        present = sorted(set(codes))
        for code in present:
            code_counts[code] = code_counts.get(code, 0) + 1
        for pair in combinations(present, 2):
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

    if len(code_counts) < 2:
        return 0.0

    # Sorted pair order equals the order of combinations() over the sorted codes,
    # so the scores are averaged in the same order as before.
    jaccard_scores = []
    for code1, code2 in sorted(pair_counts):
        intersection = pair_counts[(code1, code2)]
        union = code_counts[code1] + code_counts[code2] - intersection
        jaccard_scores.append(intersection / union if union > 0 else 0.0)

    # Return the average of the scores from co-occurring pairs.
    return np.mean(jaccard_scores) if jaccard_scores else 0.0

def generate_smoothed_needs_trends(merged_data, freq='QE', window_size=2, start_date='2020-01-01'):
    """
    Generates a DataFrame of smoothed temporal trends for each need category
    using a trailing rolling average.
    """
    merged_data['dateTime'] = pd.to_datetime(merged_data['dateTime'])
    data = merged_data[merged_data['dateTime'] >= start_date].copy()

    needs = sorted(data['need'].dropna().unique())
    trends_data = {need: {} for need in needs}

    data.set_index('dateTime', inplace=True)
    grouped_by_period = data.groupby(pd.Grouper(freq=freq))

    for period_end, period_group in grouped_by_period:
        if period_group.empty: continue
        period_label = f"{period_end.year}-Q{period_end.quarter}"
        for need in needs:
            need_period_data = period_group[period_group['need'] == need]
            avg_jaccard = calculate_average_jaccard_for_period(need_period_data) if not need_period_data.empty else 0.0
            trends_data[need][period_label] = avg_jaccard

    trends_df = pd.DataFrame(trends_data).fillna(0)
    if trends_df.empty:
        print("Warning: No data to plot after processing.")
        return pd.DataFrame()

    trends_df.index = pd.PeriodIndex(trends_df.index, freq='Q').to_timestamp()
    trends_df = trends_df.sort_index()

    # Use a standard "trailing" rolling average (center=False is the default)
    smoothed_trends_df = trends_df.rolling(window=window_size, min_periods=1).mean()
    
    smoothed_trends_df.index = smoothed_trends_df.index.to_period('Q').strftime('%Y-Q%q')
    
    return smoothed_trends_df

def _order_need_columns(columns, desired_order):
    """Return `columns` with those named in `desired_order` first (case-insensitive), the rest after in their original order."""
    by_lowercase = {str(column).lower(): column for column in columns}
    ordered = []
    for wanted in desired_order:
        key = str(wanted).lower()
        if key in by_lowercase and by_lowercase[key] not in ordered:
            ordered.append(by_lowercase[key])
    # Keep needs that are missing from the desired order instead of dropping them.
    remaining = [column for column in columns if column not in ordered]
    return ordered + remaining


def plot_sorted_needs_trends(trends_df, chart_title='Smoothed Temporal Need Trends No Identity', need_order=None):
    """
    Generates and displays a line plot with a sorted legend based on Maslow's hierarchy.

    Args:
        trends_df: DataFrame with one row per period and one column per need.
        chart_title: Title drawn above the chart.
        need_order: Optional sequence of need names giving the legend order.
            Matching is case-insensitive. Columns not named here are plotted
            after the listed ones rather than being left out. Defaults to
            the built-in order below.

    Returns:
        None. Prints a message and returns early when `trends_df` is empty.
    """
    if trends_df.empty:
        print("Cannot plot an empty DataFrame.")
        return

    if need_order is None:
        # NOTE(review): Maslow's hierarchy is usually listed with the social
        # (love/belonging) level before esteem. The order is kept unchanged so
        # existing charts look the same — confirm whether it is intentional.
        need_order = ['Basic', 'Safety', 'Esteem', 'Social', 'Self-actualization']

    # Re-index the DataFrame columns to match the desired order
    ordered_columns = _order_need_columns(trends_df.columns.tolist(), need_order)
    sorted_trends_df = trends_df[ordered_columns]

    # Plotting
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(18, 10))
    palette = sns.color_palette("viridis", n_colors=len(sorted_trends_df.columns))

    sorted_trends_df.plot(kind='line', marker='o', ax=ax, color=palette)

    ax.set_title(chart_title, fontsize=20, pad=20)
    ax.set_xlabel('Time (Quarter)', fontsize=14)
    ax.set_ylabel('Average Jaccard Score (Smoothed)', fontsize=14)
    ax.legend(title='Need Category', fontsize=12)
    plt.yticks(fontsize=12)
    # Rotation, alignment and font size of the x tick labels are set in one call.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)
    plt.tight_layout()
    plt.show()

# --- Example of How to Run the Script ---

# 1. Generate the smoothed data from your 'merged_data' DataFrame
smoothed_data = generate_smoothed_needs_trends(merged_data, window_size=2)

# 2. Plot the final chart with the sorted legend (skipped when there is nothing to plot)
if not smoothed_data.empty:
    plot_sorted_needs_trends(smoothed_data)


# Qualitative analysis of cases

In [None]:
import pandas as pd

# enriched_data, needs_df, and dataset are expected to exist already as DataFrames.

# Step 1: Join the selected coding columns with the selected need columns on 'videoId'.
merged_temp = enriched_data[['videoId', 'reason', 'codes', 'dateTime']].merge(
    needs_df[['videoId', 'element', 'need', 'reasoning']],
    on='videoId',
    how='inner',
)

# Step 2: Attach 'videoLink' from 'dataset', again matching on 'videoId'.
merged_data = merged_temp.merge(
    dataset[['videoId', 'videoLink']],
    on='videoId',
    how='inner',
)

# videoId whose qualitative assessment should be displayed
target_video_id = 7312832352240028974

# All merged rows that belong to that video
row = merged_data[merged_data['videoId'] == target_video_id]

if row.empty:
    print(f"No entry found for videoId {target_video_id}")
else:
    from pprint import pprint

    # Only the first matching row is reported; a missing field falls back to ''.
    first_match = row.iloc[0]
    result = {
        field: first_match.get(field, '')
        for field in ('videoLink', 'codes', 'reason', 'element', 'need', 'dateTime', 'reasoning')
    }
    # Display the results
    pprint(result)



In [None]:
#looping over dataset

import pandas as pd

# Make sure 'dateTime' is a pandas datetime, then derive a Year-Quarter Period column from it
merged_data['dateTime'] = pd.to_datetime(merged_data['dateTime'])
merged_data['quarter'] = merged_data['dateTime'].dt.to_period('Q')

# Quarter (given as a string) and element to inspect
target_quarter = '2023-Q1'
target_element = 'Migrant'

# Keep only the rows that match both the quarter and the element
in_target_quarter = merged_data['quarter'] == target_quarter
is_target_element = merged_data['element'] == target_element
filtered_rows = merged_data[in_target_quarter & is_target_element]

if filtered_rows.empty:
    print(f"No entries found for quarter {target_quarter} and element {target_element}")
else:
    # Print one dict per matching row; a missing field falls back to ''.
    report_fields = (
        'videoLink', 'codes', 'reason', 'element',
        'need', 'dateTime', 'reasoning', 'quarter',
    )
    for idx, row in filtered_rows.iterrows():
        print({field: row.get(field, '') for field in report_fields})
