In [42]:
import pandas as pd
import json
import numpy as np
from pyprojroot import here

from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pickle

## Process and format data for general policy analysis

Includes data for acknowledgement of anthropogenic climate change, climate concern, and climate policy.

In [43]:

# Read the annotation JSON file into a DataFrame
df = pd.read_json(here("data/recoded_transcripts/main-at-2024-05-28-recoded.json"))

# Load the ID table; its 'internal_id' / 'project_id' columns are used below
# to filter the annotations and to attach a project_id to each of them
all_inner_IDs=pd.read_csv(here("data/recoded_transcripts/main_project_IDs.csv"))

# Load JSON transcript data.
# The encoding is given explicitly: JSON text is UTF-8, while open() would otherwise
# fall back to the platform default (e.g. cp1252 on Windows) and could fail or
# mis-decode non-ASCII characters in the transcripts.
with open(here("data/transcripts/news_segments_2020-04_2021-04_unique_IDs.json"), encoding="utf-8") as file:
    data_list = json.load(file)

# Each item in the list is expected to carry its payload under a 'data' key; extract those
extracted_data = [item['data'] for item in data_list]

# Normalize the data to flatten the nested structure
transcript_df = pd.json_normalize(extracted_data)

# Turn every meta_info entry into its own set of columns ...
expanded_columns = df['meta_info'].apply(pd.Series)
# ... and put those columns next to the remaining ones, in place of 'meta_info'
df = pd.concat(
    [df.drop(columns='meta_info'), expanded_columns],
    axis='columns',
)
# Keep only the annotations whose internal_id is listed in all_inner_IDs
df = df.loc[df['internal_id'].isin(all_inner_IDs["internal_id"])]

# Distinct values of the 'annotator' column after filtering
annotator_list = df['annotator'].unique()
# Lookup from a numeric ID to a two-letter code
# (presumably annotator IDs and annotator initials -- confirm against the annotation export)
annotator_dict = {
    2: 'JE', 3: 'EK', 10: 'VR',
    4: 'CA', 11: 'LU', 13: 'PI',
    12: 'GE', 14: 'TR', 15: 'KA',
}

## Assemble the analysis dataframe
# Attach the project_id that belongs to each internal_id
df = df.merge(
    all_inner_IDs[['internal_id', 'project_id']],
    how='left',
    on='internal_id',
)
# The media outlet is the first whitespace-separated token of 'source'
df['media_outlet'] = df['source'].str.split().str[0]
# Columns holding the answers; used in this order as Q1, Q2 and Q3 when pivoting
question_labels = ['climate_change', 'attitude', 'policy']
# Parse the 'date' strings (full month name, day, year, weekday name) into datetimes
df['date'] = pd.to_datetime(df['date'], format='%B %d, %Y %A')

# Export the combined table
df.to_csv("output/all_Q.csv", index=False)


In [44]:

## One wide table per question: a row per annotator, a column per
## (project_id, media_outlet) pair, cells hold the value of that question's column.
# NOTE(review): only the Q1 table is passed through .fillna(value=np.nan);
# Q2 and Q3 are not -- confirm whether this difference is intended.
Q1_df = pd.pivot(
    df,
    index='annotator',
    columns=['project_id','media_outlet'],
    values=question_labels[0],
).fillna(value=np.nan)
Q2_df = pd.pivot(
    df,
    index='annotator',
    columns=['project_id','media_outlet'],
    values=question_labels[1],
)
Q3_df = pd.pivot(
    df,
    index='annotator',
    columns=['project_id','media_outlet'],
    values=question_labels[2],
)

# Write the three tables to the output directory as CSV
Q1_df.to_csv("output/Q1_df.csv")
Q2_df.to_csv("output/Q2_df.csv")
Q3_df.to_csv("output/Q3_df.csv")

  Q1_df = df.pivot(index='annotator', columns=['project_id','media_outlet'], values=question_labels[0]).fillna(value=np.nan)


In [45]:

## answer options for each question
Q1_choices = [
    "Acknowledges",
    "Neutral",
    "Denies",
    "Debate",
    "Unclear",
]
Q2_choices = [
    "Expresses climate concern",
    "Neutral",
    "Expresses opposition to climate concern",
    "Debate",
    "Unclear",
]
Q3_choices = [
    "Supports",
    "Neutral",
    "Opposes",
    "Debate",
    "Unclear",
    "Does not mention",
]


# shortened labels for display, in the same order as the full lists above
short_Q2_choices = ['Concern', 'Neutral', 'Opposition', 'Debate', 'Unclear']
short_Q3_choices = [
    "Supports",
    "Neutral",
    "Opposes",
    "Debate",
    "Unclear",
    "No mention",
]


# Pickle every choice list into its own file in the output directory
for _pickle_path, _choice_list in (
    ("output/Q1_choices.pkl", Q1_choices),
    ("output/Q2_choices.pkl", Q2_choices),
    ("output/Q3_choices.pkl", Q3_choices),
    ("output/short_Q2_choices.pkl", short_Q2_choices),
    ("output/short_Q3_choices.pkl", short_Q3_choices),
):
    with open(_pickle_path, "wb") as f:
        pickle.dump(_choice_list, f)

In [46]:
## count responses 
def count_responses(Q_df, weight=0.5):
    """Tally the non-missing responses found anywhere in a table of annotations.

    Parameters
    ----------
    Q_df : pandas.DataFrame
        Table whose cells hold response labels. Missing cells (NaN / None)
        are ignored.
    weight : float, default 0.5
        Contribution of a single annotation to ``Value``. The default of 0.5
        is used because there are two annotators per response, so two matching
        annotations add up to 1. Pass 1.0 to obtain raw annotation counts.

    Returns
    -------
    pandas.DataFrame
        One row per distinct response, with the columns ``Response``,
        ``Value`` (number of occurrences times ``weight``) and ``Proportion``
        (``Value`` divided by the sum of all ``Value`` entries). Rows follow
        the order produced by ``value_counts`` (most frequent first).
    """
    # Flatten the DataFrame so that every cell counts as one response
    responses = Q_df.values.flatten()
    # Filter out missing values
    responses = responses[~pd.isnull(responses)]

    # Count occurrences of each unique response and apply the per-annotation weight
    response_values = pd.Series(responses).value_counts() * weight

    # Convert to a DataFrame; the columns are named explicitly because the
    # names produced by reset_index() differ between pandas versions
    response_values_df = response_values.reset_index()
    response_values_df.columns = ['Response', 'Value']

    # Share of each response in the weighted total
    total_value = response_values_df['Value'].sum()
    response_values_df['Proportion'] = response_values_df['Value'] / total_value

    return response_values_df

In [47]:
# Overall response tallies per question, each written to its own CSV
response_count_Q1 = count_responses(Q1_df)
response_count_Q1.to_csv("output/response_count_Q1.csv", index=False)

response_count_Q2 = count_responses(Q2_df)
response_count_Q2.to_csv("output/response_count_Q2.csv", index=False)

response_count_Q3 = count_responses(Q3_df)
response_count_Q3.to_csv("output/response_count_Q3.csv", index=False)

In [48]:
# Distinct media outlets found in the annotations
media_outlets = df['media_outlet'].unique()

# For every question, map each outlet to the response tally of that outlet's columns
Q1_outlet_dfs = {}
Q2_outlet_dfs = {}
Q3_outlet_dfs = {}

for outlet in media_outlets:
    # Take the cross-section of each question table for the current outlet
    Q1_outlet_df = Q1_df.xs(outlet, level='media_outlet', axis=1)
    Q2_outlet_df = Q2_df.xs(outlet, level='media_outlet', axis=1)
    Q3_outlet_df = Q3_df.xs(outlet, level='media_outlet', axis=1)

    # Tally the responses of that cross-section
    Q1_outlet_dfs[outlet] = count_responses(Q1_outlet_df)
    Q2_outlet_dfs[outlet] = count_responses(Q2_outlet_df)
    Q3_outlet_dfs[outlet] = count_responses(Q3_outlet_df)

In [49]:

# Turn each per-outlet tally into a list of row dictionaries so it can be serialised as JSON
Q1_outlet_json = {
    outlet_name: outlet_table.to_dict(orient='records')
    for outlet_name, outlet_table in Q1_outlet_dfs.items()
}
Q2_outlet_json = {
    outlet_name: outlet_table.to_dict(orient='records')
    for outlet_name, outlet_table in Q2_outlet_dfs.items()
}
Q3_outlet_json = {
    outlet_name: outlet_table.to_dict(orient='records')
    for outlet_name, outlet_table in Q3_outlet_dfs.items()
}


# Write one JSON file per question
for _json_path, _outlet_records in (
    ("output/Q1_outlet_dfs.json", Q1_outlet_json),
    ("output/Q2_outlet_dfs.json", Q2_outlet_json),
    ("output/Q3_outlet_dfs.json", Q3_outlet_json),
):
    with open(_json_path, "w") as f:
        json.dump(_outlet_records, f, indent=4)



In [51]:

# Response categories kept for each question; the list order fixes the slot order of the counts

Q1_choices_to_use = [
    "Acknowledges",
    "Neutral",
    "Denies",
    "Debate",
]
Q2_choices_to_use = [
    "Expresses climate concern",
    "Neutral",
    "Expresses opposition to climate concern",
    "Debate",
]
Q3_choices_to_use = [
    "Supports",
    "Neutral",
    "Opposes",
    "Debate",
    "Does not mention",
]

# Start each question with its own empty counts dictionary
Q1_counts, Q2_counts, Q3_counts = {}, {}, {}


# Function to extract counts
def extract_counts(outlet_json, choices_to_use, Q3=False):
    """Arrange each outlet's response values in the order of ``choices_to_use``.

    Parameters
    ----------
    outlet_json : dict
        Maps an outlet name to a list of records; every record is a dict with
        at least the keys ``"Response"`` and ``"Value"``.
    choices_to_use : list
        Response labels to keep. The position of a label in this list is the
        position of its value in the result.
    Q3 : bool, default False
        Kept only so that existing calls keep working. It used to choose
        between a 4-slot and a 5-slot result; the result length now always
        equals ``len(choices_to_use)``, so the flag has no effect.

    Returns
    -------
    dict
        Maps each outlet to a list with one entry per label in
        ``choices_to_use``: the record's ``"Value"`` for that label, or 0 when
        the outlet has no such record. Records whose label is not in
        ``choices_to_use`` are skipped.
    """
    counts_dict = {}
    for outlet, responses in outlet_json.items():
        # Size the result from the choices themselves instead of a hard-coded
        # 4/5, which raised IndexError (or left a spurious trailing slot)
        # whenever the flag and the number of choices disagreed.
        counts = [0] * len(choices_to_use)
        for resp in responses:
            response_type = resp["Response"]
            value = resp["Value"]
            if response_type in choices_to_use:
                idx = choices_to_use.index(response_type)
                counts[idx] = value
        counts_dict[outlet] = counts
    return counts_dict

# Per-outlet values for every question, ordered like the matching *_choices_to_use list
Q1_counts = extract_counts(outlet_json=Q1_outlet_json, choices_to_use=Q1_choices_to_use)
Q2_counts = extract_counts(outlet_json=Q2_outlet_json, choices_to_use=Q2_choices_to_use)
Q3_counts = extract_counts(outlet_json=Q3_outlet_json, choices_to_use=Q3_choices_to_use, Q3=True)

# Write each counts dictionary to its own JSON file
for _json_path, _counts_by_outlet in (
    ("output/Q1_counts.json", Q1_counts),
    ("output/Q2_counts.json", Q2_counts),
    ("output/Q3_counts.json", Q3_counts),
):
    with open(_json_path, "w") as f:
        json.dump(_counts_by_outlet, f, indent=4)
