In [3]:
!pip install transformers nltk vaderSentiment

Collecting vaderSentiment
  Using cached vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Using cached vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [4]:
# Colab-only setup: mount Google Drive into the runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import glob
import html
import json
import os

import numpy as np
import pandas as pd

Mounted at /content/drive


In [5]:
# Load the responses dataset; the path is relative to the current working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which usually means
# the CSV was written together with its index. Consider index_col=0 if nothing
# downstream needs that column.
df = pd.read_csv("data/responses.csv")
# Preview the first rows to confirm the expected columns were loaded.
df.head()

Unnamed: 0,age,gender,politics,religion,prompt,response,model,temperature,response_filled,is_blank
0,15,female,democrat,atheist,art,Art profoundly shapes society and culture by r...,grok,1.5,Art profoundly shapes society and culture by r...,False
1,50,non-binary,republican,atheist,art,Art profoundly shapes society and culture by r...,grok,1.5,Art profoundly shapes society and culture by r...,False
2,50,female,republican,atheist,art,Art profoundly influences society and culture ...,grok,1.5,Art profoundly influences society and culture ...,False
3,25,male,democrat,atheist,art,Art profoundly influences society and culture ...,grok,1.5,Art profoundly influences society and culture ...,False
4,15,non-binary,republican,atheist,art,Art profoundly influences society and culture ...,grok,1.5,Art profoundly influences society and culture ...,False


## FACTUAL VS. OPINIONATED LANGUAGE

In [6]:
def detect_factual_language(text):
    """Count occurrences of factual-language marker words in ``text``.

    Matching is case-insensitive and substring-based (``str.count``), so a
    marker is also counted when it occurs inside a longer word, e.g. ``'fact'``
    inside ``'satisfaction'``.

    Args:
        text: The response text. Non-string values (``None``, or the float NaN
            that pandas uses for missing text) are treated as empty.

    Returns:
        int: Total number of non-overlapping marker occurrences, or 0 for
        empty or non-string input.
    """
    # Missing values have no .lower(); count them as 0 instead of raising
    # AttributeError halfway through a Series.apply.
    if not isinstance(text, str):
        return 0

    factual = [
        'fact', 'data', 'evidence', 'research', 'study', 'studies', 'statistics',
        'proven', 'demonstrated', 'shows', 'indicates', 'confirmed', 'measured',
        'observed', 'documented', 'verified', 'recorded', 'analyzed', 'reported'
    ]
    text_lower = text.lower()
    return sum(text_lower.count(word) for word in factual)

def detect_opinion_language(text):
    """Count occurrences of opinion/hedging marker words in ``text``.

    Matching is case-insensitive and substring-based (``str.count``), so a
    marker is also counted when it occurs inside a longer word, e.g. ``'view'``
    inside ``'review'``.

    Args:
        text: The response text. Non-string values (``None``, or the float NaN
            that pandas uses for missing text) are treated as empty.

    Returns:
        int: Total number of non-overlapping marker occurrences, or 0 for
        empty or non-string input.
    """
    # Missing values have no .lower(); count them as 0 instead of raising
    # AttributeError halfway through a Series.apply.
    if not isinstance(text, str):
        return 0

    # Each marker must appear exactly once in this list: a duplicated entry
    # would count every occurrence of that word twice.
    opinion = [
        'think', 'believe', 'feel', 'opinion', 'view', 'perspective',
        'seems', 'appears', 'suggest', 'might', 'consider', 'assume', 'guess',
        'estimate', 'interpret', 'likely', 'argue'
    ]
    text_lower = text.lower()
    return sum(text_lower.count(word) for word in opinion)


In [8]:
def create_factual_opinion_table(df, characteristic, metric="factual_prop"):
    """Build a model-by-category table of mean ``metric`` values.

    Args:
        df: DataFrame with a ``'model'`` column, the ``characteristic`` column
            and the ``metric`` column.
        characteristic: Name of the column whose distinct values become the
            table's category columns (e.g. ``'age'``).
        metric: Name of the numeric column to average.

    Returns:
        pandas.DataFrame: One row per model (sorted), with a ``'Model'``
        column, one column per category (sorted, named ``str(category)``) and
        an ``'Average'`` column. Numeric values are rounded to 4 decimals. A
        category with no rows for a model is NaN.

        ``'Average'`` is the mean of ``metric`` over all of the model's rows,
        including rows whose characteristic is missing; it is not the mean of
        the per-category means.

    Raises:
        ValueError: If ``metric`` is not a column of ``df``.
    """
    if metric not in df.columns:
        raise ValueError(f"Metric '{metric}' not found. Compute factual/opinion metrics first.")

    # Drop missing values before sorting: sorted() raises TypeError when a
    # column mixes strings with NaN/None, and a NaN category could never match
    # the equality filter below anyway.
    models = sorted(df['model'].dropna().unique())
    categories = sorted(df[characteristic].dropna().unique())

    results = []

    for model in models:
        model_df = df[df['model'] == model]
        row = {'Model': model}

        # Mean of the metric within each category for this model
        for category in categories:
            value = model_df[model_df[characteristic] == category][metric].mean()
            row[str(category)] = value

        # Mean over all of this model's rows (weighted by category size)
        row['Average'] = model_df[metric].mean()

        results.append(row)

    results_df = pd.DataFrame(results)

    # Round numeric columns
    numeric_cols = results_df.select_dtypes(include=[np.number]).columns
    results_df[numeric_cols] = results_df[numeric_cols].round(4)

    return results_df


def export_all_factual_opinion_tables_html(df, output_file='factual_opinion_tables.html',
                                           metric='factual_prop'):
    """Write a single HTML page with one metric table per characteristic.

    For each of ``'age'``, ``'gender'``, ``'religion'`` and ``'politics'`` a
    table is built with ``create_factual_opinion_table`` and rendered with a
    title row, a coloured header row and one body row per model.

    Args:
        df: DataFrame passed through to ``create_factual_opinion_table``; it
            must contain ``'model'``, the four characteristic columns and
            ``metric``.
        output_file: Path of the HTML file to write (overwritten if present).
        metric: Name of the metric column to tabulate. Known names are shown
            with a friendly title; any other name is shown as-is.

    Raises:
        ValueError: Propagated from ``create_factual_opinion_table`` when
            ``metric`` is not a column of ``df``.

    Side effects:
        Writes ``output_file`` as UTF-8 and prints two status lines.
    """
    characteristics = ['age', 'gender', 'religion', 'politics']

    # Metric display name
    metric_names = {
        'factual_count': 'Factual Word Count',
        'opinion_count': 'Opinion Word Count',
        'factual_prop': 'Factual Language Proportion',
        'opinion_prop': 'Opinion Language Proportion'
    }
    # Escaped once here because the name is interpolated into several HTML text nodes.
    metric_name = html.escape(str(metric_names.get(metric, metric)), quote=False)

    # Collect fragments and join once at the end instead of repeatedly
    # concatenating onto an ever-growing string.
    parts = [f"""
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            body {{
                font-family: Arial, sans-serif;
                margin: 20px;
            }}
            table {{
                border-collapse: collapse;
                margin-bottom: 30px;
                width: auto;
            }}
            th, td {{
                border: 1px solid #ddd;
                padding: 8px 12px;
                text-align: center;
            }}
            tr:nth-child(even) {{
                background-color: #f9f9f9;
            }}
            tr:hover:not(.title-row) {{
                background-color: #f0f0f0;
            }}
            .model-col {{
                text-align: left;
                font-weight: bold;
            }}

            /* Title row styling */
            .title-row {{
                background-color: #2c3e50;
                color: white;
                font-size: 16px;
                font-weight: bold;
            }}

            /* Model header - always gray */
            .model-header {{
                background-color: #5d6d7e;
                color: white;
                font-weight: bold;
            }}

            /* Age headers - blue */
            .age-header {{
                background-color: #3498db;
                color: white;
                font-weight: bold;
            }}

            /* Gender headers - red */
            .gender-header {{
                background-color: #e74c3c;
                color: white;
                font-weight: bold;
            }}

            /* Religion headers - purple */
            .religion-header {{
                background-color: #9b59b6;
                color: white;
                font-weight: bold;
            }}

            /* Politics headers - orange */
            .politics-header {{
                background-color: #f39c12;
                color: white;
                font-weight: bold;
            }}
        </style>
    </head>
    <body>
        <h1>{metric_name} by Characteristic</h1>
    """]

    for char in characteristics:
        char_df = create_factual_opinion_table(df, char, metric)

        # Start table
        parts.append('<table>\n')

        # Add title row that spans all columns
        num_cols = len(char_df.columns)
        parts.append(
            f'<thead>\n<tr class="title-row">\n<th colspan="{num_cols}">'
            f'{metric_name} by {char.capitalize()}</th>\n</tr>\n'
        )

        # Add column headers; the CSS class selects the per-characteristic colour
        parts.append('<tr>\n')
        for col in char_df.columns:
            header_class = 'model-header' if col == 'Model' else f'{char}-header'
            # Column labels come from the data, so escape them before embedding.
            label = html.escape(str(col), quote=False)
            parts.append(f'<th class="{header_class}">{label}</th>\n')
        parts.append('</tr>\n</thead>\n')

        # Add data rows
        parts.append('<tbody>\n')
        for _, row in char_df.iterrows():
            parts.append('<tr>\n')
            for col, val in row.items():
                # Model names come from the data, so escape cell text as well.
                cell = html.escape(str(val), quote=False)
                if col == 'Model':
                    parts.append(f'<td class="model-col">{cell}</td>\n')
                else:
                    parts.append(f'<td>{cell}</td>\n')
            parts.append('</tr>\n')

        parts.append('</tbody>\n</table>\n\n')

    parts.append("""
    </body>
    </html>
    """)

    # Explicit UTF-8 (matching the <meta charset>) so the result does not
    # depend on the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    print(f"All tables saved to {output_file}")
    print("Open the file in a browser, then copy each table into Google Docs")


def export_individual_factual_opinion_tables_html(df, metric='factual_prop'):
    """Write one standalone HTML file per characteristic for ``metric``.

    For each of ``'age'``, ``'gender'``, ``'religion'`` and ``'politics'`` a
    table is built with ``create_factual_opinion_table`` and saved to
    ``'{metric}_by_{characteristic}.html'`` relative to the current working
    directory.

    Args:
        df: DataFrame passed through to ``create_factual_opinion_table``; it
            must contain ``'model'``, the four characteristic columns and
            ``metric``.
        metric: Name of the metric column to tabulate. Known names are shown
            with a friendly title; any other name is shown as-is.

    Raises:
        ValueError: Propagated from ``create_factual_opinion_table`` when
            ``metric`` is not a column of ``df``.

    Side effects:
        Writes four UTF-8 HTML files and prints one status line per file.
    """
    characteristics = ['age', 'gender', 'religion', 'politics']

    # Header background colour for each characteristic's data columns
    colors = {
        'age': '#3498db',       # Blue
        'gender': '#e74c3c',    # Red
        'religion': '#9b59b6',  # Purple
        'politics': '#f39c12'   # Orange
    }

    metric_names = {
        'factual_count': 'Factual Word Count',
        'opinion_count': 'Opinion Word Count',
        'factual_prop': 'Factual Language Proportion',
        'opinion_prop': 'Opinion Language Proportion'
    }
    # Escaped once here because the name is interpolated into HTML text.
    metric_name = html.escape(str(metric_names.get(metric, metric)), quote=False)

    for char in characteristics:
        char_df = create_factual_opinion_table(df, char, metric)

        # Collect fragments and join once instead of repeated concatenation.
        parts = [f"""
        <html>
        <head>
            <meta charset="utf-8">
            <style>
                table {{
                    border-collapse: collapse;
                    margin: 20px;
                }}
                th, td {{
                    border: 1px solid #ddd;
                    padding: 8px 12px;
                    text-align: center;
                }}
                tr:nth-child(even) {{
                    background-color: #f9f9f9;
                }}
                tr:hover:not(.title-row) {{
                    background-color: #f0f0f0;
                }}
                .model-col {{
                    text-align: left;
                    font-weight: bold;
                }}
                .title-row {{
                    background-color: #2c3e50;
                    color: white;
                    font-size: 16px;
                    font-weight: bold;
                }}
                .model-header {{
                    background-color: #5d6d7e;
                    color: white;
                    font-weight: bold;
                }}
                .data-header {{
                    background-color: {colors[char]};
                    color: white;
                    font-weight: bold;
                }}
            </style>
        </head>
        <body>
        """]

        # Build table with title row
        num_cols = len(char_df.columns)
        parts.append('<table>\n<thead>\n')
        parts.append(
            f'<tr class="title-row">\n<th colspan="{num_cols}">'
            f'{metric_name} by {char.capitalize()}</th>\n</tr>\n'
        )

        # Column headers
        parts.append('<tr>\n')
        for col in char_df.columns:
            header_class = 'model-header' if col == 'Model' else 'data-header'
            # Column labels come from the data, so escape them before embedding.
            label = html.escape(str(col), quote=False)
            parts.append(f'<th class="{header_class}">{label}</th>\n')
        parts.append('</tr>\n</thead>\n')

        # Data rows
        parts.append('<tbody>\n')
        for _, row in char_df.iterrows():
            parts.append('<tr>\n')
            for col, val in row.items():
                # Model names come from the data, so escape cell text as well.
                cell = html.escape(str(val), quote=False)
                if col == 'Model':
                    parts.append(f'<td class="model-col">{cell}</td>\n')
                else:
                    parts.append(f'<td>{cell}</td>\n')
            parts.append('</tr>\n')

        parts.append('</tbody>\n</table>\n</body>\n</html>')

        filename = f'{metric}_by_{char}.html'
        # Explicit UTF-8 (matching the <meta charset>) so the result does not
        # depend on the platform's default encoding.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(''.join(parts))

        print(f"Table saved to {filename}")


# Count marker-word hits in every response, one column per detector.
df["factual_count"] = df["response"].map(detect_factual_language)
df["opinion_count"] = df["response"].map(detect_opinion_language)
# Combined number of hits; this is the denominator for both proportions.
df["total_count"] = df["factual_count"].add(df["opinion_count"])
# Rows with no hits at all get a denominator of 1, so both proportions come out
# as 0 instead of NaN from a 0/0 division.
df["factual_prop"] = df["factual_count"].div(df["total_count"].mask(df["total_count"].eq(0), 1))
df["opinion_prop"] = df["opinion_count"].div(df["total_count"].mask(df["total_count"].eq(0), 1))


# Write one combined HTML report (all four characteristics) per proportion metric.
export_all_factual_opinion_tables_html(
    df,
    output_file='factual_prop_all_characteristics.html',
    metric='factual_prop',
)
export_all_factual_opinion_tables_html(
    df,
    output_file='opinion_prop_all_characteristics.html',
    metric='opinion_prop',
)

All tables saved to factual_prop_all_characteristics.html
Open the file in a browser, then copy each table into Google Docs
All tables saved to opinion_prop_all_characteristics.html
Open the file in a browser, then copy each table into Google Docs
