In [10]:
!pip install nltk vaderSentiment



In [11]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): the CSV loaded below uses a relative path rather than a path
# under /content/drive — confirm whether this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

import json
import glob
import os
import pandas as pd
import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Load the responses dataset. Later cells read the 'response', 'model', 'age',
# 'gender', 'religion' and 'politics' columns from this frame.
# NOTE(review): the path is relative to the kernel's working directory —
# confirm it resolves on a fresh runtime.
df = pd.read_csv("data/responses.csv")

In [13]:
import nltk
from nltk.tokenize import word_tokenize
# Fetch the 'punkt_tab' resource — presumably the tokenizer data that
# word_tokenize() needs; verify against the NLTK version in use.
nltk.download('punkt_tab')

def ttr(text):
    """Return the type-token ratio (unique tokens / total tokens) of ``text``.

    The text is lower-cased and split with ``word_tokenize`` before counting.

    Args:
        text: The response text. Non-string values (``None``, or the float NaN
            that pandas produces for empty CSV cells) are treated as empty.

    Returns:
        float: ``len(set(tokens)) / len(tokens)``, or ``0.0`` when there is
        nothing to tokenize.
    """
    # Missing CSV cells arrive as float NaN, which has no .lower(); treat any
    # non-string or blank value as an empty response instead of crashing.
    if not isinstance(text, str) or not text.strip():
        return 0.0

    tokens = word_tokenize(text.lower())
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [14]:
# Add a per-row 'ttr' column: the type-token ratio of each response.
df["ttr"] = df["response"].apply(ttr)

In [15]:
def compare_lexical_diversity(df, group_col, metric="ttr"):
    """Average ``metric`` within each group and rank the groups.

    Args:
        df: DataFrame containing ``group_col`` and ``metric``.
        group_col: Column (or columns) to group by.
        metric: Name of the column to average (default ``"ttr"``).

    Returns:
        pandas.Series: Mean of ``metric`` per group, highest mean first.

    Raises:
        ValueError: If ``metric`` is not a column of ``df``.
    """
    if metric in df.columns:
        per_group_mean = df.groupby(group_col)[metric].mean()
        return per_group_mean.sort_values(ascending=False)

    raise ValueError(f"Metric '{metric}' not found. Add it first.")

In [18]:
import pandas as pd
import numpy as np

def create_lexical_diversity_table(df, characteristic, metric="ttr"):
    """Build a model-by-category table of mean lexical-diversity scores.

    Args:
        df: DataFrame with a 'model' column, the ``characteristic`` column and
            the ``metric`` column.
        characteristic: Name of the column whose distinct values become the
            table's columns.
        metric: Name of the column to average (default ``"ttr"``).

    Returns:
        pandas.DataFrame: One row per model (sorted) with a 'Model' column, one
        column per category (sorted, labelled ``str(category)``) holding the
        mean of ``metric`` for that model/category pair, and an 'Average'
        column holding the mean of ``metric`` over all of that model's rows.
        Numeric columns are rounded to 4 decimals. A model with no rows for a
        category gets NaN in that cell.

    Raises:
        ValueError: If ``metric`` is not a column of ``df``.
    """
    if metric not in df.columns:
        raise ValueError(f"Metric '{metric}' not found. Run df['ttr'] = df['response'].apply(ttr) first.")

    # Drop missing labels before sorting: a None/NaN mixed with strings makes
    # sorted() raise TypeError, and a NaN label can never match `== category`,
    # so it would only ever produce an all-NaN row or column.
    models = sorted(df['model'].dropna().unique())
    categories = sorted(df[characteristic].dropna().unique())

    results = []

    for model in models:
        model_df = df[df['model'] == model]
        row = {'Model': model}

        # Mean of the metric for each category of this model
        for category in categories:
            in_category = model_df[characteristic] == category
            row[str(category)] = model_df[in_category][metric].mean()

        # Overall mean for the model (all of its rows, whatever their category)
        row['Average'] = model_df[metric].mean()

        results.append(row)

    results_df = pd.DataFrame(results)

    # Round numeric columns only; 'Model' is left untouched
    numeric_cols = results_df.select_dtypes(include=[np.number]).columns
    results_df[numeric_cols] = results_df[numeric_cols].round(4)

    return results_df


def export_all_lexical_diversity_tables_html(df, output_file='lexical_diversity_tables.html', metric='ttr'):
    """Write one HTML page with a lexical-diversity table per characteristic.

    For each of 'age', 'gender', 'religion' and 'politics', a table is built
    with create_lexical_diversity_table() and appended to a single styled
    HTML document.

    Args:
        df: DataFrame passed to create_lexical_diversity_table(); it must
            contain the four characteristic columns listed above.
        output_file: Path of the HTML file to write (overwritten if present).
        metric: Column to summarise; also shown in the page and table titles.

    Side effects:
        Writes ``output_file`` as UTF-8 and prints two status lines.
    """
    # Local import keeps the function self-contained.
    from html import escape

    characteristics = ['age', 'gender', 'religion', 'politics']

    # Metric display name; escaped once because it is reused in every heading.
    metric_name = escape(metric.upper() if metric == 'ttr' else metric.capitalize())

    # Fragments are collected and joined once at the end. The header colour of
    # each characteristic comes from the matching "<characteristic>-header"
    # CSS class below.
    parts = [f"""
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            body {{
                font-family: Arial, sans-serif;
                margin: 20px;
            }}
            table {{
                border-collapse: collapse;
                margin-bottom: 30px;
                width: auto;
            }}
            th, td {{
                border: 1px solid #ddd;
                padding: 8px 12px;
                text-align: center;
            }}
            tr:nth-child(even) {{
                background-color: #f9f9f9;
            }}
            tr:hover:not(.title-row) {{
                background-color: #f0f0f0;
            }}
            .model-col {{
                text-align: left;
                font-weight: bold;
            }}

            /* Title row styling */
            .title-row {{
                background-color: #2c3e50;
                color: white;
                font-size: 16px;
                font-weight: bold;
            }}

            /* Model header - always gray */
            .model-header {{
                background-color: #5d6d7e;
                color: white;
                font-weight: bold;
            }}

            /* Age headers - blue */
            .age-header {{
                background-color: #3498db;
                color: white;
                font-weight: bold;
            }}

            /* Gender headers - red */
            .gender-header {{
                background-color: #e74c3c;
                color: white;
                font-weight: bold;
            }}

            /* Religion headers - purple */
            .religion-header {{
                background-color: #9b59b6;
                color: white;
                font-weight: bold;
            }}

            /* Politics headers - orange */
            .politics-header {{
                background-color: #f39c12;
                color: white;
                font-weight: bold;
            }}
        </style>
    </head>
    <body>
        <h1>Lexical Diversity ({metric_name}) by Characteristic</h1>
    """]

    for char in characteristics:
        char_df = create_lexical_diversity_table(df, char, metric)

        # Start table
        parts.append('<table>\n')

        # Add title row that spans all columns
        num_cols = len(char_df.columns)
        parts.append(
            f'<thead>\n<tr class="title-row">\n<th colspan="{num_cols}">Lexical Diversity ({metric_name}) by {char.capitalize()}</th>\n</tr>\n'
        )

        # Add column headers. Labels come from the data, so escape them.
        parts.append('<tr>\n')
        for col in char_df.columns:
            label = escape(str(col))
            if col == 'Model':
                parts.append(f'<th class="model-header">{label}</th>\n')
            else:
                parts.append(f'<th class="{char}-header">{label}</th>\n')
        parts.append('</tr>\n</thead>\n')

        # Add data rows. Model names come from the data, so escape them too.
        parts.append('<tbody>\n')
        for _, row in char_df.iterrows():
            parts.append('<tr>\n')
            for col, val in row.items():
                cell = escape(str(val))
                if col == 'Model':
                    parts.append(f'<td class="model-col">{cell}</td>\n')
                else:
                    parts.append(f'<td>{cell}</td>\n')
            parts.append('</tr>\n')

        parts.append('</tbody>\n</table>\n\n')

    parts.append("""
    </body>
    </html>
    """)

    # Save to file. Explicit UTF-8 matches the <meta charset> declared above
    # and avoids platform-dependent default encodings.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    print(f"All tables saved to {output_file}")
    print("Open the file in a browser, then copy each table into Google Docs")


def export_individual_lexical_diversity_tables_html(df, metric='ttr'):
    """Write one standalone HTML file per characteristic.

    For each of 'age', 'gender', 'religion' and 'politics', a table is built
    with create_lexical_diversity_table() and saved as
    ``lexical_diversity_by_<characteristic>.html`` in the current working
    directory.

    Args:
        df: DataFrame passed to create_lexical_diversity_table(); it must
            contain the four characteristic columns listed above.
        metric: Column to summarise; also shown in each table title.

    Side effects:
        Writes four UTF-8 HTML files (overwriting existing ones) and prints
        one status line per file.
    """
    # Local import keeps the function self-contained.
    from html import escape

    characteristics = ['age', 'gender', 'religion', 'politics']

    # Header colour used for the data columns of each characteristic's table
    colors = {
        'age': '#3498db',       # Blue
        'gender': '#e74c3c',    # Red
        'religion': '#9b59b6',  # Purple
        'politics': '#f39c12'   # Orange
    }

    # Metric display name; escaped once because it is reused in every title.
    metric_name = escape(metric.upper() if metric == 'ttr' else metric.capitalize())

    for char in characteristics:
        char_df = create_lexical_diversity_table(df, char, metric)

        # Fragments are collected and joined once per file.
        parts = [f"""
        <html>
        <head>
            <meta charset="utf-8">
            <style>
                table {{
                    border-collapse: collapse;
                    margin: 20px;
                }}
                th, td {{
                    border: 1px solid #ddd;
                    padding: 8px 12px;
                    text-align: center;
                }}
                tr:nth-child(even) {{
                    background-color: #f9f9f9;
                }}
                tr:hover:not(.title-row) {{
                    background-color: #f0f0f0;
                }}
                .model-col {{
                    text-align: left;
                    font-weight: bold;
                }}
                .title-row {{
                    background-color: #2c3e50;
                    color: white;
                    font-size: 16px;
                    font-weight: bold;
                }}
                .model-header {{
                    background-color: #5d6d7e;
                    color: white;
                    font-weight: bold;
                }}
                .data-header {{
                    background-color: {colors[char]};
                    color: white;
                    font-weight: bold;
                }}
            </style>
        </head>
        <body>
        """]

        # Build table with title row
        num_cols = len(char_df.columns)
        parts.append('<table>\n<thead>\n')
        parts.append(
            f'<tr class="title-row">\n<th colspan="{num_cols}">Lexical Diversity ({metric_name}) by {char.capitalize()}</th>\n</tr>\n'
        )

        # Column headers. Labels come from the data, so escape them.
        parts.append('<tr>\n')
        for col in char_df.columns:
            label = escape(str(col))
            if col == 'Model':
                parts.append(f'<th class="model-header">{label}</th>\n')
            else:
                parts.append(f'<th class="data-header">{label}</th>\n')
        parts.append('</tr>\n</thead>\n')

        # Data rows. Model names come from the data, so escape them too.
        parts.append('<tbody>\n')
        for _, row in char_df.iterrows():
            parts.append('<tr>\n')
            for col, val in row.items():
                cell = escape(str(val))
                if col == 'Model':
                    parts.append(f'<td class="model-col">{cell}</td>\n')
                else:
                    parts.append(f'<td>{cell}</td>\n')
            parts.append('</tr>\n')

        parts.append('</tbody>\n</table>\n</body>\n</html>')

        # Explicit UTF-8 matches the <meta charset> declared above and avoids
        # platform-dependent default encodings.
        filename = f'lexical_diversity_by_{char}.html'
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(''.join(parts))

        print(f"Table saved to {filename}")


# Same assignment as the earlier cell; repeating it here keeps the 'ttr'
# column present even if that cell was skipped.
df["ttr"] = df["response"].apply(ttr)

# Write the combined page, then one HTML file per characteristic
# (lexical_diversity_by_<characteristic>.html).
export_all_lexical_diversity_tables_html(df, 'lexical_diversity_all_characteristics.html', metric='ttr')
export_individual_lexical_diversity_tables_html(df, metric='ttr')

All tables saved to lexical_diversity_all_characteristics.html
Open the file in a browser, then copy each table into Google Docs
Table saved to lexical_diversity_by_age.html
Table saved to lexical_diversity_by_gender.html
Table saved to lexical_diversity_by_religion.html
Table saved to lexical_diversity_by_politics.html
