# Social Listening Data Export

This notebook connects to BigQuery and exports social media listening data to CSV.

## 1. Environment Setup

In [None]:
# Import required libraries
import os
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
from datetime import datetime
from pathlib import Path

# Import helper functions from main.py.
# os.path.abspath('') resolves to the current working directory, so the
# expression below appends the *parent* of the working directory to sys.path.
# NOTE(review): this presumably expects the notebook to be run from a folder
# one level below the directory that holds main.py — confirm against the repo layout.
import sys
sys.path.append(os.path.dirname(os.path.abspath('')))
from main import get_bigquery_client, query_to_csv

In [ ]:
# Using Google Cloud CLI authentication (Application Default Credentials, ADC).
# Make sure you've run: gcloud auth application-default login
#
# This cell is best-effort: failures are reported but not raised, because the
# cells that follow offer a service-account key as an alternative way to build
# `client`.

project_id = os.getenv('GOOGLE_CLOUD_PROJECT', 'sinnia-gnp')  # Default to sinnia-gnp if not set

# Check if Application Default Credentials are available
try:
    from google.auth import default
    credentials, project = default()
    print("✓ Using Application Default Credentials")
    if project:
        print(f"✓ Default project from gcloud: {project}")
        # project_id is only falsy when GOOGLE_CLOUD_PROJECT is set but empty;
        # in that case fall back to the project reported by ADC.
        if not project_id:
            project_id = project
except Exception as e:
    print("❌ Application Default Credentials not found")
    # Surface the underlying cause (missing package, expired login, ...) so the
    # generic hint below is not the only information available.
    print(f"   Details: {type(e).__name__}: {e}")
    print("   Please run: gcloud auth application-default login")
    print("   Or install gcloud CLI from: https://cloud.google.com/sdk/docs/install")

# Initialize BigQuery client with Application Default Credentials
try:
    # No need to pass credentials when using ADC
    client = bigquery.Client(project=project_id)
    print(f"✓ Connected to BigQuery project: {client.project}")
except Exception as e:
    print(f"❌ Error connecting to BigQuery: {e}")
    print("   Please ensure you have run: gcloud auth application-default login")
    print("   And that you have access to the project")

In [None]:
def credentials_file_available(path):
    """Return True only when ``path`` names an existing regular file.

    Args:
        path: Filesystem path to a service-account key, or None / empty string
            when no path has been configured.

    Returns:
        bool: False for None, an empty string, a missing path, or a directory;
        True for an existing file.
    """
    # isfile (rather than exists) so a directory is not mistaken for a key file.
    return bool(path) and os.path.isfile(path)


# Option 1: Use environment variables
credentials_path = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
project_id = os.getenv('GOOGLE_CLOUD_PROJECT', 'sinnia-gnp')  # Default to sinnia-gnp if not set

# Option 2: Set paths directly (uncomment and update if not using .env)
# credentials_path = 'path/to/your/service-account-key.json'
# project_id = 'sinnia-gnp'

# Verify the credentials path points at an actual file
if credentials_file_available(credentials_path):
    print(f"✓ Credentials file found: {credentials_path}")
else:
    print("❌ Credentials file not found. Please update the path.")
    print("   Set GOOGLE_APPLICATION_CREDENTIALS in .env or update credentials_path above")

In [None]:
# Build the BigQuery client through the project helper, using the credentials
# path and project id chosen in the previous cell. Failures are reported, not raised.
try:
    client = get_bigquery_client(
        credentials_path=credentials_path,
        project_id=project_id,
    )
    print(f"✓ Connected to BigQuery project: {client.project}")
except Exception as err:
    print(f"❌ Error connecting to BigQuery: {err}")
    print("   Please check your credentials and project ID")

## 3. Define and Run Query

In [None]:
# Define the query.
#
# What it selects:
#   - One row per post from the production listening table, restricted to
#     topic_id 238.
#   - A half-open date window: created_at from 2025-05-12 (inclusive) up to,
#     but not including, 2025-05-14.
#   - created_at is formatted as 'YYYY-MM-DD HH24:MI' and the metric columns
#     are cast to STRING; several output columns get Spanish aliases
#     (creado, seguidores, comentarios, compartidos, enganches, vistas).
#   - Posts whose text mentions the listed venue names ("Estadio GNP",
#     "Auditorio GNP", "el GNP", "Foro GNP") are excluded.
#
# NOTE(review): CONTAINS_SUBSTR is a plain substring test, so "el GNP" also
# matches text containing "del GNP" — confirm that exclusion is intended.
# NOTE(review): the date literals are compared against created_at directly;
# which time zone that implies depends on the column type — confirm.
query = """
SELECT 
    platform, 
    CAST(created_at AS STRING FORMAT 'YYYY-MM-DD HH24:MI') AS creado, 
    user_id, 
    user, 
    CAST(followers AS STRING) AS seguidores,
    text, 
    CAST(likes_reactions AS STRING) AS likes, 
    CAST(comments AS STRING) AS comentarios, 
    CAST(shares_retweets AS STRING) AS compartidos,
    CAST(engagements AS STRING) AS enganches,
    CAST(views AS STRING) AS vistas
FROM `sinnia-gnp.social_dashboard_table.listening_table_prd` 
WHERE created_at >= '2025-05-12' 
    AND created_at < '2025-05-14'
    AND topic_id = 238
    AND NOT (
        CONTAINS_SUBSTR(text, "Estadio GNP") 
        OR CONTAINS_SUBSTR(text, "Auditorio GNP") 
        OR CONTAINS_SUBSTR(text, "el GNP") 
        OR CONTAINS_SUBSTR(text, "Foro GNP")
    )
"""

# Confirmation only; the query is not executed in this cell.
print("Query defined successfully")

In [None]:
# Run query and get results as DataFrame
print("Running query...")
try:
    df = client.query(query).to_dataframe()
    print(f"✓ Query completed successfully")
    print(f"✓ Retrieved {len(df):,} rows")
    print(f"\nColumns: {', '.join(df.columns)}")
except Exception as e:
    print(f"❌ Error running query: {e}")
    # Re-raise so execution stops at this cell. Swallowing the error would let
    # the preview/export cells run against a missing `df`, or against a stale
    # `df` left in the kernel by an earlier successful run.
    raise

## 4. Preview Data

In [None]:
# Show the top of the result set; the bare last expression gets rich display
print("\nFirst 5 rows of data:")
df.head(n=5)

In [None]:
# Quick profile of the result: size, per-platform counts, and the span of
# the formatted timestamp column
row_count = len(df)
print(f"Total rows: {row_count:,}")

print("\nPlatform distribution:")
platform_counts = df['platform'].value_counts()
print(platform_counts)

created = df['creado']
print(f"\nDate range: {created.min()} to {created.max()}")

## 5. Export to CSV

In [None]:
# Ensure the output folder exists before writing into it
output_dir = 'data'
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Name the file after the current local time (to the second)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = 'social_listening_topic238_' + timestamp + '.csv'
filepath = os.path.join(output_dir, filename)

# Write the DataFrame as UTF-8 CSV, leaving out the index column
df.to_csv(filepath, index=False, encoding='utf-8')
print(f"✓ Data exported to: {filepath}")
size_mb = Path(filepath).stat().st_size / 1024 / 1024
print(f"✓ File size: {size_mb:.2f} MB")

## 6. Alternative: Direct Query to CSV (for large datasets)

In [None]:
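# Use this method for very large datasets to avoid memory issues.
# Uncomment the lines below to use it (run the export cell above first,
# since the filename reuses its `timestamp` variable):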

# large_filename = f'social_listening_topic238_large_{timestamp}.csv'
# large_filepath = os.path.join('data', large_filename)

# # Export in chunks of 10,000 rows
# query_to_csv(client, query, large_filepath, chunk_size=10000)

## Summary

Once every cell above has run without errors, the social listening data is exported to a timestamped CSV file in the `data/` directory.

### Next Steps:
1. Check the exported CSV file in the `data/` folder
2. Modify the date range in the query to export different time periods
3. Adjust the topic_id to query different topics
4. Add additional filters or columns as needed