# Notebook 2: Target creation

# (Ethical) News Engagement Prediction Model

1. Business objective: 
    - Predict individual news engagement patterns and information
    consumption preferences to enable:
        - Personalized media literacy interventions
        - Improved news content accessibility
        - Evidence-based civic education programs
        - Responsible platform design decisions

2. Technical approach:
    - Use demographic and media consumption data to predict internal engagement traits (political interest, news interest, avoidance behaviors, fatigue levels)

3. Ethical framework: 
    - Promote information literacy and democratic participation
    - Respect user privacy and autonomy
    - Avoid exploitation or manipulation
    - Support evidence-based interventions

### _Business value proposition_

- For media companies:
    - Understand audience engagement patterns
    - Optimize content delivery strategies 
    - Improve user experience design 
    - Increase meaningful engagement

- For educational organizations
    - Target media literacy programs effectively
    - Personalize civic education content
    - Measure intervention success
    - Allocate resources efficiently

- For platforms and tech companies
    - Design responsible recommendation systems
    - Promote authoritative information sources
    - Support informed user decision-making 
    - Enhance democratic discourse

- For researchers and policymakers
    - Evidence-based intervention design
    - Population-level behavior insights
    - Program evaluation metrics
    - Democratic health indicators


In [92]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [93]:
# Read the cleaned working dataset produced by Notebook 01
clean_dataset_path = "../data/processed/working_dataset_clean.csv"
working_data = pd.read_csv(clean_dataset_path)


#### Quick memory optimization

In [94]:
# Baseline footprint in MB, measured before any dtype conversion
bytes_per_column = working_data.memory_usage(deep=True)
memory_before = bytes_per_column.sum() / 1024**2
print(f"   Memory usage: {memory_before:.2f} MB")


   Memory usage: 32.05 MB


In [95]:
# List the object-dtype columns together with their cardinality
object_cols = list(working_data.select_dtypes(include=["object"]).columns)
print(f"   Object columns: {len(object_cols)}")

for name, n_unique in working_data[object_cols].nunique().items():
    print(f"   {name}: {n_unique} unique values")


   Object columns: 24
   country: 12 unique values
   gender: 2 unique values
   education: 10 unique values
   income: 4 unique values
   use_internet_general: 9 unique values
   use_news_general: 9 unique values
   use_news_main: 12 unique values
   use_news_avoidance: 5 unique values
   use_news_worn_out: 5 unique values
   use_news_tvshows: 2 unique values
   use_news_tvchannels: 2 unique values
   use_news_radio: 2 unique values
   use_news_newspapers_print: 2 unique values
   use_news_magazines_print: 2 unique values
   use_news_newspapers_online: 2 unique values
   use_news_magazines_online: 2 unique values
   use_news_broadcasting_online: 2 unique values
   use_news_other_online: 2 unique values
   use_news_sns: 2 unique values
   use_news_blogs: 2 unique values
   use_news_none: 2 unique values
   interest_in_news: 6 unique values
   interest_in_politics: 6 unique values
   political_orientation: 8 unique values


In [96]:
# Cast the object columns to category in one pass; "uid" is an identifier and is left as is
categorical_dtypes = {col: "category" for col in object_cols if col != "uid"}
working_data = working_data.astype(categorical_dtypes)


In [97]:
# Report dtype and cardinality of every column after the conversion
for col, dtype in working_data.dtypes.items():
    unique_count = working_data[col].nunique()
    print(f"  {col}: {dtype} ({unique_count:,} unique values)")


  uid: int64 (24,190 unique values)
  country: category (12 unique values)
  weight: float64 (2,914 unique values)
  gender: category (2 unique values)
  age: int64 (79 unique values)
  education: category (10 unique values)
  income: category (4 unique values)
  use_internet_general: category (9 unique values)
  use_news_general: category (9 unique values)
  use_news_main: category (12 unique values)
  use_news_avoidance: category (5 unique values)
  use_news_worn_out: category (5 unique values)
  use_news_tvshows: category (2 unique values)
  use_news_tvchannels: category (2 unique values)
  use_news_radio: category (2 unique values)
  use_news_newspapers_print: category (2 unique values)
  use_news_magazines_print: category (2 unique values)
  use_news_newspapers_online: category (2 unique values)
  use_news_magazines_online: category (2 unique values)
  use_news_broadcasting_online: category (2 unique values)
  use_news_other_online: category (2 unique values)
  use_news_sns: categ

In [98]:
# Compare the footprint after conversion with the baseline taken earlier
memory_after = working_data.memory_usage(deep=True).sum() / 1024**2
memory_saved = memory_before - memory_after
memory_reduction = (memory_saved / memory_before) * 100

memory_report = [
    f"   Memory before conversion: {memory_before:.2f} MB",
    f"   Memory usage after conversion: {memory_after:.2f} MB",
    "",
    f"   Memory saved: {memory_saved:.2f} MB",
    f"   Reduction: {memory_reduction:.1f}%",
]
print("\n".join(memory_report))


   Memory before conversion: 32.05 MB
   Memory usage after conversion: 1.12 MB

   Memory saved: 30.93 MB
   Reduction: 96.5%


In [99]:
# Count how many columns share each dtype
dtype_counts = working_data.dtypes.value_counts().to_dict()
print(f"Data types: {dtype_counts}")


Data types: {CategoricalDtype(categories=['No', 'Yes'], ordered=False, categories_dtype=object): 12, dtype('int64'): 2, CategoricalDtype(categories=['10+ times a day', '2-3 days a week', '2-5 times a day',
                  '4-6 days a week', '6-10 times a day', 'don't know',
                  'less than once a week', 'once a day', 'once a week'],
, ordered=False, categories_dtype=object): 2, CategoricalDtype(categories=['don't know', 'extremely interested',
                  'not at all interested', 'not very interested',
                  'somewhat interested', 'very interested'],
, ordered=False, categories_dtype=object): 2, CategoricalDtype(categories=['AT', 'AU', 'BR', 'DE', 'ES', 'JP', 'KR', 'NL', 'NO', 'RO',
                  'UK', 'US'],
, ordered=False, categories_dtype=object): 1, dtype('float64'): 1, CategoricalDtype(categories=['f', 'm'], ordered=False, categories_dtype=object): 1, CategoricalDtype(categories=['bachelors or equivalent', 'doctoral or equivalent',
           

In [100]:
# Hand-maintained overview: kind of column -> how many columns of that kind
dtype_summary = {
    "Binary (Yes/No)": 12,
    "Frequency scales": 2,
    "Interest scales": 2,
    "Country": 1,
    "Gender": 1,
    "Education": 1,
    "Income": 1,
    "News main source": 1,
    "News avoidance": 1,
    "News fatigue": 1,
    "Political orientation": 1,
    "Numeric (int64)": 2,
    "Numeric (float64)": 1,
}

summary_lines = [
    f"   {label:<20}: {n_cols:>2} columns" for label, n_cols in dtype_summary.items()
]
print("\n".join(summary_lines))

# Grand total across all kinds
total_columns = sum(dtype_summary.values())
print(f"    {total_columns} columns")


   Binary (Yes/No)     : 12 columns
   Frequency scales    :  2 columns
   Interest scales     :  2 columns
   Country             :  1 columns
   Gender              :  1 columns
   Education           :  1 columns
   Income              :  1 columns
   News main source    :  1 columns
   News avoidance      :  1 columns
   News fatigue        :  1 columns
   Political orientation:  1 columns
   Numeric (int64)     :  2 columns
   Numeric (float64)   :  1 columns
    27 columns


# Explaining rationale behind target variable strategy

## To avoid circular reasoning and data leakage

- Research (e.g.: Pennycook & Rand (2019), Guess et al. (2020)) shows that  low political interest leads to reduced critical evaluation, thus, likely increasing a person's risk of being vulnerable to misinformation. 
- Meaning, if we are not careful about excluding attitude/interest variables from our target variable creation, this would bias the model because it will learn that correlation. 

- We _can_ help mitigate this by establishing external ground truth and using external misinformation susceptibility measures, such as behavioral measures like sharing fake news. We do have access to this information in Reuters' `usenews.mediacloud.2019` and `usenews.crowdtangle.2019` datasets, which capture actual media content and their associated engagement metrics (from 1.71 million Facebook posts). However, they are far too large for our current scale (~700MB and over 5GB, respectively). They would certainly allow for much more sophisticated research insight.

- Working within our constraints, we instead keep the attitude/interest variables strictly separate from the features: they are used only to build the vulnerability score (the target), which the model then predicts from demographics and media consumption patterns alone.

- As such, our division is as follows
    - Target = Internal attitudes/traits
    - Features = External behaviors/demographics

#### Separation of features and target variables 

In [101]:
# Separation of features and target variables

# Target components (internal traits): column name -> the concept it stands for
target_rationale = {
    "interest_in_politics": "Political disengagement",
    "interest_in_news": "News disengagement",
    "use_news_avoidance": "Active news avoidance",
    "use_news_worn_out": "News fatigue",
    "education": "Media literacy proxy",
}
target_components = list(target_rationale)

print(target_components)


['interest_in_politics', 'interest_in_news', 'use_news_avoidance', 'use_news_worn_out', 'education']


In [102]:
# Predictive features (observable behaviors), assembled group by group

# Demographics (not attitudes)
demographic_features = ["country", "gender", "age", "income"]

# Usage frequency (behavioral)
usage_frequency_features = ["use_internet_general", "use_news_general"]

# Main source choice (behavioral)
main_source_features = ["use_news_main"]

# Media channel usage (behavioral patterns)
media_channel_features = [
    f"use_news_{channel}"
    for channel in (
        "tvshows",
        "tvchannels",
        "radio",
        "newspapers_print",
        "magazines_print",
        "newspapers_online",
        "magazines_online",
        "broadcasting_online",
        "other_online",
        "sns",
        "blogs",
        "none",
    )
]

feature_candidates = (
    demographic_features
    + usage_frequency_features
    + main_source_features
    + media_channel_features
)

print(feature_candidates)


['country', 'gender', 'age', 'income', 'use_internet_general', 'use_news_general', 'use_news_main', 'use_news_tvshows', 'use_news_tvchannels', 'use_news_radio', 'use_news_newspapers_print', 'use_news_magazines_print', 'use_news_newspapers_online', 'use_news_magazines_online', 'use_news_broadcasting_online', 'use_news_other_online', 'use_news_sns', 'use_news_blogs', 'use_news_none']


## Vulnerability theory and scoring 

Based on research, the following factors seem to contribute heavily to misinformation vulnerability: 

1. Political disengagement
    - Rationale: low political interest -> reduced critical evaluation
    - Research: Pennycook, G., & Rand, D. G. (2019). Lazy, not biased: Susceptibility to partisan fake news is better explained by lack of reasoning than by motivated reasoning. Cognition, 188, 39–50. https://doi.org/10.1016/j.cognition.2018.06.011


2. News disengagement
    - Low news interest -> reduced information seeking
    - Research: Prior, M. (2007). Post-Broadcast Democracy: How Media Choice Increases Inequality in Political Involvement and Polarizes Elections. Cambridge University Press. https://doi.org/10.1017/CBO9781139878425


3. News avoidance 
    - Active avoidance -> information gaps
    - Research: 
        - Toff, B., & Nielsen, R. K. (2018). “I Just Google It”: Folk Theories of Distributed Discovery. Journal of Communication, 68(3), 636–657. https://doi.org/10.1093/joc/jqy009
        - Toff, B., & Nielsen, R. K. (2022). How News Feels: Anticipated Anxiety as a Factor in News Avoidance and a Barrier to Political Engagement. Political Communication, 39(6), 697–714. https://doi.org/10.1080/10584609.2022.2123073


4. News fatigue 
    - Worn out by news -> reduced attention
    - Research: Park, C. S. (2019). Does Too Much News on Social Media Discourage News Seeking? Mediating Role of News Efficacy Between Perceived News Overload and News Avoidance on Social Media. Social Media + Society, 5(3), 2056305119872956. https://doi.org/10.1177/2056305119872956


5. Low education
    - Reduced media literacy
    - Research: Allcott, H., & Gentzkow, M. (2017). Social Media and Fake News in the 2016 Election. Journal of Economic Perspectives, 31(2), 211–236. https://doi.org/10.1257/jep.31.2.211


### Component weights based on research literature

In [103]:
# Component weights based on research literature
vulnerability_weights = {
    "political_disengagement": 0.25,  # Strongest predictor
    "news_disengagement": 0.20,  # Core engagement
    "news_avoidance": 0.20,  # Behavioral indicator
    "news_fatigue": 0.15,  # Attention factor
    "education_vulnerability": 0.20,  # Fundamental capability
}

# The composite is a weighted sum of 0-100 component scores, so it only stays on
# a 0-100 scale while the weights add up to 1. Fail loudly if an edit breaks that.
assert abs(sum(vulnerability_weights.values()) - 1.0) < 1e-9, (
    "vulnerability_weights must sum to 1"
)

for component, weight in vulnerability_weights.items():
    print(f"   {component}: {weight:.0%}")


   political_disengagement: 25%
   news_disengagement: 20%
   news_avoidance: 20%
   news_fatigue: 15%
   education_vulnerability: 20%


### Converting categorical responses to vulnerability scores (0-100)

In [104]:
# Converting categorical responses to vulnerability scores (0-100)

# Interest scale shared by both interest questions, reversed:
# the less interested the respondent, the higher the score.
interest_to_disengagement = {
    "extremely interested": 0,
    "very interested": 20,
    "somewhat interested": 40,
    "not very interested": 70,
    "not at all interested": 100,
    "don't know": 60,
}

# Political Interest → Political Disengagement (reversed)
political_interest_map = dict(interest_to_disengagement)

# News Interest → News Disengagement (reversed)
news_interest_map = dict(interest_to_disengagement)

# News Avoidance → Direct mapping (keys are spelled exactly as the dataset labels them)
news_avoidance_map = dict(
    [
        ("never", 0),
        ("occassionally", 30),
        ("sometimes", 60),
        ("often", 100),
        ("don't know", 40),
    ]
)

# News Fatigue → Direct mapping (agreement = higher vulnerability), in even steps of 25
fatigue_levels = [
    "strongly disagree",
    "tend to disagree",
    "neither agree nor disagree",
    "tend to agree",
    "strongly agree",
]
news_fatigue_map = dict(zip(fatigue_levels, range(0, 101, 25)))

# Education → Education Vulnerability (reversed: more schooling, lower score)
education_map = dict(
    [
        ("doctoral or equivalent", 0),
        ("masters or equivalent", 10),
        ("bachelors or equivalent", 20),
        ("short-cycle tertiary", 30),
        ("post secondary", 40),
        ("upper secondary", 50),
        ("lower secondary", 70),
        ("primary", 85),
        ("early childhood", 95),
        ("none", 100),
    ]
)


In [105]:
# Validate mappings cover all categories: every category present in a target
# column must have an entry in the score mapping for that column.
mappings_to_check = [
    ("interest_in_politics", political_interest_map),
    ("interest_in_news", news_interest_map),
    ("use_news_avoidance", news_avoidance_map),
    ("use_news_worn_out", news_fatigue_map),
    ("education", education_map),
]

for var, mapping in mappings_to_check:
    # Columns absent from the dataset are skipped silently
    if var not in working_data.columns:
        continue
    categories = set(working_data[var].cat.categories)
    missing = categories.difference(mapping)
    if not missing:
        print(f"    {var}: All {len(categories)} categories mapped")
    else:
        print(f"    {var}: Missing mappings for {missing}")


    interest_in_politics: All 6 categories mapped
    interest_in_news: All 6 categories mapped
    use_news_avoidance: All 5 categories mapped
    use_news_worn_out: All 5 categories mapped
    education: All 10 categories mapped


### Calculate vulnerability components based on mappings

In [106]:
# Calculate vulnerability components based on mappings

# One row per component: (label for the log line, score column, source column, mapping)
component_specs = [
    (
        "Political disengagement",
        "political_disengagement_score",
        "interest_in_politics",
        political_interest_map,
    ),
    (
        "News disengagement",
        "news_disengagement_score",
        "interest_in_news",
        news_interest_map,
    ),
    (
        "News avoidance",
        "news_avoidance_score",
        "use_news_avoidance",
        news_avoidance_map,
    ),
    (
        "News fatigue",
        "news_fatigue_score",
        "use_news_worn_out",
        news_fatigue_map,
    ),
    (
        "Education vulnerability",
        "education_vulnerability_score",
        "education",
        education_map,
    ),
]

for label, score_col, source_col, mapping in component_specs:
    # Map the category to its score, then force a numeric dtype
    working_data[score_col] = pd.to_numeric(working_data[source_col].map(mapping))
    print(f"{label} score calculated")


# Check for missing values in components
component_cols = [score_col for _, score_col, _, _ in component_specs]

for col in component_cols:
    n_missing = working_data[col].isna().sum()
    status = f"{n_missing} missing values" if n_missing > 0 else "gucci"
    print(f"    {col}: {status}")


Political disengagement score calculated
News disengagement score calculated
News avoidance score calculated
News fatigue score calculated
Education vulnerability score calculated
    political_disengagement_score: gucci
    news_disengagement_score: gucci
    news_avoidance_score: gucci
    news_fatigue_score: gucci
    education_vulnerability_score: gucci


### Calculate composite vulnerability score

In [107]:
# Calculate weighted composite score: each component score times its weight,
# added up in the order listed here.
score_weight_pairs = [
    ("political_disengagement_score", "political_disengagement"),
    ("news_disengagement_score", "news_disengagement"),
    ("news_avoidance_score", "news_avoidance"),
    ("news_fatigue_score", "news_fatigue"),
    ("education_vulnerability_score", "education_vulnerability"),
]
composite_score = sum(
    working_data[score_col] * vulnerability_weights[weight_key]
    for score_col, weight_key in score_weight_pairs
)

# Round to 1 decimal place for interpretability
working_data["vulnerability_score"] = composite_score.round(1)


final_scores = working_data["vulnerability_score"]
print(f"    Composite vulnerability score calculated")
for stat_label, stat_value in [
    ("Mean", final_scores.mean()),
    ("Std", final_scores.std()),
    ("Min", final_scores.min()),
    ("Max", final_scores.max()),
    ("Median", final_scores.median()),
]:
    print(f"        {stat_label}: {stat_value:.1f}")


    Composite vulnerability score calculated
        Mean: 35.6
        Std: 15.2
        Min: 0.0
        Max: 100.0
        Median: 35.5


- Engagement levels

    - Score 35.6: 'Medium-Low vulnerability'; 'Medium-High news engagement'
    - Most people are reasonably engaged

- Population insights

    - Average person: Moderately engaged with news/politics
    - Standard deviation: 15.2 points variation
    - ~68% of people: 20.4 - 50.8 range (1 std)
    - ~95% of people: 5.2 - 66.0 range (2 std)

### Create risk categories

In [108]:
# Define risk categories from fixed, evenly spaced cut-offs on the 0-100 scale
# (these are not data-driven quartiles)
def categorize_vulnerability(score):
    """Bucket a 0-100 vulnerability score into a risk label.

    Parameters
    ----------
    score : float
        Composite vulnerability score.

    Returns
    -------
    str or float
        "Low" (< 25), "Medium" (< 50), "High" (< 75) or "Very High" (>= 75).
        A missing score returns NaN so it stays missing instead of being
        labelled "Very High" (every `<` comparison against NaN is False).
    """
    if pd.isna(score):
        return np.nan
    if score < 25:
        return "Low"
    elif score < 50:
        return "Medium"
    elif score < 75:
        return "High"
    else:
        return "Very High"


# Label every respondent with a risk category and store it as a categorical column
working_data["vulnerability_category"] = (
    working_data["vulnerability_score"]
    .apply(categorize_vulnerability)
    .astype("category")
)


In [109]:
# Set proper category order for plotting and analysis.
# set_categories is used instead of reorder_categories because the latter raises
# a ValueError when a risk level has no rows (e.g. on a subsample); with all four
# levels present the result is the same.
risk_levels = ["Low", "Medium", "High", "Very High"]
working_data["vulnerability_category"] = working_data[
    "vulnerability_category"
].cat.set_categories(risk_levels)

print(f"RISK CATEGORY DISTRIBUTION:")
category_counts = working_data["vulnerability_category"].value_counts()
total_count = len(working_data)

# Unweighted share of respondents per risk level, printed in severity order
for category in risk_levels:
    count = category_counts.get(category, 0)
    pct = (count / total_count) * 100
    print(f"   {category:>9}: {count:>6,} ({pct:>5.1f}%)")


RISK CATEGORY DISTRIBUTION:
         Low:  6,037 ( 25.0%)
      Medium: 14,056 ( 58.1%)
        High:  3,855 ( 15.9%)
   Very High:    242 (  1.0%)


In [110]:
# Population weighted stats (using `weight` variable we saved earlier)

if "weight" not in working_data.columns:
    print(f"No survey weights available - using unweighted statistics")

else:
    scores = working_data["vulnerability_score"]
    survey_weights = working_data["weight"]
    risk_levels_weighted = ["Low", "Medium", "High", "Very High"]

    # Weighted mean and weighted standard deviation of the score
    weighted_mean = np.average(scores, weights=survey_weights)
    weighted_variance = np.average(
        (scores - weighted_mean) ** 2, weights=survey_weights
    )
    weighted_std = np.sqrt(weighted_variance)

    # Share of the total survey weight that falls in each risk level
    total_weight = survey_weights.sum()
    weighted_category_dist = {
        level: (
            survey_weights[working_data["vulnerability_category"] == level].sum()
            / total_weight
        )
        * 100
        for level in risk_levels_weighted
    }

    unweighted_mean = scores.mean()
    print(f"Population weighted statistics:")
    print(f"   Weighted mean vulnerability: {weighted_mean:.1f}")
    print(f"   Weighted std deviation: {weighted_std:.1f}")
    print(f"   Unweighted mean: {unweighted_mean:.1f}")
    print(f"   Unweighted std: {scores.std():.1f}")
    print(f"   Mean difference: {abs(weighted_mean - unweighted_mean):.1f} points")

    print()
    print(f"Weighted vs unweighted risk distribution:")
    print(f"{'Category':<12} {'Unweighted %':<13} {'Weighted %':<11} {'Difference'}")
    print("-" * 50)
    # category_counts and total_count come from the risk category distribution cell
    for level in risk_levels_weighted:
        unweighted_pct = (category_counts.get(level, 0) / total_count) * 100
        weighted_pct = weighted_category_dist[level]
        print(
            f"{level:<12} {unweighted_pct:<13.1f} {weighted_pct:<11.1f} {weighted_pct - unweighted_pct:>+6.1f}"
        )

    print()
    print(f"Population weights show representative estimates")


Population weighted statistics:
   Weighted mean vulnerability: 36.0
   Weighted std deviation: 15.2
   Unweighted mean: 35.6
   Unweighted std: 15.2
   Mean difference: 0.4 points

Weighted vs unweighted risk distribution:
Category     Unweighted %  Weighted %  Difference
--------------------------------------------------
Low          25.0          24.2          -0.8
Medium       58.1          58.3          +0.2
High         15.9          16.5          +0.6
Very High    1.0           1.0           +0.0

Population weights show representative estimates


### Feature-Target Separation Validation

In [111]:
# Feature-Target Separation Validation

# Target components (excluded from features).
# The list is reused from the "Separation of features and target variables" cell
# rather than typed out a second time, so the validation always runs against the
# exact columns that were used to build the target.
print(f"target components (separated from features):")
for i, comp in enumerate(target_components, 1):
    print(f"   {i}. {comp}")


target components (separated from features):
   1. interest_in_politics
   2. interest_in_news
   3. use_news_avoidance
   4. use_news_worn_out
   5. education


In [112]:
# Feature candidates (for ML model).
# `feature_candidates` is reused from the "Predictive features" cell rather than
# re-declared, so this summary cannot drift from the list defined there.

# Channel variables are the use_news_* columns other than the frequency and
# main-source questions.
non_channel_features = {"use_news_general", "use_news_main"}
channel_features = [
    f
    for f in feature_candidates
    if "use_news_" in f and f not in non_channel_features
]

print(f"Feature candidates:")
print(f"    Demographics: country, gender, age, income")
print(f"    Usage patterns: use_internet_general, use_news_general")
print(f"    Main source: use_news_main")
print(f"    Media channels: {len(channel_features)} channel variables")
print(f"    Total features: {len(feature_candidates)} variables")


Feature candidates:
    Demographics: country, gender, age, income
    Usage patterns: use_internet_general, use_news_general
    Main source: use_news_main
    Media channels: 12 channel variables
    Total features: 19 variables


In [113]:
# Variables kept out of the feature set because they sit too close to the target
exclusion_reason = "similar to political interest"
excluded_vars = ["political_orientation"]

print(f"Excluded variable (too close to target):")
for name in excluded_vars:
    print(f"   • {name} ({exclusion_reason})")


Excluded variable (too close to target):
   • political_orientation (similar to political interest)


### Validation checks

In [114]:
# Validation checks
print(f"Separation validation:")

# A column must never be both part of the target and a predictor
overlap = set(target_components) & set(feature_candidates)
if overlap:
    print(f"    Overlap detected: {overlap}")
else:
    print(f"    No overlap between target and feature variables")

# Check all target components exist
missing_target_components = [
    comp for comp in target_components if comp not in working_data.columns
]
if missing_target_components:
    print(f"    Missing target components: {missing_target_components}")
else:
    print(f"    All target components present in dataset")

# Check all feature candidates exist
missing_feature_candidates = [
    feat for feat in feature_candidates if feat not in working_data.columns
]
if missing_feature_candidates:
    print(f"    Missing feature candidates: {missing_feature_candidates}")
else:
    print(f"    All feature candidates present in dataset")

# Report success only when every check above passed; previously this line was
# printed unconditionally, even right after an overlap or a missing column.
if overlap or missing_target_components or missing_feature_candidates:
    print(f"    Separation check FAILED - review the messages above")
else:
    print(f"    Clean separation maintained")


Separation validation:
    No overlap between target and feature variables
    All target components present in dataset
    All feature candidates present in dataset
    Clean separation maintained


## Checkpoint: Saving dataset with target variables

In [115]:
# Checkpoint: write the dataset including the target columns, without the index
target_dataset_path = "../data/processed/dataset_with_target.csv"
working_data.to_csv(target_dataset_path, index=False)
print("Main dataset saved: dataset_with_target.csv")


Main dataset saved: dataset_with_target.csv


# Initial feature engineering (simple transformation)

### Create feature-only dataset

In [116]:
# Create feature-only dataset (an independent copy restricted to the candidate columns)
feature_data = working_data.loc[:, feature_candidates].copy()
print(f"Feature dataset created: {feature_data.shape}")

# Check for missing values in features
missing_summary = feature_data.isna().sum()
features_with_missing = missing_summary[missing_summary > 0]

if features_with_missing.empty:
    print("No missing values in feature candidates")
else:
    print("Features with missing values:")
    n_rows = len(feature_data)
    for feature, count in features_with_missing.items():
        print(f"  {feature}: {count} ({(count / n_rows) * 100:.1f}%)")


Feature dataset created: (24190, 19)
No missing values in feature candidates


### Categorical encoding

In [117]:
#  Categorical encoding

# Identify categorical features among the candidates that made it into feature_data
categorical_features = [
    col
    for col in feature_candidates
    if col in feature_data.columns and feature_data[col].dtype == "category"
]

print(f"Categorical features to encode: {len(categorical_features)}")

# Binary encoding for Yes/No variables: columns whose categories are exactly Yes and No
binary_features = [
    col
    for col in categorical_features
    if set(feature_data[col].cat.categories) == {"Yes", "No"}
]

print(f"Binary features (Yes/No): {len(binary_features)}")

# Apply binary encoding: "Yes" becomes 1, anything else becomes 0
for col in binary_features:
    is_yes = feature_data[col] == "Yes"
    feature_data[f"{col}_binary"] = is_yes.astype(int)
    print(f"  {col} -> {col}_binary")


Categorical features to encode: 18
Binary features (Yes/No): 12
  use_news_tvshows -> use_news_tvshows_binary
  use_news_tvchannels -> use_news_tvchannels_binary
  use_news_radio -> use_news_radio_binary
  use_news_newspapers_print -> use_news_newspapers_print_binary
  use_news_magazines_print -> use_news_magazines_print_binary
  use_news_newspapers_online -> use_news_newspapers_online_binary
  use_news_magazines_online -> use_news_magazines_online_binary
  use_news_broadcasting_online -> use_news_broadcasting_online_binary
  use_news_other_online -> use_news_other_online_binary
  use_news_sns -> use_news_sns_binary
  use_news_blogs -> use_news_blogs_binary
  use_news_none -> use_news_none_binary


### Ordinal encoding for ordered categories

In [118]:
# Ordinal encoding for ordered categories
ordinal_features = []

# Income is ordinal
if "income" in categorical_features:
    # NOTE(review): "Unknown" is encoded as 3, i.e. ranked above "high". The codes
    # are kept unchanged here; consider a separate missing indicator instead.
    income_order = ["low", "medium", "high", "Unknown"]
    income_mapping = {level: i for i, level in enumerate(income_order)}
    # .map() on a categorical column returns another categorical, so the result
    # is converted with pd.to_numeric to obtain a real numeric column.
    feature_data["income_ordinal"] = pd.to_numeric(
        feature_data["income"].map(income_mapping)
    )
    ordinal_features.append("income")
    print(f"    income -> income_ordinal (ordinal)")

# Education is ordinal
if "education" in categorical_features:
    # Levels from lowest to highest. "early childhood" comes before "primary",
    # matching the ranking already used by education_map (the two were swapped).
    education_order = [
        "none",
        "early childhood",
        "primary",
        "lower secondary",
        "upper secondary",
        "post secondary",
        "short-cycle tertiary",
        "bachelors or equivalent",
        "masters or equivalent",
        "doctoral or equivalent",
    ]
    education_mapping = {level: i for i, level in enumerate(education_order)}
    feature_data["education_ordinal"] = pd.to_numeric(
        feature_data["education"].map(education_mapping)
    )
    ordinal_features.append("education")
    print(f"    education -> education_ordinal (ordinal)")


    income -> income_ordinal (ordinal)


### One-hot encoding for nominal categories

In [119]:
# One-hot encoding for nominal categories, i.e. the categorical features that
# were neither binary-encoded nor ordinal-encoded above
already_encoded = set(binary_features) | set(ordinal_features)
nominal_features = [
    col for col in categorical_features if col not in already_encoded
]

print(f"Nominal features for one-hot encoding: {len(nominal_features)}")

# Apply one-hot encoding, one block of dummy columns per nominal feature
encoded_dfs = []
for col in nominal_features:
    dummies = pd.get_dummies(feature_data[col], prefix=col, prefix_sep="_")
    encoded_dfs.append(dummies)
    print(f"  {col} -> {dummies.shape[1]} dummy variables")

# Combine all encoded features side by side; fall back to an empty frame with
# the same index when there was nothing to encode
if not encoded_dfs:
    encoded_features = pd.DataFrame(index=feature_data.index)
    print("No nominal features to one-hot encode")
else:
    encoded_features = pd.concat(encoded_dfs, axis=1)
    print(f"One-hot encoded features: {encoded_features.shape}")


Nominal features for one-hot encoding: 5
  country -> 12 dummy variables
  gender -> 2 dummy variables
  use_internet_general -> 9 dummy variables
  use_news_general -> 9 dummy variables
  use_news_main -> 12 dummy variables
One-hot encoded features: (24190, 44)


### Numeric feature engineering

In [121]:
# Age is already numeric, but create age groups as additional feature
if "age" in feature_data.columns:
    feature_data["age_group"] = pd.cut(
        feature_data["age"],
        # Bins are right-inclusive: (0, 30], (30, 45], (45, 60], (60, inf].
        # The top bin is open-ended so ages above 100 still land in "60+"
        # instead of becoming NaN; results for ages up to 100 are unchanged.
        bins=[0, 30, 45, 60, np.inf],
        labels=["18-30", "31-45", "46-60", "60+"],
    )
    # Convert age groups to dummy variables
    age_dummies = pd.get_dummies(feature_data["age_group"], prefix="age_group")


In [None]:
# Create media usage count features
# Candidate channels: every binary "use_news_*" column except the general one.
media_usage_cols = [
    name
    for name in binary_features
    if name != "use_news_general" and "use_news_" in name
]

if media_usage_cols:
    # Substrings that assign a channel to the traditional or the digital family
    traditional_keywords = ("tv", "radio", "print")
    digital_keywords = ("online", "sns", "blogs")

    traditional_cols = [
        name
        for name in media_usage_cols
        if any(keyword in name for keyword in traditional_keywords)
    ]
    digital_cols = [
        name
        for name in media_usage_cols
        if any(keyword in name for keyword in digital_keywords)
    ]

    # Total channels used: row-wise sum over every "<channel>_binary" column
    # (presumably 0/1 flags produced by the binary-encoding cell — verify there)
    media_usage_binary = feature_data.loc[
        :, [f"{name}_binary" for name in media_usage_cols]
    ]
    feature_data["total_media_channels"] = media_usage_binary.sum(axis=1)

    # Family counts are only added when the family has at least one channel
    if traditional_cols:
        traditional_binary = feature_data.loc[
            :, [f"{name}_binary" for name in traditional_cols]
        ]
        feature_data["traditional_media_count"] = traditional_binary.sum(axis=1)

    if digital_cols:
        digital_binary = feature_data.loc[
            :, [f"{name}_binary" for name in digital_cols]
        ]
        feature_data["digital_media_count"] = digital_binary.sum(axis=1)

    print("Media usage features created:")
    print("  total_media_channels")
    print(f"  traditional_media_count ({len(traditional_cols)} channels)")
    print(f"  digital_media_count ({len(digital_cols)} channels)")


Media usage features created:
  total_media_channels
  traditional_media_count (5 channels)
  digital_media_count (6 channels)


### Combine all engineered features

In [123]:
# Combine all engineered features

# Column lists for the feature families that live inside feature_data
numeric_features = [name for name in ["age"] if name in feature_data.columns]
binary_encoded_cols = [f"{name}_binary" for name in binary_features]
ordinal_encoded_cols = [
    f"{name}_ordinal"
    for name in ("income", "education")
    if name in ordinal_features
]
usage_count_cols = [
    name
    for name in (
        "total_media_channels",
        "traditional_media_count",
        "digital_media_count",
    )
    if name in feature_data.columns
]

# Gather the non-empty pieces in a fixed order:
# numeric, binary, ordinal, one-hot, age groups, usage counts
engineered_features = []
for column_list in (numeric_features, binary_encoded_cols, ordinal_encoded_cols):
    if column_list:
        engineered_features.append(feature_data[column_list])
for dummy_frame in (encoded_features, age_dummies):
    if not dummy_frame.empty:
        engineered_features.append(dummy_frame)
if usage_count_cols:
    engineered_features.append(feature_data[usage_count_cols])

# Column-wise concatenation; with no pieces at all, keep only the index
final_features = (
    pd.concat(engineered_features, axis=1)
    if engineered_features
    else pd.DataFrame(index=feature_data.index)
)

# Summary of how many columns each family contributed
onehot_count = 0 if encoded_features.empty else encoded_features.shape[1]
age_group_count = 0 if age_dummies.empty else age_dummies.shape[1]

print(f"Final engineered features: {final_features.shape}")
print("Feature types:")
print(f"  Numeric: {len(numeric_features)}")
print(f"  Binary: {len(binary_encoded_cols)}")
print(f"  Ordinal: {len(ordinal_encoded_cols)}")
print(f"  One-hot: {onehot_count}")
print(f"  Age groups: {age_group_count}")
print(f"  Usage counts: {len(usage_count_cols)}")


Final engineered features: (24190, 65)
Feature types:
  Numeric: 1
  Binary: 12
  Ordinal: 1
  One-hot: 44
  Age groups: 4
  Usage counts: 3


## Create final dataset


In [None]:
# Attach the target columns to a copy of the engineered feature matrix.
# Column assignment aligns on the index, so working_data and final_features
# are presumably indexed identically — confirm against the cell that builds
# feature_data.
final_dataset = final_features.copy()
for target_col in ("vulnerability_score", "vulnerability_category"):
    final_dataset[target_col] = working_data[target_col]

# Carry the survey weight along when present (for population statistics)
has_weight = "weight" in working_data.columns
if has_weight:
    final_dataset["weight"] = working_data["weight"]
    print("Survey weight included (for statistics only)")


Survey weight included (for statistics only)


In [125]:
# Final sanity check: count nulls per column and report any column that has some
missing_final = final_dataset.isnull().sum()
features_with_missing_final = missing_final.loc[missing_final.gt(0)]

if features_with_missing_final.empty:
    print("No missing values in final dataset")
else:
    print("Warning: Missing values in final dataset:")
    total_rows = len(final_dataset)
    for column_name, n_missing in features_with_missing_final.items():
        # Share of rows with a null in this column, as a percentage
        share = n_missing / total_rows * 100
        print(f"  {column_name}: {n_missing} ({share:.1f}%)")


No missing values in final dataset


In [126]:
# Write the engineered dataset to CSV without the index column
engineered_csv_path = "../data/processed/dataset_engineered.csv"
final_dataset.to_csv(engineered_csv_path, index=False)
