# Notebook 1: Data Loading and Initial Exploration

In [18]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [19]:
# Load the survey dataset (semicolon-delimited file).
# read_csv already returns a freshly allocated DataFrame, so no additional
# .copy() is needed; copying would only duplicate the frame in memory.
survey_data = pd.read_csv("../data/raw/reuters_survey2019.csv", sep=";")

# Initial memory usage; deep=True also counts the contents of object (string) columns
memory_mb = survey_data.memory_usage(deep=True).sum() / 1024**2
print(f"   Memory usage: {memory_mb:.1f} MB")


   Memory usage: 62.5 MB


## Cleaning

### Renaming some columns for better legibility

- The survey contains questions on the consumption of specific media outlets. The corresponding columns (`weekly_use_*` and `heavier_use_*`) carry the outlet's `media_id` as a suffix. For better readability in case it is needed later, we replace the `media_id` suffix with the `media_name`. We matched each id to its name by extracting both from an accompanying dataset. This was done separately (in a Kaggle notebook) because that file was quite large (~500 MB).

In [20]:
# Media mapping dictionary: media name -> numeric media_id.
# The ids are the suffixes of the survey's `weekly_use_<id>` / `heavier_use_<id>`
# columns; the names are what those suffixes get replaced with.
# NOTE(review): names are presumably copied verbatim from the accompanying dataset
# (hence mixed styles such as "msnbc.com 1" or "kbs-ko") - confirm before normalizing.
media_dict = {
    "New York Times": 1,
    "nrk.no": 105926,
    "bandnewstv.band.uol.com.br": 107106,
    "tv2.no": 109079,
    "FOX News": 1092,
    "BBC": 1094,
    "CNN": 1095,
    "marca.com": 110083,
    "web.de": 110307,
    "romaniatv.net": 110410,
    "meinbezirk.at": 110719,
    "click.ro": 110824,
    "t-online.de": 110952,
    "stiripesurse.ro": 110963,
    "rtlnieuws.nl": 119661,
    "abcnyheter.no": 123356,
    "Daily Mail": 1747,
    "The Sydney Morning Herald": 19320,
    "Sky News": 19524,
    "spiegel.de": 19831,
    "Washington Post": 2,
    "Australian Broadcast Company (ABC)": 20775,
    "naver.com": 208880,
    "lasexta.com": 212150,
    "elconfidencial.com": 21886,
    "bild.de": 22009,
    "n-tv.de": 23538,
    "heute.at": 23588,
    "Huffington Post": 27502,
    "stirileprotv.ro": 282092,
    "digi24.ro": 282212,
    "gmx.net": 282257,
    "msnbc.com 1": 293951,
    "The Guardian UK": 300560,
    "okdiario.com": 326845,
    "nettavisen": 38749,
    "realitatea": 38830,
    "derStandard": 39179,
    "krone": 39194,
    "Focus": 39463,
    "ORF": 39480,
    "dagbladet": 39516,
    "abc": 39848,
    "adevarul": 39952,
    "mediafax-en": 40094,
    "cancan": 40221,
    "kleinezeitung": 40421,
    "elperiodico": 40450,
    "20minutos": 40499,
    "observator": 40555,
    "kurier-at": 40620,
    "evz": 40809,
    "kbs-ko": 40932,
    "aftenposten": 41062,
    "antena3noticias": 41255,
    "libertatea": 41366,
    "elMundo": 41526,
    "telecinco": 41589,
    "hotnews": 41650,
    "antena3": 41653,
    "folha": 41736,
    "ziare": 41831,
    "News.com.au": 41991,
    "lavanguardia": 42090,
    "gmx.at": 515086,
    "au.yahoo.com": 540452,
    "telegraaf.nl": 55612,
    "Globo": 60427,
    "Record News": 623507,
    "MSN - Brazil (Portuguese)": 623515,
    "UOL": 65509,
    "news.chosun.com": 66295,
    "9 News": 68328,
    "eldiario.es": 68711,
    "Yahoo News - Latest News & Headlines": 751082,
    "Terra.com.br": 83354,
}

# Create reverse mapping (media_id to media_name).
# A plain dict comprehension would silently keep only the last name when two
# outlets share the same id, so the inverse is built explicitly and fails loudly
# on a duplicate instead of dropping an outlet.
id_to_name = {}
for outlet_name, outlet_id in media_dict.items():
    if outlet_id in id_to_name:
        raise ValueError(
            f"Duplicate media_id {outlet_id}: "
            f"{id_to_name[outlet_id]!r} and {outlet_name!r}"
        )
    id_to_name[outlet_id] = outlet_name


# Function to rename columns
def rename_columns(df, id_to_name_dict):
    """Replace numeric media_id column suffixes with media names.

    Only columns whose name starts with ``weekly_use_`` or ``heavier_use_`` are
    considered. The text after the last underscore is parsed as an integer
    media_id; when that id is a key of ``id_to_name_dict`` the column is renamed
    to ``<prefix>_<media name>``. All other columns are left unchanged. Columns
    whose suffix is not an integer (for example already-renamed columns) are
    skipped instead of raising ``ValueError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed. It is not modified.
    id_to_name_dict : dict
        Mapping from integer media_id to media name.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.rename(columns=...)`` with the collected renames.
    """
    media_prefixes = ("weekly_use_", "heavier_use_")
    new_columns = {}

    for col in df.columns:
        # Non-string labels cannot carry one of the prefixes
        if not isinstance(col, str) or not col.startswith(media_prefixes):
            continue

        # Split at the last underscore: "<prefix>_<media_id>"
        prefix, _, suffix = col.rpartition("_")
        try:
            media_id = int(suffix)
        except ValueError:
            # Suffix is not an id (e.g. the column was renamed already): keep as is
            continue

        # Ids without a known name keep their original column name
        if media_id in id_to_name_dict:
            new_columns[col] = f"{prefix}_{id_to_name_dict[media_id]}"

    return df.rename(columns=new_columns)


# Apply the renaming
survey_data_renamed = rename_columns(survey_data, id_to_name)

# Preview the first ten media-usage columns before and after the rename
for heading, frame in (
    ("Original columns with media_id:", survey_data),
    ("Renamed columns with media names:", survey_data_renamed),
):
    media_columns = [
        name
        for name in frame.columns
        if name.startswith("weekly_use_") or name.startswith("heavier_use_")
    ]
    print(heading)
    print(media_columns[:10])
    print()

# Persist the renamed survey (without the index column)
survey_data_renamed.to_csv(
    "../data/processed/reuters_survey2019_renamed.csv", index=False
)


Original columns with media_id:
['weekly_use_1', 'weekly_use_105926', 'weekly_use_107106', 'weekly_use_109079', 'weekly_use_1092', 'weekly_use_1094', 'weekly_use_1095', 'weekly_use_110083', 'weekly_use_110307', 'weekly_use_110410']

Renamed columns with media names:
['weekly_use_New York Times', 'weekly_use_nrk.no', 'weekly_use_bandnewstv.band.uol.com.br', 'weekly_use_tv2.no', 'weekly_use_FOX News', 'weekly_use_BBC', 'weekly_use_CNN', 'weekly_use_marca.com', 'weekly_use_web.de', 'weekly_use_romaniatv.net']



In [21]:
# List every column with its 1-based position to help decide on variables of interest
all_columns = list(survey_data_renamed.columns)
print(f"All column names ({len(all_columns)} total):")
for position, column_name in enumerate(all_columns, start=1):
    print(f"{position:2d}. {column_name}")


All column names (180 total):
 1. uid
 2. country
 3. weight
 4. gender
 5. age
 6. education
 7. income
 8. use_internet_general
 9. use_news_general
10. use_news_main
11. use_news_avoidance
12. use_news_worn_out
13. use_news_tvshows
14. use_news_tvchannels
15. use_news_radio
16. use_news_newspapers_print
17. use_news_magazines_print
18. use_news_newspapers_online
19. use_news_magazines_online
20. use_news_broadcasting_online
21. use_news_other_online
22. use_news_sns
23. use_news_blogs
24. use_news_onlinecommunities
25. use_news_none
26. interest_in_news
27. interest_in_politics
28. political_orientation
29. weekly_use_New York Times
30. weekly_use_nrk.no
31. weekly_use_bandnewstv.band.uol.com.br
32. weekly_use_tv2.no
33. weekly_use_FOX News
34. weekly_use_BBC
35. weekly_use_CNN
36. weekly_use_marca.com
37. weekly_use_web.de
38. weekly_use_romaniatv.net
39. weekly_use_meinbezirk.at
40. weekly_use_click.ro
41. weekly_use_t-online.de
42. weekly_use_stiripesurse.ro
43. weekly_use_rtlnie

In [22]:
# Variables kept for analysis, assembled group by group:
# identifiers, demographics, media usage patterns, and engagement metrics
selected_variables = (
    # Identifiers & survey metadata
    ["uid", "country", "weight"]
    # Demographics
    + ["gender", "age", "education", "income"]
    # General internet & news usage
    + ["use_internet_general", "use_news_general", "use_news_main"]
    # News behavior patterns
    + ["use_news_avoidance", "use_news_worn_out"]
    # Traditional media
    + [
        "use_news_tvshows",
        "use_news_tvchannels",
        "use_news_radio",
        "use_news_newspapers_print",
        "use_news_magazines_print",
    ]
    # Digital media
    + [
        "use_news_newspapers_online",
        "use_news_magazines_online",
        "use_news_broadcasting_online",
        "use_news_other_online",
    ]
    # Social & alternative media
    + [
        "use_news_sns",
        "use_news_blogs",
        "use_news_onlinecommunities",
        "use_news_none",
    ]
    # Engagement & attitudes
    + ["interest_in_news", "interest_in_politics", "political_orientation"]
)


In [23]:
# Working dataset: an independent copy restricted to the selected variables
working_data = survey_data_renamed.loc[:, selected_variables].copy()


In [24]:
# Share of memory saved by keeping only the selected variables
renamed_bytes = survey_data_renamed.memory_usage(deep=True).sum()
working_bytes = working_data.memory_usage(deep=True).sum()
reduction_pct = (renamed_bytes - working_bytes) / renamed_bytes * 100
print(f"Memory usage reduction: {reduction_pct:.1f}%")


Memory usage reduction: 44.9%


In [25]:
# Report dtype and number of distinct values per column;
# for object-dtype columns additionally list the distinct values

print("Current data types:")
for var, column in working_data.items():
    print(f"{var}: {column.dtype} ({column.nunique():,} unique values)")
    if column.dtype == "object":
        print(f"    unique values: {column.unique()}")


Current data types:
uid: int64 (24,190 unique values)
country: object (12 unique values)
    unique values: ['UK' 'ES' 'DE' 'AU' 'US' 'JP' 'BR' 'AT' 'NL' 'NO' 'KR' 'RO']
weight: object (2,914 unique values)
    unique values: ['0,929872634328135' '0,753771738300143' '0,815144220929541' ...
 '0,892653700280155' '1,054105562932110' '0,927531113044941']
gender: object (2 unique values)
    unique values: ['f' 'm']
age: int64 (79 unique values)
education: object (10 unique values)
    unique values: ['masters or equivalent' 'short-cycle tertiary' 'post secondary'
 'upper secondary' 'bachelors or equivalent' 'doctoral or equivalent'
 'lower secondary' 'early childhood' 'primary' 'none']
income: object (3 unique values)
    unique values: ['low' 'medium' 'high' nan]
use_internet_general: object (9 unique values)
    unique values: ['6-10 times a day' '2-5 times a day' '10+ times a day' 'once a day'
 '4-6 days a week' "don't know" '2-3 days a week' 'once a week'
 'less than once a week']
use_

In [26]:
# Drop the `use_news_onlinecommunities` column because it has 100% "No" response.
# errors="ignore" keeps the cell re-runnable: on a second execution the column is
# already gone and a plain drop would raise KeyError.
working_data = working_data.drop(
    columns=["use_news_onlinecommunities"], errors="ignore"
)

print(f"Working with {len(working_data.columns)} variables")


Working with 27 variables


In [27]:
# Survey weights
## Keeping weights for population estimates
## Excluding from ML features to prevent leakage and because they are not predictive
## Will use for weighted statistics later

### First, convert from European decimal commas to standard dot format.
### The dtype guard keeps the cell re-runnable: once the column is numeric,
### the .str accessor would raise AttributeError.
if not pd.api.types.is_numeric_dtype(working_data["weight"]):
    working_data["weight"] = (
        working_data["weight"].str.replace(",", ".", regex=False).astype(float)
    )
### Second, keep a separate single-column copy for later use
survey_weights = working_data[["weight"]].copy()


### Handle missing values

In [28]:
# Check missing values
## for `income`, replace missing data cells with 'Unknown'
## for `use_news_main`, replace missing data cells with 'No main source'
fill_values = {"income": "Unknown", "use_news_main": "No main source"}

missing_data = working_data.isnull().sum()
missing_data = missing_data[missing_data > 0]

for col, count in missing_data.items():
    pct = (count / len(working_data)) * 100
    print(f"   {col}: {count:,} missing ({pct:.1f}%)")

    # Only columns with an agreed replacement category are filled
    if col in fill_values:
        working_data[col] = working_data[col].fillna(fill_values[col])
        print(f"     Filled with '{fill_values[col]}' category")

# Verify no missing data remains before claiming success: a column without a
# fill rule would otherwise be reported as handled although it still has NaN
still_missing = working_data.isnull().sum()
unhandled_columns = list(still_missing[still_missing > 0].index)
if unhandled_columns:
    print(f"   WARNING: unhandled missing data in: {unhandled_columns}")
else:
    print("   All missing data handled")

final_missing_check = working_data.isnull().sum().sum()
print(f"   Final missing data check: {final_missing_check} missing values")


   income: 3,586 missing (14.8%)
     Filled with 'Unknown' category
   use_news_main: 997 missing (4.1%)
     Filled with 'No main source' category
   All missing data handled
   Final missing data check: 0 missing values


## Save the cleaned dataset and, separately, the survey weights

In [29]:
# Write the cleaned working dataset to disk (index omitted)
clean_dataset_path = "../data/processed/working_dataset_clean.csv"
working_data.to_csv(clean_dataset_path, index=False)

# Write the survey weights to their own file, only when the working
# dataset has a weight column
if "weight" in working_data:
    survey_weights.to_csv("../data/processed/survey_weights.csv", index=False)
