<a href="https://colab.research.google.com/github/kumpaten/masters-thesis-code/blob/main/Interbrand_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install BeautifulSoup (the `bs4` package), which a later cell imports to parse the saved HTML snippet.
!pip install beautifulsoup4

In [None]:
# Mount Google Drive so the saved HTML snippet can be read from it.
from google.colab import drive
drive.mount('/content/drive')

import json
import html
from bs4 import BeautifulSoup
import pandas as pd

# Path of the text file holding the HTML content. This assumes you copy-pasted
# the element div.m06__chart from https://interbrand.com/best-global-brands/apple/
# into a .txt file for further processing.
# Adjust the file path if necessary.
file_path = "/content/drive/MyDrive/Interbrand_HTML_content.txt"

# Read the HTML content from the file.
with open(file_path, "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML and locate the chart container that carries the brand data.
soup = BeautifulSoup(html_content, "html.parser")
div_tag = soup.find("div", class_="m06__chart")
if div_tag is None:
    raise ValueError("No <div> with class 'm06__chart' found in the file.")

# The brand data is stored as a JSON string in the 'data-brands' attribute.
# Fail with a clear message when the attribute is absent or empty instead of
# letting html.unescape() raise an opaque TypeError on None.
data_brands_str = div_tag.get("data-brands")
if not data_brands_str:
    raise ValueError(
        "The 'm06__chart' <div> has no (or an empty) 'data-brands' attribute."
    )

# NOTE(review): BeautifulSoup normally decodes HTML entities in attribute
# values already; the extra unescape is kept as a safeguard -- confirm it is
# still needed for the pasted content.
data_brands_unescaped = html.unescape(data_brands_str)

# Decode the JSON string. The following steps iterate it with .items(), so
# anything other than a JSON object is rejected here.
data_dict = json.loads(data_brands_unescaped)
if not isinstance(data_dict, dict):
    raise ValueError(
        "Expected the 'data-brands' JSON to be an object keyed by company, "
        f"got {type(data_dict).__name__}."
    )

# Companies to keep from the Interbrand data.
companies_to_filter = [
    "Apple", "Microsoft", "Facebook", "Google", "Oracle",
    "Cisco", "Salesforce", "SAP", "Accenture", "IBM"
]

# Year range to keep (inclusive on both ends).
start_year = 2008
end_year = 2021

# One dict per (company, year) observation that passes the filters.
records = []

for company, info in data_dict.items():
    # Only process companies from our filter list.
    if company not in companies_to_filter:
        continue

    # `or []` also covers keys that are present but null in the JSON, which
    # a plain .get(key, []) would return as None and crash zip().
    years = info.get("study_years") or []
    values = info.get("brand_values") or []

    # Years and values are paired positionally.
    for year, value in zip(years, values):
        # Normalise the year to int so the range comparison below cannot
        # raise TypeError if a year is encoded as a string or is null.
        # Entries without a usable year are skipped.
        try:
            year = int(year)
        except (TypeError, ValueError):
            continue

        if not start_year <= year <= end_year:
            continue

        # Convert "null" strings to Python None so they are written as
        # missing values.
        if isinstance(value, str) and value.lower() == "null":
            value = None

        records.append({
            "Company": company,
            "Year": year,
            "Interbrand_Value": value
        })

# Create a DataFrame from the records. The columns are given explicitly so
# that an empty result still has the expected header; otherwise the CSV would
# be written without column names and could not be read back.
df = pd.DataFrame(records, columns=["Company", "Year", "Interbrand_Value"])
print(df)

# Save the DataFrame to a CSV file on Google Drive (without the index column).
output_path = '/content/drive/My Drive/Interbrand_data_filtered.csv'
df.to_csv(output_path, index=False)

The code above is the web scraper. The cells below perform the data manipulation that was applied directly to the resulting CSV file.

In [None]:
import pandas as pd

# Path to the filtered CSV file on Google Drive.
csv_path = '/content/drive/My Drive/Interbrand_data_filtered.csv'

# Load the CSV file.
df = pd.read_csv(csv_path)

# -------------------------------
# For Oracle, add the known 2023 value first.
# According to the source data, Oracle's brand value for 2023 is 34622.
# This extra row serves as an anchor to interpolate the missing 2020 and 2021 values.
oracle_extra = pd.DataFrame({
    'Company': ['Oracle'],
    'Year': [2023],
    'Interbrand_Value': [34622]
})
df = pd.concat([df, oracle_extra], ignore_index=True)

# Isolate Oracle's rows and sort them chronologically.
mask_oracle = df['Company'] == 'Oracle'
df_oracle = df[mask_oracle].sort_values('Year')

print("Oracle data BEFORE interpolation:")
print(df_oracle)

# Interpolate linearly *against the Year axis*.
# method='linear' would ignore the years and treat consecutive rows as equally
# spaced; because there is no 2022 row between 2021 and the 2023 anchor, that
# would distort the filled values. method='index' uses the actual year
# distances instead.
# limit_direction='both' ensures that missing values at either end (if any) are also filled.
oracle_by_year = df_oracle.set_index('Year')['Interbrand_Value']
oracle_filled = oracle_by_year.interpolate(method='index', limit_direction='both')
# Row order is unchanged by set_index, so a positional write-back is safe.
df_oracle['Interbrand_Value'] = oracle_filled.to_numpy()

print("\nOracle data AFTER interpolation:")
print(df_oracle)

# Replace the Oracle rows in the main DataFrame with the interpolated values.
# The assignment aligns on the row index labels, so row order does not matter.
df.loc[mask_oracle, 'Interbrand_Value'] = df_oracle['Interbrand_Value']

# The dataset should only span 2008 to 2021, so remove the extra 2023 anchor row.
df = df[~((df['Company'] == 'Oracle') & (df['Year'] == 2023))]

# Save the updated DataFrame to a new CSV file on Google Drive.
output_path = '/content/drive/My Drive/Interbrand_data_filtered_oracle.csv'
df.to_csv(output_path, index=False)
print("\nUpdated data saved to:", output_path)

In [None]:
import pandas as pd

# Load the CSV written by the Oracle step. It is assumed to already contain
# Oracle's interpolated values (and any earlier Salesforce imputation).
csv_path = '/content/drive/My Drive/Interbrand_data_filtered_oracle.csv'
df = pd.read_csv(csv_path)

# Coerce 'Year' and 'Interbrand_Value' to numeric; unparsable entries become NaN.
for numeric_col in ('Year', 'Interbrand_Value'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Make sure the imputation flag column exists, defaulting to 0 (not imputed).
if 'is_imputed' not in df.columns:
    df['is_imputed'] = 0

# Companies whose first observed value appears well after 2008.
# Their pre-debut years are filled with 0 and flagged as imputed.
companies_with_delayed_IPO = ['Facebook', 'Salesforce']

for company in companies_with_delayed_IPO:
    mask = df['Company'] == company
    # Debut year = earliest year that has an observed (non-NA) value.
    observed_years = df.loc[mask & df['Interbrand_Value'].notna(), 'Year']
    debut_year = observed_years.min()
    # Rows of this company that lie strictly before its debut year.
    pre_debut = mask & (df['Year'] < debut_year)
    df.loc[pre_debut, 'Interbrand_Value'] = 0
    df.loc[pre_debut, 'is_imputed'] = 1

# (Oracle keeps is_imputed = 0 because its interpolated values are trusted.)


# Show each adjusted company in chronological order for a visual check.
for company in companies_with_delayed_IPO:
    print(f"\n{company} data after adjusting pre-debut values:")
    company_rows = df[df['Company'] == company]
    print(company_rows.sort_values('Year'))

# Write the final dataset, including the imputation flag, to a new CSV file.
output_path = '/content/drive/My Drive/Interbrand_data_filtered_complete.csv'
df.to_csv(output_path, index=False)
print("\nFinal complete dataset saved to:", output_path)
