# Package Installation Section #

In [2]:
# NOTE: this cell is only a bare string literal, not shell commands. Running it
# installs nothing; the notebook merely echoes the text back as the cell output.
# To actually install the packages, run the commands in a terminal
# (or use `%pip install ...` in a cell).
# NOTE(review): `re` is part of the Python standard library, so the
# `pip install re` line below is not needed.
'''
pip install yfinance  # Install the yfinance library
pip install openpyxl
pip install pandas
pip install re
'''

'\npip install yfinance  # Install the yfinance library\npip install openpyxl\npip install pandas\npip install re\n'

# Package Import Section #

In [1]:
import re

import pandas as pd
import yfinance as yf
from openpyxl import load_workbook

# Download S&P500 Return Index and Volatility Index (VIX) from Yahoo Finance #

In [5]:
# Download weekly S&P 500 (^GSPC) and VIX (^VIX) data from Yahoo Finance and
# build one table holding the weekly S&P 500 return (in percent) next to the
# VIX close. Needs `yfinance` imported as `yf` in the import cell.
tickers = ["^GSPC", "^VIX"]

# One request for both tickers; group_by="ticker" puts the ticker on the outer
# column level so each ticker's fields can be selected with data[ticker][field]
data = yf.download(tickers, start="2014-12-01", end="2024-11-30", interval="1wk", group_by="ticker")

# Weekly closing levels
sp500_close = data["^GSPC"]["Close"]
vix_close = data["^VIX"]["Close"]

# Week-over-week percentage change of the S&P 500 close
sp500_return = sp500_close.pct_change() * 100

# Both series share the download's date index, so they line up row by row;
# the index is simply labelled "Date" instead of round-tripping it through a column
sp500_vix = pd.DataFrame({
    "SP500_Return (%)": sp500_return,
    "VIX_Close": vix_close,
}).rename_axis("Date")

# Drop incomplete rows (e.g. the first week, which has no previous close to
# compute a return from); reassign instead of mutating in place
sp500_vix = sp500_vix.dropna()

# Optional export of the combined table
# sp500_vix.to_csv("weekly_sp500_vix.csv")

sp500_vix

[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,SP500_Return (%),VIX_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-12-08,-3.519380,21.080000
2014-12-15,3.412022,16.490000
2014-12-22,0.875093,14.500000
2014-12-29,-1.463544,17.790001
2015-01-05,-0.650563,17.549999
...,...,...
2024-10-28,-1.365680,21.879999
2024-11-04,4.656128,14.940000
2024-11-11,-2.083547,16.139999
2024-11-18,1.681589,15.240000


# Import Downloaded Dataset from Refinitiv (weekly from 01/12/2014 to 30/11/2024) #

In [3]:
# Read the 'full' worksheet of the downloaded workbook into a DataFrame
file_path = "Request finish.xlsm"
load_file = load_workbook(filename=file_path, keep_vba=True)
stock_data = load_file['full']

# `values` yields one tuple of cell values per worksheet row: the first tuple
# is consumed as the header, the remaining tuples become the data rows
data = stock_data.values
columns = next(data)
df_stock = pd.DataFrame(data=data, columns=columns)

# Preview the first rows
df_stock.head()

Unnamed: 0,Name,Code,2014-01-10 00:00:00,2014-01-17 00:00:00,2014-01-24 00:00:00,2014-01-31 00:00:00,2014-02-07 00:00:00,2014-02-14 00:00:00,2014-02-21 00:00:00,2014-02-28 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,US69608A1088(P),,,,,,,,,...,36.84,40.01,43.51,42.97,44.86,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A - ASK PRICE,US69608A1088(PA),,,,,,,,,...,36.86,40.01,43.5,42.96,44.86,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A - BID PRICE,US69608A1088(PB),,,,,,,,,...,36.85,40.0,43.49,42.95,44.85,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A - PRICE HIGH,US69608A1088(PH),,,,,,,,,...,37.47,40.29,44.38,42.99,45.07,42.57,58.48,66.0,64.44,67.16
4,PALANTIR TECHNOLOGIES A - PRICE LOW,US69608A1088(PL),,,,,,,,,...,36.59,39.4,42.62,41.65,43.645,41.59,55.3,60.91,61.37,65.47


In [4]:
# Work on an independent copy so df_stock keeps the data exactly as loaded
df = df_stock.copy(deep=True)

# Transpose the dataset to long format #

In [5]:
# Step 1: drop the weekly columns dated before December 2014 so the stock data
# covers the same period as the index data

# Date range of the columns to remove (both bounds inclusive)
start_date = "2014-01-10 00:00:00"
end_date = "2014-11-28 00:00:00"

# Parse the headers that follow the two identifier columns ('Name', 'Code');
# a header that cannot be parsed becomes NaT
datetime_cols = pd.to_datetime(df.columns[2:], format="%Y-%m-%d %H:%M:%S", errors='coerce')

# Plain positional boolean mask (one entry per date column, True = keep).
# Comparisons against NaT are False, so unparseable headers are kept.
# A positional mask avoids relying on label alignment between the parsed
# timestamps and the original column headers.
in_removed_range = (datetime_cols >= pd.Timestamp(start_date)) & (datetime_cols <= pd.Timestamp(end_date))
columns_to_keep = ~in_removed_range

# Identifier columns plus the date columns that survived the filter
df = df.iloc[:, :2].join(df.iloc[:, 2:].loc[:, columns_to_keep])

# Display the resulting DataFrame
df.head()

Unnamed: 0,Name,Code,2014-12-05 00:00:00,2014-12-12 00:00:00,2014-12-19 00:00:00,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,2015-01-16 00:00:00,2015-01-23 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,US69608A1088(P),,,,,,,,,...,36.84,40.01,43.51,42.97,44.86,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A - ASK PRICE,US69608A1088(PA),,,,,,,,,...,36.86,40.01,43.5,42.96,44.86,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A - BID PRICE,US69608A1088(PB),,,,,,,,,...,36.85,40.0,43.49,42.95,44.85,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A - PRICE HIGH,US69608A1088(PH),,,,,,,,,...,37.47,40.29,44.38,42.99,45.07,42.57,58.48,66.0,64.44,67.16
4,PALANTIR TECHNOLOGIES A - PRICE LOW,US69608A1088(PL),,,,,,,,,...,36.59,39.4,42.62,41.65,43.645,41.59,55.3,60.91,61.37,65.47


In [6]:
# Step 2: add a 'company_name' column (company only) and a 'var_name' column
# (variable only)
#
# The sheet is treated as consecutive blocks of rows: one block per company,
# one row per variable, always in the order listed in `variables`. The 'Name'
# cell of the first row of a block is used as the company name for the block.

# The variables of one block, in sheet order
variables = ["close_price", "ask_price", "bid_price", "price_high", "price_low", "price_open",
             "turnover_value", "turnover_volume", "number_trades", "vwap", "trading_volume_wa",
             "total_return", "pe", "ptbv", "dividend_yield", "operating_pm", "gross_pm",
             "net_operating_income", "net_income", "roe", "roic", "roa", "ebit", "ebitda", "dpps",
             "current_ratio", "quick_ratio", "inventory_turnover", "asset_turnover",
             "tdce", "ltdce", "interst_cover1", "interst_cover2", "cash_dividend", "shares_outstanding"]

# Block length comes from the variable list instead of a repeated literal
rows_per_company = len(variables)

# Work with row positions (not index labels) so the result does not depend on
# the DataFrame carrying a default 0..n-1 index
row_positions = range(len(df))
block_starts = [pos - pos % rows_per_company for pos in row_positions]

# Every row receives the 'Name' found on the first row of its block
df['company_name'] = df['Name'].iloc[block_starts].to_numpy()

# Every row receives the variable matching its offset inside the block
# (a shorter final block simply gets the leading variables)
df['var_name'] = [variables[pos % rows_per_company] for pos in row_positions]

# Move 'company_name' and 'var_name' to the front
df = df[['company_name', 'var_name'] + [col for col in df.columns if col not in ['company_name', 'var_name']]]

# View the resulting DataFrame
df

Unnamed: 0,company_name,var_name,Name,Code,2014-12-05 00:00:00,2014-12-12 00:00:00,2014-12-19 00:00:00,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,close_price,PALANTIR TECHNOLOGIES A,US69608A1088(P),,,,,,,...,36.84,40.01,43.51,42.97,44.860,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A,ask_price,PALANTIR TECHNOLOGIES A - ASK PRICE,US69608A1088(PA),,,,,,,...,36.86,40.01,43.50,42.96,44.860,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A,bid_price,PALANTIR TECHNOLOGIES A - BID PRICE,US69608A1088(PB),,,,,,,...,36.85,40.00,43.49,42.95,44.850,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A,price_high,PALANTIR TECHNOLOGIES A - PRICE HIGH,US69608A1088(PH),,,,,,,...,37.47,40.29,44.38,42.99,45.070,42.57,58.48,66.00,64.44,67.16
4,PALANTIR TECHNOLOGIES A,price_low,PALANTIR TECHNOLOGIES A - PRICE LOW,US69608A1088(PL),,,,,,,...,36.59,39.40,42.62,41.65,43.645,41.59,55.30,60.91,61.37,65.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,MASTERCARD,ltdce,"MASTERCARD, INC. - LONG TERM DEBT % COMMON EQUITY",US57636Q1040(WC08226),22.090,22.090,22.090,22.090,54.610,54.610,...,,,,,,,,,,
6996,MASTERCARD,interst_cover1,"MASTERCARD, INC. - EBIT/TOT INT EXPENSE RATIO",US57636Q1040(WC08291),106.813,106.813,106.813,106.813,82.279,82.279,...,,,,,,,,,,
6997,MASTERCARD,interst_cover2,"MASTERCARD, INC. - INTEREST COVER",US57636Q1040(ICBT),106.810,106.810,106.810,106.810,82.280,82.280,...,24.72,24.72,24.72,24.72,24.720,24.72,24.72,24.72,24.72,24.72
6998,MASTERCARD,cash_dividend,"MASTERCARD, INC. - CASH DIVIDENDS PAID - TOTAL",US57636Q1040(WC04551),515000.000,515000.000,515000.000,515000.000,727000.000,727000.000,...,,,,,,,,,,


In [7]:
# Step 3: count how often each variable shows up as '#ERROR'

# Rows whose 'Name' cell is exactly '#ERROR'
is_error_row = df['Name'].eq('#ERROR')
error_rows = df.loc[is_error_row]

# One row per variable together with its number of '#ERROR' rows
error_counts = (
    error_rows
    .groupby('var_name')
    .size()
    .reset_index(name='error_count')
)

# View the result
error_counts

Unnamed: 0,var_name,error_count
0,current_ratio,43
1,dpps,2
2,gross_pm,30
3,interst_cover1,4
4,inventory_turnover,76
5,ltdce,3
6,net_operating_income,200
7,ptbv,2
8,quick_ratio,43
9,roe,3


In [8]:
# Step 4: remove, for all companies, every variable with error_count > 60 (30%)

# Names of the variables above the threshold
too_many_errors = error_counts['error_count'] > 60
variables_to_drop = error_counts.loc[too_many_errors, 'var_name']

# Keep only the rows belonging to the remaining variables
is_kept_variable = ~df['var_name'].isin(variables_to_drop)
df = df[is_kept_variable]

# View the updated DataFrame
# (in the recorded run, inventory_turnover and net_operating_income are dropped here)
df

Unnamed: 0,company_name,var_name,Name,Code,2014-12-05 00:00:00,2014-12-12 00:00:00,2014-12-19 00:00:00,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,close_price,PALANTIR TECHNOLOGIES A,US69608A1088(P),,,,,,,...,36.84,40.01,43.51,42.97,44.860,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A,ask_price,PALANTIR TECHNOLOGIES A - ASK PRICE,US69608A1088(PA),,,,,,,...,36.86,40.01,43.50,42.96,44.860,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A,bid_price,PALANTIR TECHNOLOGIES A - BID PRICE,US69608A1088(PB),,,,,,,...,36.85,40.00,43.49,42.95,44.850,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A,price_high,PALANTIR TECHNOLOGIES A - PRICE HIGH,US69608A1088(PH),,,,,,,...,37.47,40.29,44.38,42.99,45.070,42.57,58.48,66.00,64.44,67.16
4,PALANTIR TECHNOLOGIES A,price_low,PALANTIR TECHNOLOGIES A - PRICE LOW,US69608A1088(PL),,,,,,,...,36.59,39.40,42.62,41.65,43.645,41.59,55.30,60.91,61.37,65.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,MASTERCARD,ltdce,"MASTERCARD, INC. - LONG TERM DEBT % COMMON EQUITY",US57636Q1040(WC08226),22.090,22.090,22.090,22.090,54.610,54.610,...,,,,,,,,,,
6996,MASTERCARD,interst_cover1,"MASTERCARD, INC. - EBIT/TOT INT EXPENSE RATIO",US57636Q1040(WC08291),106.813,106.813,106.813,106.813,82.279,82.279,...,,,,,,,,,,
6997,MASTERCARD,interst_cover2,"MASTERCARD, INC. - INTEREST COVER",US57636Q1040(ICBT),106.810,106.810,106.810,106.810,82.280,82.280,...,24.72,24.72,24.72,24.72,24.720,24.72,24.72,24.72,24.72,24.72
6998,MASTERCARD,cash_dividend,"MASTERCARD, INC. - CASH DIVIDENDS PAID - TOTAL",US57636Q1040(WC04551),515000.000,515000.000,515000.000,515000.000,727000.000,727000.000,...,,,,,,,,,,


In [9]:
# Step 5: derive a clean 'company_code' column from the 'Code' column
# (the 'Name' and 'Code' columns themselves are dropped in the next cell)


def _company_code(raw_code):
    """Return the part of ``raw_code`` that precedes the first '('.

    For example 'US69608A1088(P)' gives 'US69608A1088'. Returns None when
    ``raw_code`` is not a string or has no character before a '('.
    """
    if not isinstance(raw_code, str):
        return None
    match = re.match(r'([^\(]+)', raw_code)
    return match.group(1) if match else None


# The positional block arithmetic below needs a clean 0..n-1 index
df = df.reset_index(drop=True)

# Rows per company = number of variables still present. Deriving it from the
# data (instead of hard-coding 33) keeps the blocks aligned no matter how many
# variables the previous step removed.
# Assumes every company still has one row for each remaining variable.
rows_per_company = max(df['var_name'].nunique(), 1)

# Code taken from the first row of each company block
code_position = df.columns.get_loc('Code')
block_codes = [
    _company_code(df.iloc[start, code_position])
    for start in range(0, len(df), rows_per_company)
]

# Spread each block's code over all rows of that block
df['company_code'] = [block_codes[pos // rows_per_company] for pos in range(len(df))]

# Move the 'company_code' column to the second position
columns = [col for col in df.columns if col != 'company_code']
columns.insert(1, 'company_code')
df = df[columns]

# View the result
df

Unnamed: 0,company_name,company_code,var_name,Name,Code,2014-12-05 00:00:00,2014-12-12 00:00:00,2014-12-19 00:00:00,2014-12-26 00:00:00,2015-01-02 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,US69608A1088,close_price,PALANTIR TECHNOLOGIES A,US69608A1088(P),,,,,,...,36.84,40.01,43.51,42.97,44.860,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A,US69608A1088,ask_price,PALANTIR TECHNOLOGIES A - ASK PRICE,US69608A1088(PA),,,,,,...,36.86,40.01,43.50,42.96,44.860,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A,US69608A1088,bid_price,PALANTIR TECHNOLOGIES A - BID PRICE,US69608A1088(PB),,,,,,...,36.85,40.00,43.49,42.95,44.850,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A,US69608A1088,price_high,PALANTIR TECHNOLOGIES A - PRICE HIGH,US69608A1088(PH),,,,,,...,37.47,40.29,44.38,42.99,45.070,42.57,58.48,66.00,64.44,67.16
4,PALANTIR TECHNOLOGIES A,US69608A1088,price_low,PALANTIR TECHNOLOGIES A - PRICE LOW,US69608A1088(PL),,,,,,...,36.59,39.40,42.62,41.65,43.645,41.59,55.30,60.91,61.37,65.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,MASTERCARD,US57636Q1040,ltdce,"MASTERCARD, INC. - LONG TERM DEBT % COMMON EQUITY",US57636Q1040(WC08226),22.090,22.090,22.090,22.090,54.610,...,,,,,,,,,,
6596,MASTERCARD,US57636Q1040,interst_cover1,"MASTERCARD, INC. - EBIT/TOT INT EXPENSE RATIO",US57636Q1040(WC08291),106.813,106.813,106.813,106.813,82.279,...,,,,,,,,,,
6597,MASTERCARD,US57636Q1040,interst_cover2,"MASTERCARD, INC. - INTEREST COVER",US57636Q1040(ICBT),106.810,106.810,106.810,106.810,82.280,...,24.72,24.72,24.72,24.72,24.720,24.72,24.72,24.72,24.72,24.72
6598,MASTERCARD,US57636Q1040,cash_dividend,"MASTERCARD, INC. - CASH DIVIDENDS PAID - TOTAL",US57636Q1040(WC04551),515000.000,515000.000,515000.000,515000.000,727000.000,...,,,,,,,,,,


In [10]:
# Remove the raw 'Name' and 'Code' columns
df = df.drop(['Name', 'Code'], axis=1)

# View the result
df

Unnamed: 0,company_name,company_code,var_name,2014-12-05 00:00:00,2014-12-12 00:00:00,2014-12-19 00:00:00,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,2015-01-16 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,US69608A1088,close_price,,,,,,,,...,36.84,40.01,43.51,42.97,44.860,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A,US69608A1088,ask_price,,,,,,,,...,36.86,40.01,43.50,42.96,44.860,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A,US69608A1088,bid_price,,,,,,,,...,36.85,40.00,43.49,42.95,44.850,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A,US69608A1088,price_high,,,,,,,,...,37.47,40.29,44.38,42.99,45.070,42.57,58.48,66.00,64.44,67.16
4,PALANTIR TECHNOLOGIES A,US69608A1088,price_low,,,,,,,,...,36.59,39.40,42.62,41.65,43.645,41.59,55.30,60.91,61.37,65.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,MASTERCARD,US57636Q1040,ltdce,22.090,22.090,22.090,22.090,54.610,54.610,54.610,...,,,,,,,,,,
6596,MASTERCARD,US57636Q1040,interst_cover1,106.813,106.813,106.813,106.813,82.279,82.279,82.279,...,,,,,,,,,,
6597,MASTERCARD,US57636Q1040,interst_cover2,106.810,106.810,106.810,106.810,82.280,82.280,82.280,...,24.72,24.72,24.72,24.72,24.720,24.72,24.72,24.72,24.72,24.72
6598,MASTERCARD,US57636Q1040,cash_dividend,515000.000,515000.000,515000.000,515000.000,727000.000,727000.000,727000.000,...,,,,,,,,,,


In [11]:
# Independent copy of the cleaned table, used as the input for reshaping
df_short = df.copy(deep=True)

# Number of distinct company names
df_short['company_name'].nunique(dropna=True)

200

In [12]:
# Number of distinct company codes
df_short['company_code'].nunique(dropna=True)

200

In [14]:
# Step 6: reshape from one column per date to one row per date

# Identifier columns that stay as they are; every column after them is a date
id_columns = ['company_name', 'company_code', 'var_name']
date_columns = df_short.columns[3:]

# Unpivot: each (row, date column) pair becomes one row with 'date' and 'value'
df_tran = pd.melt(
    df_short,
    id_vars=id_columns,
    value_vars=date_columns,
    var_name='date',
    value_name='value',
)

df_tran

Unnamed: 0,company_name,company_code,var_name,date,value
0,PALANTIR TECHNOLOGIES A,US69608A1088,close_price,2014-12-05,
1,PALANTIR TECHNOLOGIES A,US69608A1088,ask_price,2014-12-05,
2,PALANTIR TECHNOLOGIES A,US69608A1088,bid_price,2014-12-05,
3,PALANTIR TECHNOLOGIES A,US69608A1088,price_high,2014-12-05,
4,PALANTIR TECHNOLOGIES A,US69608A1088,price_low,2014-12-05,
...,...,...,...,...,...
3445195,MASTERCARD,US57636Q1040,ltdce,2024-11-29,
3445196,MASTERCARD,US57636Q1040,interst_cover1,2024-11-29,
3445197,MASTERCARD,US57636Q1040,interst_cover2,2024-11-29,24.72
3445198,MASTERCARD,US57636Q1040,cash_dividend,2024-11-29,


In [15]:
# Make sure the 'date' column holds datetimes
df_tran['date'] = pd.to_datetime(df_tran['date'])

# Spread the variables into columns: one row per (company, date) and one
# column per entry of 'var_name'. pivot_table is called with its default
# aggregation.
df_long = pd.pivot_table(
    df_tran,
    index=['company_name', 'company_code', 'date'],
    columns='var_name',
    values='value',
)
df_long = df_long.reset_index()

# Drop the leftover 'var_name' label on the column axis
df_long.columns.name = None

# Final panel: company_name, company_code, date, then one column per variable
df_long

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,roa,roe,roic,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap
0,3M,US88579Y1010,2014-12-05,135.6365,1.05,135.6114,2216000.0,135.6699,1.96,2.11,...,15.80,32.38,22.79,635135.0,52.61,5622.58,1782344.0,,14274.8,135.7130
1,3M,US88579Y1010,2014-12-12,131.4896,1.05,131.4812,2216000.0,131.3642,1.96,2.18,...,15.80,32.38,22.79,635135.0,52.61,5444.14,1782344.0,,14913.8,132.0684
2,3M,US88579Y1010,2014-12-19,138.3705,1.05,138.3621,2216000.0,138.3537,1.96,2.48,...,15.80,32.38,22.79,635135.0,52.61,5733.81,1782344.0,1006228.0,21761.2,138.1173
3,3M,US88579Y1010,2014-12-26,139.0477,1.05,139.0226,2216000.0,139.0059,1.96,2.47,...,15.80,32.38,22.79,635135.0,52.61,5760.84,1782344.0,394893.9,8217.5,139.2474
4,3M,US88579Y1010,2015-01-02,137.1414,0.94,137.1163,2561000.0,137.1665,1.54,2.50,...,15.77,38.95,23.05,609330.0,92.61,5684.61,1821209.0,349564.0,7354.4,136.9276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103205,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-01,384.6499,,384.4900,,384.6399,,0.00,...,,,,,,9484.27,,1593302.0,4194.4,384.1338
103206,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-08,399.3000,,399.0500,,399.3101,,0.00,...,,,,,,9846.00,,798172.1,2052.8,399.8765
103207,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-15,385.8899,,385.7100,,385.9099,,0.00,...,,,,,,9515.59,,688635.1,1732.4,386.1726
103208,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-22,397.1899,,396.9900,,397.1899,,0.00,...,,,,,,9793.72,,710837.3,1836.0,396.1980


In [16]:
# Count the observations available for one company

# Company to inspect (change as needed)
company_name = 'PALANTIR TECHNOLOGIES A'

# Rows of df_long belonging to that company
is_selected_company = df_long['company_name'] == company_name
company_data = df_long[is_selected_company]

# Number of rows = number of observations
company_observation_count = len(company_data)

print(f"Number of observations for {company_name}: {company_observation_count}")

Number of observations for PALANTIR TECHNOLOGIES A: 361


In [17]:
# Number of observations per company, smallest first
obs_per_company = df_long.groupby('company_name').size()
obs_per_company.sort_values()

company_name
GE VERNOVA                 205
VERALTO                    257
PALANTIR TECHNOLOGIES A    361
CONSTELLATION ENERGY       361
CARRIER GLOBAL             413
                          ... 
F5                         522
FAIR ISAAC                 522
FASTENAL                   522
JOHNSON CONTROLS INTL.     522
ZEBRA TECHNOLOGIES 'A'     522
Length: 200, dtype: int64

In [18]:
# Total number of observations across all companies
company_sizes = df_long.groupby('company_name').size()
company_sizes.sum()

103210