Clean Capital IQ credit rating data and prepare standardized variables for downstream use.


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Project root, plus the raw-input and processed-output folders below it
maindir = '/path/to/project/'
rawdir = f'{maindir}rawdata_jfi_fin/capiq/'
outdir = f'{maindir}data_jfi_fin/'

In [None]:
# Load the issuer-rating sheet; header=2 takes the column labels from the
# third row of the sheet
filepath = f'{rawdir}CapIQ_SP_Rating.xlsx'
df = pd.read_excel(filepath, sheet_name="SP_ISSUER_RATING", header=2)

# Retain only the rows in which all three identifier columns are populated
mask = df[['SP_ENTITY_ID', 'IQ_GVKEY', 'SP_CIQ_ID']].notnull().all(axis=1)
df = df.loc[mask].reset_index(drop=True)

# IQ_GVKEY is split on commas and every resulting key gets a row of its own
df['gvkey'] = df['IQ_GVKEY'].str.split(',')
df_expanded = df.explode('gvkey').drop(columns='IQ_GVKEY')

# Columns whose label contains 'FQ' receive the string-first aggregation
# implemented in keep_character_over_number
character_columns = df_expanded.filter(like='FQ').columns
def keep_character_over_number(group):
    """Collapse the rows of one group into a single row.

    For every column listed in the module-level ``character_columns`` the
    first string value found in the group is kept, or NaN when the column
    holds no string at all. Every other column keeps the value from the
    first row of the group.

    Args:
        group: DataFrame holding the rows that belong to one group.

    Returns:
        A pandas Series indexed by the column labels of ``group``.
    """
    collapsed = {}
    for name in group.columns:
        values = group[name]
        if name in character_columns:
            # Numbers and missing entries are skipped, so a string anywhere
            # in the group takes priority over them
            collapsed[name] = next(
                (value for value in values if isinstance(value, str)), np.nan
            )
        else:
            collapsed[name] = values.iloc[0]
    return pd.Series(collapsed)

# Collapse to one row per gvkey. The groups are iterated explicitly instead of
# going through groupby.apply: iteration always yields the full sub-frame,
# including the 'gvkey' column, whereas apply on the grouping column is
# deprecated in pandas 2.2 and excluded in later releases, which would drop
# 'gvkey' from the result once the index is discarded.
df_unique = (
    pd.DataFrame(
        [keep_character_over_number(group) for _, group in df_expanded.groupby('gvkey')]
    )
    .infer_objects()  # rows arrive as object Series; recover column dtypes
    .reset_index(drop=True)
)

# Reshape to long format: one row per (gvkey, FQ-column suffix), keeping only
# the key, the suffix and the value, with the value column renamed
df_long = (
    pd.wide_to_long(df_unique, stubnames='FQ', i='gvkey', j='fyq_str')
    .reset_index()
    .loc[:, ['gvkey', 'fyq_str', 'FQ']]
    .rename(columns={'FQ': 'SP_ISSUER_RATING'})
)

# Missing ratings are stored as the placeholder 0
df_long['SP_ISSUER_RATING'] = df_long['SP_ISSUER_RATING'].fillna(0)

# Construct fiscal quarter variables from the suffix text: its first character
# becomes fqtr and the remainder fyearq; the first three characters of gvkey
# are cut off, and the suffix column itself is dropped afterwards
df_long = df_long.assign(
    fyearq=lambda d: d['fyq_str'].astype(str).str[1:],
    fqtr=lambda d: d['fyq_str'].astype(str).str[0],
    gvkey=lambda d: d['gvkey'].str[3:],
).drop(columns=['fyq_str'])

In [None]:
# convert ratings to numerical values
# The ordered scale is numbered 1 (AAA) through 22 (D); the labels outside
# that scale are assigned their codes explicitly afterwards.
rating_mapping = {
    grade: rank
    for rank, grade in enumerate(
        [
            "AAA", "AA+", "AA", "AA-", "A+",
            "A", "A-", "BBB+", "BBB", "BBB-",
            "BB+", "BB", "BB-", "B+", "B",
            "B-", "CCC+", "CCC", "CCC-", "CC",
            "C", "D",
        ],
        start=1,
    )
}
rating_mapping.update(
    {"SD": 22, "NR": 99, "Api": 6, "Bpi": 15, "BBBpi": 9, "BBB/NR": 9}
)
# Translate each rating label to its numeric code; labels missing from the
# mapping (including the 0 placeholder) fall back to 99
df_long['rating_numeric'] = df_long['SP_ISSUER_RATING'].map(rating_mapping).fillna(99)

# other indicators
# Computed column-wise instead of building np.arange once per row inside
# apply; between() is inclusive on both ends, matching codes 11-22 and 1-10.
df_long['rating_hy'] = df_long['rating_numeric'].between(11, 22).astype(int)
df_long['rating_ig'] = df_long['rating_numeric'].between(1, 10).astype(int)
df_long['rating_unrated'] = df_long['rating_numeric'].eq(99).astype(int)


In [None]:
# housekeeping

# handle date type: cast the fiscal year and quarter from text to integers
df_long['fyearq'] = df_long['fyearq'].astype(int)
df_long['fqtr'] = df_long['fqtr'].astype(int)

# lower column names
df_long.columns = [x.lower() for x in df_long.columns]

# handle sp issuer rating type: the 0 placeholder becomes the label 'UR' and
# the column is stored as text
df_long.loc[df_long['sp_issuer_rating']==0,'sp_issuer_rating'] = 'UR'
df_long['sp_issuer_rating'] = df_long['sp_issuer_rating'].astype(str)

# handle gvkey type: integer parsing already ignores leading zeros, so they
# are not stripped by hand (stripping turns an all-zero key into '' and makes
# the cast fail)
df_long['gvkey'] = df_long['gvkey'].astype(str).astype(int)

# sort only after the casts, so the order is numeric rather than the
# lexicographic order of the original text values
df_long.sort_values(by=['gvkey','fyearq','fqtr'], inplace=True)

df_long.info()

In [None]:
# Preview the first rows of the cleaned frame
df_long.head()

In [None]:
# plot histogram

# Bar chart of how often each rating label occurs, leaving out the rows
# whose numeric code is 99
df_long.loc[df_long['rating_numeric'].ne(99), 'sp_issuer_rating'].value_counts().plot.bar()

In [None]:
# Write the cleaned frame to a Stata file in the output folder; the DataFrame
# index is not written
filepath = f'{outdir}CapIQ_SP_Rating_processed.dta'
df_long.to_stata(filepath, write_index=False)