In [23]:
import wrds
import pandas as pd

In [24]:
# Open the WRDS session; `db` is the handle every raw_sql() query in this notebook goes through.
db = wrds.Connection()

WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [25]:
# Monthly market cap for every security in crsp.msf over the selection window.
# The result is only used to rank PERMNOs and choose the top 100 with full coverage.
# NOTE(review): abs(prc) presumably guards against negative CRSP prices — confirm.
monthly_mktcap_sql = """
    SELECT permno, date, abs(prc)*shrout/1000 as mktcap
    FROM crsp.msf
    WHERE date BETWEEN '2004-01-01' AND '2024-01-01'
"""
monthly = db.raw_sql(monthly_mktcap_sql)


In [26]:
# How many monthly rows (non-null dates) each PERMNO has in the window
months_per_permno = monthly.groupby('permno')['date'].count()
coverage = months_per_permno.reset_index(name='n_months')

# Require at least 240 monthly observations (12 months x 20 years)
has_full_history = coverage['n_months'] >= 240
full_data = coverage[has_full_history]

# Average market cap over the window, restricted to the fully covered PERMNOs
in_full_data = monthly['permno'].isin(full_data['permno'])
mean_cap = monthly[in_full_data].groupby('permno')['mktcap'].mean()
avg_mktcap = mean_cap.reset_index(name='avg_mktcap')

# The 100 largest by average market cap form the sample used downstream
ranked = avg_mktcap.sort_values(by='avg_mktcap', ascending=False)
top100 = ranked.head(100)
permnos = top100['permno'].tolist()


In [27]:
# Render the selected PERMNOs as a comma-separated list for the SQL IN (...) clause
permno_str = ','.join(str(p) for p in permnos)

# Daily price and shares outstanding from crsp.dsf, limited to the selected PERMNOs
daily_prices_sql = f"""
    SELECT permno, date, prc, shrout
    FROM crsp.dsf
    WHERE date BETWEEN '2004-01-01' AND '2024-12-31'
      AND permno IN ({permno_str})
"""
prices_df = db.raw_sql(daily_prices_sql)

In [28]:
# Sanity-check the daily pull (columns: permno, date, prc, shrout); prints the truncated frame repr.
print(prices_df)

       permno        date        prc     shrout
0       10104  2004-01-02      13.14  5227587.0
1       10104  2004-01-05      13.55  5227587.0
2       10104  2004-01-06       13.6  5227587.0
3       10104  2004-01-07      13.97  5227587.0
4       10104  2004-01-08      14.24  5227587.0
...       ...         ...        ...        ...
28495   92655  2024-12-24  506.10001   920284.0
28496   92655  2024-12-26  511.14999   920284.0
28497   92655  2024-12-27  509.98999   920284.0
28498   92655  2024-12-30  507.79999   920284.0
28499   92655  2024-12-31  505.85999   920284.0

[528500 rows x 4 columns]


In [29]:
# Quarterly Compustat records for all firms in the window, with the standard
# indfmt/datafmt/popsrc/consol filters applied in SQL.
# NOTE(review): atq / dlcq / dlttq are presumably total assets, debt in current
# liabilities and long-term debt — confirm against the Compustat data dictionary.
fundamentals_sql = """
    SELECT gvkey, datadate, fqtr, fyr, tic, conm,
           atq, dlcq, dlttq
    FROM comp.fundq
    WHERE indfmt = 'INDL'
      AND datafmt = 'STD'
      AND popsrc = 'D'
      AND consol = 'C'
      AND datadate BETWEEN '2004-01-01' AND '2024-12-31'
"""
fundamentals = db.raw_sql(fundamentals_sql)

# Quick look at what came back
print(fundamentals)

         gvkey    datadate  fqtr  fyr     tic                          conm  \
0       001013  2004-01-31     1   10  ADCT.1    ADC TELECOMMUNICATIONS INC   
1       001082  2004-01-31     3    4  SERV.1                 SERVIDYNE INC   
2       001173  2004-01-31     4    1   AIM.1                AEROSONIC CORP   
3       001183  2004-01-31     4    1   IDAI.                      IDNA INC   
4       001240  2004-01-31     4    1   ABS.1               ALBERTSON'S INC   
...        ...         ...   ...  ...     ...                           ...   
466066  351590  2024-12-31     4   12   DTRUY      DAIMLER TRUCK HOLDING AG   
466067  352262  2024-12-31     4   12    CLCO              COOL COMPANY LTD   
466068  354003  2024-12-31     4   12    BEMB   ISHARES JP MORGAN B U E M B   
466069  356128  2024-12-31     4   12    KSPI  JOINT STOCK COMPANY KASPI KZ   
466070  356289  2024-12-31     2    6    SUUN                SOLARBANK CORP   

              atq       dlcq      dlttq  
0        

In [30]:
# Link table mapping Compustat gvkey to CRSP permno, with the date range over
# which each link is valid.
# NOTE(review): the linktype/linkprim filters presumably keep only the primary,
# research-quality links — confirm against the CCM documentation.
ccmlink = db.raw_sql("""
    SELECT gvkey, lpermno as permno, linktype, linkprim,
           linkdt, linkenddt
    FROM crsp.ccmxpf_linktable
    WHERE linktype IN ('LU', 'LC') AND linkprim IN ('P', 'C')
""")

# The link-window comparisons below need real datetime dtypes on both sides
fundamentals['datadate'] = pd.to_datetime(fundamentals['datadate'])
ccmlink['linkdt'] = pd.to_datetime(ccmlink['linkdt'])
ccmlink['linkenddt'] = pd.to_datetime(ccmlink['linkenddt'])

# Inner join: a gvkey without any link would get linkdt = NaT from a left join
# and then fail the `datadate >= linkdt` test anyway, so materialising those
# rows only costs memory.
linked = fundamentals.merge(ccmlink, on='gvkey', how='inner')

# Keep a fundamentals row only if its datadate falls inside the link's validity
# window; a missing linkenddt is treated as an open-ended (still active) link.
within_link_window = (
    (linked['datadate'] >= linked['linkdt'])
    & ((linked['datadate'] <= linked['linkenddt']) | linked['linkenddt'].isna())
)

# .copy() makes the result an independent frame, so later column assignments
# on fundamentals_merged do not raise SettingWithCopyWarning.
fundamentals_merged = linked[within_link_window].copy()

In [31]:
# Normalise key dtypes so both frames can be matched on (permno, date)
prices_df['date'] = pd.to_datetime(prices_df['date'])
prices_df['permno'] = prices_df['permno'].astype(int)
prices_df = prices_df.sort_values(['permno', 'date'])

# assign() builds a new frame, so this never writes into a filtered view of
# another DataFrame (which would trigger SettingWithCopyWarning).
fundamentals_merged = (
    fundamentals_merged
    .assign(
        datadate=pd.to_datetime(fundamentals_merged['datadate']),
        permno=fundamentals_merged['permno'].astype(int),
    )
    .sort_values(['permno', 'datadate'])
)

# PERMNOs with no fundamentals rows at all are left out of the output
has_fundamentals = prices_df['permno'].isin(fundamentals_merged['permno'])

# For every daily price row, attach the most recent fundamentals row of the
# same PERMNO whose datadate is on or before the price date.
# `by='permno'` does the per-security matching in a single call: no Python
# loop, a single `permno` column in the result (instead of the
# `permno_x`/`permno_y` pair produced when both sides carry the column), and
# an empty frame rather than a pd.concat ValueError when nothing matches.
# merge_asof needs each side sorted by its own time key, hence the re-sorts.
# NOTE(review): matching on datadate rather than a report/announcement date
# may attach figures before they were public — confirm this is intended.
merged_df = (
    pd.merge_asof(
        prices_df[has_fundamentals].sort_values('date', kind='stable'),
        fundamentals_merged.sort_values('datadate', kind='stable'),
        left_on='date',
        right_on='datadate',
        by='permno',
        direction='backward',
    )
    # Restore the per-security, chronological row order
    .sort_values(['permno', 'date'], kind='stable')
    .reset_index(drop=True)
)
merged_df.to_csv("raw_data.csv")