# Housekeeping


In [4]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import sqlite3

from jsonschema.benchmarks.const_vs_enum import value
from pandas.tseries.offsets import *
from datetime import timedelta
from datetime import datetime
from dateutil.relativedelta import relativedelta
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Run-date stamp (YYYY-MM-DD); it is embedded in every output file name below.
today = dt.date.today().isoformat()
today

'2025-12-18'

# Pull data from WRDS - as of December 18, 2025 (datadate >= January 1, 2000)

In [1]:
# Open the WRDS connection that every raw_sql pull in this notebook goes through.
# NOTE(review): credentials presumably come from the .pgpass file or an
# interactive prompt - confirm before running unattended.
import wrds
db = wrds.Connection()

WRDS recommends setting up a .pgpass file.
pgpass file created at C:\Users\jessf\AppData\Roaming\postgresql\pgpass.conf
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


**Business Segments**

In [5]:
# Pull compseg.seg_annfund rows dated 2000-01-01 or later that carry both a
# company key (gvkey) and a segment id (sid).
seg_annfund_sql = """
                    SELECT gvkey,stype,sid,datadate,srcdate,sales,revts
                    FROM compseg.seg_annfund
                    WHERE datadate >= '01/01/2000'
                    AND gvkey IS NOT NULL
                    AND sid IS NOT NULL
                    """
seg1 = db.raw_sql(seg_annfund_sql, date_cols=['datadate','srcdate'])

# Calendar year of the data date.
seg1 = seg1.assign(year=seg1['datadate'].dt.year)
len(seg1)

2059258

In [8]:
# Snapshot the raw pull under a file name stamped with the run date.
seg_annfund_raw_path = f"../Data/raw/compustat_bus_seg_raw_{today}.pkl"
seg1.to_pickle(seg_annfund_raw_path)

**Geographic Segments**

In [9]:
# Pull compseg.seg_geo rows dated 2000-01-01 or later that carry both a
# company key (gvkey) and a segment id (sid).
seg_geo_sql = """
                    SELECT gvkey,stype,sid,datadate,gareat,gareag
                    FROM compseg.seg_geo
                    WHERE datadate >= '01/01/2000'
                    AND gvkey IS NOT NULL
                    AND sid IS NOT NULL
                    """
seggeo1 = db.raw_sql(seg_geo_sql, date_cols=['datadate'])

# Calendar year of the data date.
seggeo1 = seggeo1.assign(year=seggeo1['datadate'].dt.year)

len(seggeo1)

432354

In [10]:
# Snapshot the raw geographic pull under a file name stamped with the run date.
seg_geo_raw_path = f"../Data/raw/compustat_geo_seg_raw_{today}.pkl"
seggeo1.to_pickle(seg_geo_raw_path)

**Merged Segments**

In [11]:
# Pull compseg.wrds_segmerged rows dated 2000-01-01 or later that carry both a
# company key (gvkey) and a segment id (sid).
segmerged_sql = """
                    SELECT gvkey,stype,sid,datadate,srcdate,sales,revts,geotp,snms,soptp1,soptp2,curcds,isosrc
                    FROM compseg.wrds_segmerged
                    WHERE datadate >= '01/01/2000'
                    AND gvkey IS NOT NULL
                    AND sid IS NOT NULL
                    """
segmerge1 = db.raw_sql(segmerged_sql, date_cols=['datadate','srcdate'])

# Calendar year of the data date.
segmerge1 = segmerge1.assign(year=segmerge1['datadate'].dt.year)

len(segmerge1)

2059258

In [12]:
# Snapshot the raw merged pull under a file name stamped with the run date.
segmerged_raw_path = f"../Data/raw/compustat_merged_seg_raw_{today}.pkl"
segmerge1.to_pickle(segmerged_raw_path)

# Create segments variables

**Step 1A: Standard Compustat segments file**

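*The database repeats each segment record for every source (reporting) year. Keep only the rows where the data date equals the source date (`datadate == srcdate`) so that the historical, as-originally-reported information is retained.*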

In [13]:
# Keep only the rows whose data date coincides with the source date.
same_date_mask = segmerge1['datadate'].eq(segmerge1['srcdate'])
segmerge2 = segmerge1[same_date_mask]
len(segmerge2)

770858

*Verify no duplicate by segment ID - detailed location information*

In [14]:
# One row per (company, segment id, data date, segment type); first occurrence wins.
segment_row_key = ['gvkey','sid','datadate','stype']
segmerge3 = segmerge2.drop_duplicates(subset=segment_row_key)
len(segmerge3)

770858

*Get count of geographic segments*

In [16]:
# Restrict to rows whose segment type is 'GEOSEG'.
geoseg_mask = segmerge3['stype'].eq('GEOSEG')
segmerge3_geo = segmerge3[geoseg_mask]
len(segmerge3_geo)

359449

In [17]:
# Sort by company and data date. Rebind the name instead of sorting in place:
# segmerge3_geo is a boolean-filtered slice of segmerge3, and an in-place sort on
# it raises SettingWithCopyWarning on pandas < 3 (hidden by the global warnings filter).
segmerge3_geo = segmerge3_geo.sort_values(by=['gvkey','datadate'])

# Number of GEOSEG rows per company / data date.
num_gsegs1 = (
    segmerge3_geo
    .groupby(['gvkey','datadate'])
    .size()
    .reset_index(name='num_gsegs1')
)

**Step 1B: Using the detailed geographic segments file because of a known issue documented in WRDS**

*Ensure no dups by segment ID - detailed location information*

In [19]:
# One row per (company, segment id, data date, segment type, gareag); first occurrence wins.
geo_detail_key = ['gvkey','sid','datadate','stype','gareag']
seggeo2 = seggeo1.drop_duplicates(subset=geo_detail_key)
len(seggeo2)

432354

*Get count of geographic segments*

In [20]:
# Restrict the detailed file to rows whose segment type is 'GEOSEG'.
geoseg_detail_mask = seggeo2['stype'].eq('GEOSEG')
seggeo3 = seggeo2[geoseg_detail_mask]
len(seggeo3)

385759

In [21]:
# Sort by company and data date. Rebind the name instead of sorting in place:
# seggeo3 is a boolean-filtered slice of seggeo2, and an in-place sort on it
# raises SettingWithCopyWarning on pandas < 3 (hidden by the global warnings filter).
seggeo3 = seggeo3.sort_values(by=['gvkey','datadate'])

# Number of detailed GEOSEG rows per company / data date.
num_gsegs2 = (
    seggeo3
    .groupby(['gvkey','datadate'])
    .size()
    .reset_index(name='num_gsegs2')
)

**Step 2: Calc total foreign sales - need simplified geo code**

*Note: The detailed geographic information would duplicate sales, so keep only one row per segment ID (`sid`).*

In [22]:
# Collapse the detailed file to one row per (company, data date, segment id);
# the first row in the current order is the one kept.
segment_id_key = ['gvkey','datadate','sid']
seggeo4 = seggeo3.drop_duplicates(subset=segment_id_key)
len(seggeo4)

352364

In [23]:
# Same collapse for the merged file: one row per (company, data date, segment id).
merged_segment_id_key = ['gvkey','datadate','sid']
segmerge4_geo = segmerge3_geo.drop_duplicates(subset=merged_segment_id_key)
len(segmerge4_geo)

359449

In [24]:
# Attach gareag from the detailed geographic file to each merged GEOSEG row.
# Left join: every merged row is kept, and gareag is missing where no detailed row matches.
foreign1 = segmerge4_geo.merge(
    seggeo4[['gvkey','datadate','sid','gareag']],
    how='left',
    on=['gvkey','datadate','sid'],
)
len(foreign1)

359449

In [25]:
# Treat missing segment sales as zero so they do not affect the sums below.
foreign1 = foreign1.assign(sales=foreign1['sales'].fillna(0))

In [26]:
# Keep every row whose gareag is anything other than 'USA'.
# NOTE(review): rows with a missing gareag (no match in the detailed file) also
# pass this filter and are therefore counted as foreign - confirm this is intended.
non_usa_mask = foreign1['gareag'].ne('USA')
foreign2 = foreign1[non_usa_mask]
len(foreign2)

240800

In [27]:
# Total sales of the non-'USA' rows per company / data date.
foreign3 = (
    foreign2
    .groupby(['gvkey','datadate'], as_index=False)['sales']
    .sum()
    .rename(columns={'sales': 'foreign_sales'})
)

**Step 3: Create a data file for business/operating segments**

In [28]:
# Everything that is not a 'GEOSEG' row is treated as a business/operating segment.
non_geoseg_mask = segmerge3['stype'].ne('GEOSEG')
segmerge3_bus = segmerge3[non_geoseg_mask]
len(segmerge3_bus)

411409

In [29]:
# Sort by company and data date. Rebind the name instead of sorting in place:
# segmerge3_bus is a boolean-filtered slice of segmerge3, and an in-place sort on
# it raises SettingWithCopyWarning on pandas < 3 (hidden by the global warnings filter).
segmerge3_bus = segmerge3_bus.sort_values(by=['gvkey','datadate'])

# Number of non-GEOSEG rows per company / data date.
num_bsegs = (
    segmerge3_bus
    .groupby(['gvkey','datadate'])
    .size()
    .reset_index(name='num_bsegs')
)

**Step 4: Merge all together**

In [30]:
# Spine of the output: one row per company / data date present in the merged pull.
firm_date_key = ['gvkey','datadate']
segments = segmerge1.drop_duplicates(subset=firm_date_key)
len(segments)

186899

In [31]:
# Start from the company / data-date spine and attach the merged-file geographic segment count.
segments2 = segments[['gvkey','datadate']].merge(
    num_gsegs1[['gvkey','datadate','num_gsegs1']],
    how='left',
    on=['gvkey','datadate'],
)

In [32]:
# Attach the detailed-file geographic segment count.
segments3 = segments2.merge(
    num_gsegs2[['gvkey','datadate','num_gsegs2']],
    how='left',
    on=['gvkey','datadate'],
)

In [33]:
# Attach the business/operating segment count.
segments4 = segments3.merge(
    num_bsegs[['gvkey','datadate','num_bsegs']],
    how='left',
    on=['gvkey','datadate'],
)

In [34]:
# Attach total foreign sales.
segments5 = segments4.merge(
    foreign3[['gvkey','datadate','foreign_sales']],
    how='left',
    on=['gvkey','datadate'],
)

In [35]:
# Work on an independent deep copy so the fills below leave segments5 untouched.
segments6 = segments5.copy(deep=True)

In [36]:
# A company / date with no counted rows of a given kind is assigned a count of 1.
segment_count_cols = ['num_gsegs1','num_gsegs2','num_bsegs']
segments6 = segments6.fillna(dict.fromkeys(segment_count_cols, 1))

In [37]:
# Missing foreign sales become 0; the indicator is 1 only when foreign sales are strictly positive.
segments6 = segments6.assign(foreign_sales=segments6['foreign_sales'].fillna(0))
segments6 = segments6.assign(
    has_foreign_sales=segments6['foreign_sales'].gt(0).astype(int)
)

In [39]:
# Preview the last five rows of the final table.
segments6.tail(5)

Unnamed: 0,gvkey,datadate,num_gsegs1,num_gsegs2,num_bsegs,foreign_sales,has_foreign_sales
186894,356128,2024-12-31,3.0,3.0,4.0,4826.644,1
186895,356859,2023-12-31,3.0,2.0,1.0,409.15,1
186896,356859,2024-12-31,3.0,2.0,1.0,469.4,1
186897,366911,2023-12-31,2.0,2.0,3.0,2691.0,1
186898,366911,2024-12-31,2.0,2.0,3.0,2678.0,1


In [40]:
# Persist the analysis-ready segment variables under a file name stamped with the run date.
segments_out_path = f"../Data/analysis/compustat_segments_{today}.pkl"
segments6.to_pickle(segments_out_path)