# Task 1-1. Add StartDate for each quarter

In [22]:
import pandas as pd

In [23]:
# Load the Compustat quarterly headquarters data, parsing both date columns
# ('apdedateq' and 'datadate') as datetimes instead of plain strings.
df = pd.read_csv(
    './Data/Compustat Quarterly_snp1500_headquarters.csv',
    parse_dates=['apdedateq', 'datadate'],
)

In [3]:
# Count the distinct companies in the loaded data (one gvkey per company).
df['gvkey'].nunique()

1280

In [24]:
# Dimensions of the frame as (rows, columns).
df.shape

(35570, 12)

## Check for NaN values

In [4]:
# Number of NaN values in each column
df.isna().sum()

gvkey            0
datadate         0
year             0
tic              0
conm             0
datacqtr        24
apdedateq      292
add1             0
add2         35570
add3         35570
add4         35570
addzip          28
dtype: int64

In [5]:
# Number of companies (distinct 'conm' names) that have at least one row
# with no quarter label ('datacqtr').
missing_quarter = df['datacqtr'].isna()
df.loc[missing_quarter, 'conm'].nunique()

7

In [6]:
# Number of companies (distinct 'conm' names) that have at least one row
# with no quarter end date ('apdedateq').
missing_end_date = df['apdedateq'].isna()
df.loc[missing_end_date, 'conm'].nunique()

66

In [7]:
# Remove, in place, every row that lacks either the quarter label or the
# quarter end date; both are required to derive a start date later on.
required_cols = ['datacqtr', 'apdedateq']
df.dropna(subset=required_cols, inplace=True)

In [8]:
# Show the frame after dropping rows with a missing 'datacqtr' or 'apdedateq'.
df

Unnamed: 0,gvkey,datadate,year,tic,conm,datacqtr,apdedateq,add1,add2,add3,add4,addzip
0,1004,2015-02-28,2014,AIR,AAR CORP,2015Q1,2015-02-28,"One AAR Place, 1100 North Wood Dale Road",,,,60191
1,1004,2015-05-31,2014,AIR,AAR CORP,2015Q2,2015-05-31,"One AAR Place, 1100 North Wood Dale Road",,,,60191
2,1004,2015-08-31,2015,AIR,AAR CORP,2015Q3,2015-08-31,"One AAR Place, 1100 North Wood Dale Road",,,,60191
3,1004,2015-11-30,2015,AIR,AAR CORP,2015Q4,2015-11-30,"One AAR Place, 1100 North Wood Dale Road",,,,60191
4,1004,2016-02-29,2015,AIR,AAR CORP,2016Q1,2016-02-29,"One AAR Place, 1100 North Wood Dale Road",,,,60191
...,...,...,...,...,...,...,...,...,...,...,...,...
35565,328795,2020-12-31,2020,ACA,ARCOSA INC,2020Q4,2020-12-31,"500 North Akard Street, Suite 400",,,,75201
35566,328795,2021-03-31,2021,ACA,ARCOSA INC,2021Q1,2021-03-31,"500 North Akard Street, Suite 400",,,,75201
35567,328795,2021-06-30,2021,ACA,ARCOSA INC,2021Q2,2021-06-30,"500 North Akard Street, Suite 400",,,,75201
35568,328795,2021-09-30,2021,ACA,ARCOSA INC,2021Q3,2021-09-30,"500 North Akard Street, Suite 400",,,,75201


In [9]:
# Derive the first day of each quarter from its end date ('apdedateq').
# Subtracting a flat 90 days is not exact because calendar quarters span
# 89-92 days. Instead, step back 80 days: the last two months of a quarter
# cover at most 62 days and a whole quarter covers at least 89, so for a
# month-end end date the shifted date always lands inside the first month of
# that quarter. Snapping the shifted date to day 1 of its month then yields
# the quarter start.
# This is done with vectorized datetime arithmetic instead of a row-by-row
# `.apply(lambda ...)`; the resulting values are the same.
QUARTER_LOOKBACK_DAYS = 80

shifted = df['apdedateq'] - pd.Timedelta(days=QUARTER_LOOKBACK_DAYS)
# Subtracting (day - 1) days moves every date back to the 1st of its month.
df['startdate'] = shifted - pd.to_timedelta(shifted.dt.day - 1, unit='D')

In [10]:
# Show the frame with the newly added 'startdate' column.
df

Unnamed: 0,gvkey,datadate,year,tic,conm,datacqtr,apdedateq,add1,add2,add3,add4,addzip,startdate
0,1004,2015-02-28,2014,AIR,AAR CORP,2015Q1,2015-02-28,"One AAR Place, 1100 North Wood Dale Road",,,,60191,2014-12-01
1,1004,2015-05-31,2014,AIR,AAR CORP,2015Q2,2015-05-31,"One AAR Place, 1100 North Wood Dale Road",,,,60191,2015-03-01
2,1004,2015-08-31,2015,AIR,AAR CORP,2015Q3,2015-08-31,"One AAR Place, 1100 North Wood Dale Road",,,,60191,2015-06-01
3,1004,2015-11-30,2015,AIR,AAR CORP,2015Q4,2015-11-30,"One AAR Place, 1100 North Wood Dale Road",,,,60191,2015-09-01
4,1004,2016-02-29,2015,AIR,AAR CORP,2016Q1,2016-02-29,"One AAR Place, 1100 North Wood Dale Road",,,,60191,2015-12-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35565,328795,2020-12-31,2020,ACA,ARCOSA INC,2020Q4,2020-12-31,"500 North Akard Street, Suite 400",,,,75201,2020-10-01
35566,328795,2021-03-31,2021,ACA,ARCOSA INC,2021Q1,2021-03-31,"500 North Akard Street, Suite 400",,,,75201,2021-01-01
35567,328795,2021-06-30,2021,ACA,ARCOSA INC,2021Q2,2021-06-30,"500 North Akard Street, Suite 400",,,,75201,2021-04-01
35568,328795,2021-09-30,2021,ACA,ARCOSA INC,2021Q3,2021-09-30,"500 North Akard Street, Suite 400",,,,75201,2021-07-01


In [11]:
# Snapshot the current column order as a plain Python list.
cols = list(df.columns)

In [17]:
# Find the position of 'apdedateq' in the column list
# (list.index raises ValueError if the column is absent).
apdedateq_index = cols.index('apdedateq')

In [18]:
# Build the new column order with 'startdate' placed immediately before
# 'apdedateq'. 'startdate' is removed from wherever it currently sits rather
# than assumed to be the last column, so re-running this cell after the frame
# has already been reordered neither duplicates 'startdate' nor drops the
# real last column.
new_cols = [col for col in cols if col != 'startdate']
# Look up the insertion point on the filtered list: removing 'startdate'
# shifts the position of 'apdedateq' whenever 'startdate' already precedes it.
new_cols.insert(new_cols.index('apdedateq'), 'startdate')

In [19]:
# Apply the new column order by selecting all rows and the columns in
# the sequence given by new_cols.
df = df.loc[:, new_cols]

In [20]:
# Show the frame after the columns have been reordered with new_cols.
df

Unnamed: 0,gvkey,datadate,year,tic,conm,datacqtr,startdate,apdedateq,add1,add2,add3,add4,addzip
0,1004,2015-02-28,2014,AIR,AAR CORP,2015Q1,2014-12-01,2015-02-28,"One AAR Place, 1100 North Wood Dale Road",,,,60191
1,1004,2015-05-31,2014,AIR,AAR CORP,2015Q2,2015-03-01,2015-05-31,"One AAR Place, 1100 North Wood Dale Road",,,,60191
2,1004,2015-08-31,2015,AIR,AAR CORP,2015Q3,2015-06-01,2015-08-31,"One AAR Place, 1100 North Wood Dale Road",,,,60191
3,1004,2015-11-30,2015,AIR,AAR CORP,2015Q4,2015-09-01,2015-11-30,"One AAR Place, 1100 North Wood Dale Road",,,,60191
4,1004,2016-02-29,2015,AIR,AAR CORP,2016Q1,2015-12-01,2016-02-29,"One AAR Place, 1100 North Wood Dale Road",,,,60191
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35565,328795,2020-12-31,2020,ACA,ARCOSA INC,2020Q4,2020-10-01,2020-12-31,"500 North Akard Street, Suite 400",,,,75201
35566,328795,2021-03-31,2021,ACA,ARCOSA INC,2021Q1,2021-01-01,2021-03-31,"500 North Akard Street, Suite 400",,,,75201
35567,328795,2021-06-30,2021,ACA,ARCOSA INC,2021Q2,2021-04-01,2021-06-30,"500 North Akard Street, Suite 400",,,,75201
35568,328795,2021-09-30,2021,ACA,ARCOSA INC,2021Q3,2021-07-01,2021-09-30,"500 North Akard Street, Suite 400",,,,75201


In [21]:
# Write the final frame to disk; the row index is not written.
output_path = './Output/Task1-1.csv'
df.to_csv(output_path, index=False)