In [14]:
import pandas as pd
import os
from tqdm import tqdm
from datetime import datetime as dt

date_index_reference = 'issue_sheet_index.csv'
# chunksize = 1000
date_format = '%d-%b-%y'
custom_date_parser = lambda x: dt.strptime(x, date_format)

if not os.path.isfile(date_index_reference):
    print('date index needs to be instantiated, proceeding to do that')
    #Initialize the index df
    index_df = pd.DataFrame([], columns=['date_issue', 'sheet_path', 'sheet_num'])

    formatted_csv_path = '../00_The_Economist_Scraper/formatted_csvs/'
    formatted_csv_stub = "formatted_sub_article_text"
    suffix = '.csv'

    # GOING THROUGH EACH FORMATTED CSV
    for sub_csv_index in tqdm(range(0,3)):
        read_path = formatted_csv_path + formatted_csv_stub + str(sub_csv_index) + suffix
        temp_df =  pd.read_csv(read_path, parse_dates=['date'], date_parser=custom_date_parser)

        # Identifying unique dates
        temp_df = temp_df.drop_duplicates(subset=['date'], keep='first')
        temp_df = temp_df['date']

        temp_index = pd.DataFrame([], columns=['date_issue', 'sheet_path', 'sheet_num'])

        # Pulling out the info of interest 
        for issue_row in tqdm(temp_df.index.tolist()):
            # print(temp_df.head(5))
            row_obs = temp_df.loc[issue_row]
            print(row_obs)
            temp_index.loc[issue_row, 'date_issue'] = row_obs.date
            temp_index.loc[issue_row, 'sheet_path'] = formatted_csv_path + formatted_csv_stub + str(sub_csv_index) + suffix
            temp_index.loc[issue_row, 'sheet_num'] = sub_csv_index
        ### REMEMBER THAT A DATE MIGHT APPEAR IN/CROSS OVER MORE THAN ONE SHEET
        # Need to concat and reset indices because first instances of different issues may appear on the same row (i.e. "index" of temp_df) of different sheets. Need to do this to avoid conflicts
        index_df = pd.concat([index_df, temp_index], ignore_index=True)
        index_df.reset_index(drop=True, inplace=True)

    index_df.to_csv(date_index_reference, index=False)
        
else:
    print('Issues are already indexed for their dates, carry on')
    date_index_df = pd.read_csv(date_index_reference)

print(index_df.head(5))


date index needs to be instantiated, proceeding to do that


100%|██████████| 16/16 [00:00<00:00, 1782.72it/s]


1992-01-04 00:00:00
1992-01-11 00:00:00
1992-02-15 00:00:00
1992-02-22 00:00:00
1992-02-29 00:00:00
1992-03-07 00:00:00
1992-03-14 00:00:00
1992-03-21 00:00:00
1992-03-28 00:00:00
1992-04-04 00:00:00
1992-04-11 00:00:00
1992-04-18 00:00:00
1992-04-25 00:00:00
1992-05-02 00:00:00
1992-05-09 00:00:00
1992-05-16 00:00:00


100%|██████████| 14/14 [00:00<00:00, 1559.59it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 19.47it/s]

1992-05-16 00:00:00
1992-05-23 00:00:00
1992-05-30 00:00:00
1992-06-06 00:00:00
1992-06-20 00:00:00
1992-07-04 00:00:00
1992-07-11 00:00:00
1992-07-18 00:00:00
1992-07-25 00:00:00
1992-08-01 00:00:00
1992-08-08 00:00:00
1992-08-22 00:00:00
1992-08-29 00:00:00
1992-09-05 00:00:00




1992-09-05 00:00:00
1992-09-12 00:00:00

100%|██████████| 14/14 [00:00<00:00, 2339.55it/s]
100%|██████████| 3/3 [00:00<00:00, 16.44it/s]


1992-09-19 00:00:00
1992-09-26 00:00:00
1992-10-03 00:00:00
1992-10-10 00:00:00
1992-10-24 00:00:00
1992-10-31 00:00:00
1992-11-07 00:00:00
1992-11-14 00:00:00
1992-11-21 00:00:00
1992-11-28 00:00:00
1992-12-12 00:00:00
1992-12-19 00:00:00
                                          date_issue  \
0  <built-in method date of Timestamp object at 0...   
1  <built-in method date of Timestamp object at 0...   
2  <built-in method date of Timestamp object at 0...   
3  <built-in method date of Timestamp object at 0...   
4  <built-in method date of Timestamp object at 0...   

                                          sheet_path sheet_num  
0  ../00_The_Economist_Scraper/formatted_csvs/../...         0  
1  ../00_The_Economist_Scraper/formatted_csvs/../...         0  
2  ../00_The_Economist_Scraper/formatted_csvs/../...         0  
3  ../00_The_Economist_Scraper/formatted_csvs/../...         0  
4  ../00_The_Economist_Scraper/formatted_csvs/../...         0  



