In [1]:
import os
import pandas as pd
from urllib import request

In [2]:
def single_read_in_text(file_path,file_name):
    '''
    Download and clean the text of every filing listed in one index csv.

    input:
    file_path: folder that contains the individual csv file
               e.g. location of 21centfoxinc_sec_files.csv
    file_name: name of the file e.g. 21centfoxinc_sec_files.csv
               (the csv must have a 'sec_full_path' column holding one URL per row)

    output:
    a single pandas dataframe with all the original columns from the input file
    + a 'text' column holding the 'cleaned' data (a list of non-empty, stripped lines)
    + a 'file_name' column holding file_name
    The output has exactly one row per row of the input csv, in the same order.

    raises:
    KeyError if the csv has no 'sec_full_path' column; whatever urlopen raises when
    a URL cannot be opened; UnicodeDecodeError when a document is not valid utf8.

    dev:
    1) can add more output columns for features
    2) can further clean the data
    NOTE(review): sec.gov may reject requests that do not declare a User-Agent
    header -- confirm before running this against the live site.
    '''

    ## read in single csv to pandas
    individual_csv = pd.read_csv(os.path.join(file_path, file_name))
    clean_texts = []

    for url in individual_csv['sec_full_path']:
        print(url)
        ## open the url and read in the raw file; the with-block closes the
        ## connection even when reading or decoding fails
        with request.urlopen(url) as response:
            raw_text = response.read().decode('utf8')

        ## clean the raw file by:
        ## 1. break the text by "\n"
        ## 2. remove the spaces in the front of and after each line
        ## 3. drop the lines that are empty after stripping
        stripped_lines = (line.strip() for line in raw_text.split('\n'))
        clean_texts.append([line for line in stripped_lines if line])
        ## more columns, features, data cleanings can be put here

    ## attach the cleaned texts row by row. Merging back on 'sec_full_path' would
    ## multiply the rows whenever the same url is listed more than once, and would
    ## fail outright for a csv without any data rows.
    individual_csv['text'] = pd.Series(clean_texts, index = individual_csv.index, dtype = object)

    ## add one more column to indicate the file name
    individual_csv['file_name'] = file_name
    return individual_csv

def folder_read_in_text(folder_path, ext = '.csv'):
    '''
    Run single_read_in_text on every matching file in a folder and stack the results.

    input:
    folder_path: path of the folder that holds the individual csv files
                 e.g. the folder containing 21centfoxinc_sec_files.csv
    ext: extension of the files that are interested, default to be .csv
         (compared exactly, including the leading dot and the letter case)

    output:
    a single pandas dataframe with all the original columns from all the input files inside the folder
    + a column for the 'cleaned' data
    + a column for the file name
    Rows are grouped per file, files are processed in sorted name order and the
    index is renumbered from 0. An empty dataframe is returned when no file matches.

    Utilize the single_read_in_text function
    '''
    # Select only files with the ext extension; sorted because os.listdir gives
    # no ordering guarantee and the row order should be reproducible
    matching_files = sorted(
        name for name in os.listdir(folder_path)
        if os.path.splitext(name)[1] == ext
    )
    if not matching_files:
        # pd.concat refuses an empty list, so keep returning an empty frame here
        return pd.DataFrame()

    ## Utilize the single_read_in_text function to process data, then combine
    ## everything once (DataFrame.append no longer exists in pandas 2.x and
    ## re-copied the accumulated frame on every iteration)
    frames = [
        single_read_in_text(file_path = folder_path, file_name = name)
        for name in matching_files
    ]
    return pd.concat(frames, ignore_index=True)

In [3]:
# Single-file usage, kept for reference:
#   single_read_in_text('C:/Users/li haoran/Desktop/New folder', '21centfoxinc_sec_files.csv')
# Process every .csv index file in the folder; each URL is printed as it is downloaded.
working_file = folder_read_in_text(
    folder_path='C:/Users/li haoran/Desktop/sec files',
    ext='.csv',
)

https://www.sec.gov/Archives/edgar/data/1054263/0001308161-05-000026.txt
https://www.sec.gov/Archives/edgar/data/1054263/0001308161-05-000027.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-94-000001.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-94-000005.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-94-000006.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-94-000007.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-94-000008.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-94-000013.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-94-000014.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-95-000006.txt
https://www.sec.gov/Archives/edgar/data/732717/0000798263-95-000029.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-95-000011.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-95-000013.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-95-0

https://www.sec.gov/Archives/edgar/data/732717/0000732717-04-000670.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-04-000677.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-04-000680.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-04-000717.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-05-000035.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-05-000038.txt
https://www.sec.gov/Archives/edgar/data/732717/0001193125-05-015466.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-05-000145.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-05-000173.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-05-000176.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-05-000219.txt
https://www.sec.gov/Archives/edgar/data/732717/0000950134-05-008804.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-05-000279.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-05-000

https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000016.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000019.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000042.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000046.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000048.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000050.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000053.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000059.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000067.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000068.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000080.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000086.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000088.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-09-000

https://www.sec.gov/Archives/edgar/data/732717/0000732717-13-000007.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-13-000008.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-13-000010.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-13-000012.txt
https://www.sec.gov/Archives/edgar/data/732717/0001193125-13-051768.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-13-000017.txt
https://www.sec.gov/Archives/edgar/data/732717/0001193125-13-091472.txt
https://www.sec.gov/Archives/edgar/data/732717/0001193125-13-105360.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-13-000021.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-13-000023.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-13-000025.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-13-000028.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-13-000036.txt
https://www.sec.gov/Archives/edgar/data/732717/0001193125-13-222

https://www.sec.gov/Archives/edgar/data/732717/0001193125-16-469003.txt
https://www.sec.gov/Archives/edgar/data/732717/0001193125-16-513518.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-16-000151.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-16-000153.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-16-000156.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-16-000160.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-16-000165.txt
https://www.sec.gov/Archives/edgar/data/732717/0001193125-16-588917.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-16-000171.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-16-000186.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-16-000189.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-16-000190.txt
https://www.sec.gov/Archives/edgar/data/732717/0000732717-16-000195.txt
https://www.sec.gov/Archives/edgar/data/732717/0001193125-16-673

https://www.sec.gov/Archives/edgar/data/813828/0000813828-94-000004.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-95-000004.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-95-000007.txt
https://www.sec.gov/Archives/edgar/data/813828/0000950112-95-001038.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-95-000011.txt
https://www.sec.gov/Archives/edgar/data/813828/0000950112-95-001322.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-95-000014.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-95-000018.txt
https://www.sec.gov/Archives/edgar/data/813828/0000950112-95-002129.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-95-000023.txt
https://www.sec.gov/Archives/edgar/data/813828/0000950112-95-002965.txt
https://www.sec.gov/Archives/edgar/data/813828/0000950112-95-003231.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-96-000003.txt
https://www.sec.gov/Archives/edgar/data/813828/0000950112-96-001

https://www.sec.gov/Archives/edgar/data/813828/0001047469-05-014195.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-05-000112.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-05-000114.txt
https://www.sec.gov/Archives/edgar/data/813828/0000947871-05-001216.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-05-000130.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-05-000134.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-05-000170.txt
https://www.sec.gov/Archives/edgar/data/813828/0001104659-05-036506.txt
https://www.sec.gov/Archives/edgar/data/813828/0001047469-05-020885.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-05-000226.txt
https://www.sec.gov/Archives/edgar/data/813828/0001104659-05-051360.txt
https://www.sec.gov/Archives/edgar/data/813828/0001047469-05-026375.txt
https://www.sec.gov/Archives/edgar/data/813828/0001104659-05-056805.txt
https://www.sec.gov/Archives/edgar/data/813828/0000947871-05-002

https://www.sec.gov/Archives/edgar/data/813828/0000813828-10-000010.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-10-000012.txt
https://www.sec.gov/Archives/edgar/data/813828/0001193125-10-076759.txt
https://www.sec.gov/Archives/edgar/data/813828/0001047469-10-004823.txt
https://www.sec.gov/Archives/edgar/data/813828/0001104659-10-025768.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-10-000017.txt
https://www.sec.gov/Archives/edgar/data/813828/0001047469-10-006913.txt
https://www.sec.gov/Archives/edgar/data/813828/0001104659-10-041467.txt
https://www.sec.gov/Archives/edgar/data/813828/0001193125-10-223601.txt
https://www.sec.gov/Archives/edgar/data/813828/0001193125-10-226820.txt
https://www.sec.gov/Archives/edgar/data/813828/0001047469-10-009233.txt
https://www.sec.gov/Archives/edgar/data/813828/0001104659-10-056062.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-11-000003.txt
https://www.sec.gov/Archives/edgar/data/813828/0001104659-11-007

https://www.sec.gov/Archives/edgar/data/813828/0000813828-17-000035.txt
https://www.sec.gov/Archives/edgar/data/813828/0001193125-17-313921.txt
https://www.sec.gov/Archives/edgar/data/813828/0001193125-17-329440.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-17-000039.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-17-000043.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-17-000045.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-17-000048.txt
https://www.sec.gov/Archives/edgar/data/813828/0001193125-17-338971.txt
https://www.sec.gov/Archives/edgar/data/813828/0001193125-17-341153.txt
https://www.sec.gov/Archives/edgar/data/813828/0001193125-17-343120.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-17-000050.txt
https://www.sec.gov/Archives/edgar/data/813828/0001193125-17-348134.txt
https://www.sec.gov/Archives/edgar/data/813828/0001193125-17-350719.txt
https://www.sec.gov/Archives/edgar/data/813828/0000813828-18-000

https://www.sec.gov/Archives/edgar/data/22301/0000950159-03-000259.txt
https://www.sec.gov/Archives/edgar/data/22301/0000950159-03-000424.txt
https://www.sec.gov/Archives/edgar/data/22301/0000950159-03-000650.txt
https://www.sec.gov/Archives/edgar/data/22301/0000950159-03-000801.txt
https://www.sec.gov/Archives/edgar/data/22301/0000950159-03-000926.txt
https://www.sec.gov/Archives/edgar/data/22301/0000950159-04-000344.txt
https://www.sec.gov/Archives/edgar/data/22301/0000950159-04-000537.txt
https://www.sec.gov/Archives/edgar/data/22301/0000950159-04-000753.txt
https://www.sec.gov/Archives/edgar/data/22301/0001047469-04-034185.txt
https://www.sec.gov/Archives/edgar/data/22301/0001047469-05-007822.txt
https://www.sec.gov/Archives/edgar/data/22301/0001047469-05-014775.txt
https://www.sec.gov/Archives/edgar/data/22301/0001047469-05-021333.txt
https://www.sec.gov/Archives/edgar/data/22301/0001104659-05-043906.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950133-94-000018.txt
https

https://www.sec.gov/Archives/edgar/data/732712/0000950134-01-504570.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-01-505380.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-01-507019.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-01-507762.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-01-508652.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-000116.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-000735.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-001971.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950109-02-001438.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-003491.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-003564.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-004059.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-004294.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-004

https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-105706.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-105708.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-141237.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-143082.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-151818.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-157537.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-162569.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-173989.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-178311.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-198669.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-210206.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-217491.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-225042.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-238

https://www.sec.gov/Archives/edgar/data/732712/0001193125-10-170567.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-10-208126.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-10-209084.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-10-213190.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-10-226827.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-10-234169.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-10-239707.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-10-290314.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-011532.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-013427.txt
https://www.sec.gov/Archives/edgar/data/732712/0000909518-11-000031.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-015961.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-023291.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-049

https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-088913.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-139081.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-151315.txt
https://www.sec.gov/Archives/edgar/data/732712/0000909518-15-000149.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-185255.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-215738.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-257586.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-265779.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-311636.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-321958.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-347574.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-354559.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-367882.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-15-400

In [4]:
# Save the combined frame as tab-separated text. The '.tsv' extension matches the
# tab delimiter and, because the export is written into the input folder, keeps it
# from being picked up as an input by folder_read_in_text(ext='.csv') on the next run.
# Each 'text' cell holds a Python list, so it is written out as that list's string form.
working_file.to_csv('C:/Users/li haoran/Desktop/sec files/merged_all.tsv', sep='\t', encoding='utf-8')