# 1.2 Cleaning and Collating of Data
- Collating the comments, which are categorized by post ID
- Cleaning and standardizing the collated comments
- Removing duplicate comments

In [1]:
# Fixed settings: root directory that holds one data sub-folder per page
ROOT_DATA_FOLDER = './Data/'

# Adjustable settings: pages whose comment files get collated
page_list = [
    'DollarsAndSense', 'MortgageConsultancy',
    'Seedly', 'WokeManSalary',
]

In [2]:
import os
import csv
import traceback
import datetime

In [3]:
def read_comments_csv(filepath, encoding='utf-8'):
    """Read one comments CSV file and return its de-duplicated rows.

    The first row is treated as the header and must contain the columns
    'User', 'Comment' and 'Comment_ID'. When the file sits directly inside a
    folder named 'Comments_Tagging', the 'User_Tagging' column is read as well
    and appended as a fourth value; a tagging value containing 'https' is
    replaced with an empty string.

    Args:
        filepath: Path of the CSV file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform locale.

    Returns:
        A list of rows in first-seen order, each a list
        [user, comment, comment_id] (plus user_tagging for tagging files).
        Exact duplicate rows and blank lines are dropped. An empty file
        yields an empty list.

    Raises:
        ValueError: If a required column is missing from the header row.
    """
    # Decide once, from the parent folder name, whether tagging data is
    # expected. os.path also copes with paths that have no directory part.
    has_tagging = os.path.basename(os.path.dirname(filepath)) == 'Comments_Tagging'

    comments_data = []
    seen_rows = set()  # tuples of rows already kept, for O(1) duplicate checks

    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded in quoted comments are preserved exactly as stored.
    with open(filepath, mode='r', newline='', encoding=encoding) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        header = next(csv_reader, None)
        if header is None:
            return comments_data

        # Resolve column positions from the header so column order is free.
        user_index = header.index('User')
        comment_index = header.index('Comment')
        comment_id_index = header.index('Comment_ID')
        user_tagging_index = header.index('User_Tagging') if has_tagging else None

        for row in csv_reader:
            if not row:
                # The csv module yields [] for a blank line; nothing to keep.
                continue

            content = [row[user_index], row[comment_index], row[comment_id_index]]

            if has_tagging:
                user_tagging = row[user_tagging_index]
                content.append(user_tagging if 'https' not in user_tagging else '')

            row_key = tuple(content)
            if row_key not in seen_rows:
                seen_rows.add(row_key)
                comments_data.append(content)

    return comments_data

def write_comments_csv(filepath, collated_comments_data, encoding='utf-8'):
    """Write collated comments for all posts into a single CSV file.

    Each output row is Post_ID, User, Comment, Comment_ID. When the string
    'tagging' occurs anywhere in ``filepath``, a fifth column
    'User_Comment_Tagging' is written from the fourth value of each comment.

    Args:
        filepath: Destination path; an existing file is overwritten.
        collated_comments_data: Mapping of post id to a list of comment rows,
            each row indexable as [user, comment, comment_id] (plus a tagging
            value at index 3 when tagging output is requested).
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            file does not depend on the platform locale.

    Raises:
        IndexError: If tagging output is requested but a comment row has
            fewer than four values.
    """
    print('Writing ' + filepath.split('/')[-1] + '...')
    print('')

    # Evaluate the tagging switch once instead of once per row.
    include_tagging = 'tagging' in filepath

    header = ['Post_ID', 'User', 'Comment', 'Comment_ID']
    if include_tagging:
        header.append('User_Comment_Tagging')

    # newline='' stops the text layer from translating the csv module's own
    # line terminator, which would otherwise produce blank rows on Windows.
    with open(filepath, mode='w', newline='', encoding=encoding) as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        # Write Header
        csv_writer.writerow(header)

        # Write Content
        for post_id, comments in collated_comments_data.items():
            for comment in comments:
                content = [post_id, comment[0], comment[1], comment[2]]
                if include_tagging:
                    content.append(comment[3])
                csv_writer.writerow(content)

In [None]:
# For every configured page: read each per-post comments file from the page's
# 'Comments_Tagging' folder, group the rows by post id, and write them out as
# one collated file next to that folder.
for page in page_list:
    collated_comments_data = {}
    comments_folder = ROOT_DATA_FOLDER + page + '/Comments_Tagging/'

    # os.listdir returns entries in arbitrary order, so sort them to make the
    # collated output reproducible; hidden entries such as '.DS_Store' are
    # not comment files and are dropped.
    comment_folder_files = sorted(
        entry for entry in os.listdir(comments_folder) if not entry.startswith('.')
    )

    print('Reading ' + page + ' folder...')

    # Iterate through each post's comments file
    for comment_filename in comment_folder_files:
        # The post id is the third '_'-separated part of the filename, with
        # the extension removed.
        name_parts = comment_filename.split('_')
        if len(name_parts) < 3:
            print('Skipping unexpected file:', comment_filename)
            continue
        post_id = name_parts[2].split('.')[0]

        # Read and collate comments file
        collated_comments_data[post_id] = read_comments_csv(comments_folder + comment_filename)

    total_comments = sum(len(comments) for comments in collated_comments_data.values())
    print('Compiled', len(collated_comments_data), 'post(s) with a total of', total_comments, 'comment(s)')

    # Write collated comments into one large file
    write_comments_csv(ROOT_DATA_FOLDER + page + '/' + page + '_collated_comments_tagging.txt', collated_comments_data)