# 1.2 Cleaning and Collating of Data
- Collate the per-post comment files of each page into a single file, keyed by post ID
- Clean and standardize the collated comments
- Remove duplicate comments within each post

In [1]:
# --- Fixed settings (not expected to change between runs) ---
# Root directory; each page's data is looked up in a sub-folder named after the page.
ROOT_DATA_FOLDER = './Data/'

# --- Run-specific settings ---
# Pages to process. Each entry is used both as the sub-folder name under
# ROOT_DATA_FOLDER and as the prefix of the collated output file.
page_list = ['DollarsAndSense', 'MortgageConsultancy', 'Seedly', 'WokeManSalary']

In [2]:
import os
import csv
import traceback
import datetime

In [9]:
def read_comments_csv(filepath, encoding=None):
    """Read one comments CSV file and return its de-duplicated rows.

    The first non-blank row is treated as the header and must contain the
    columns 'User', 'Comment', 'Comment_ID', 'User_Tagging' and
    'User_Tagging_Link' (in any order).

    Args:
        filepath: Path of the comma-delimited file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, as before.

    Returns:
        A list of ``[user, comment, comment_id, user_tagging, user_tagging_link]``
        lists in first-seen order. Both tagging fields are replaced by ``''``
        when the row's User_Tagging_Link cell contains ``'https'``.
        An empty file yields an empty list.

    Raises:
        ValueError: If a required column is missing from the header.
        IndexError: If a non-blank data row is shorter than the header requires.
    """
    comments_data = []
    seen_rows = set()  # hashable copies of the rows already kept, for O(1) duplicate checks

    # newline='' is required by the csv module so that line breaks inside
    # quoted fields (multi-line comments) are parsed and preserved correctly.
    with open(filepath, mode='r', newline='', encoding=encoding) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        # The header is the first row that is not blank.
        header = next((row for row in csv_reader if row), None)
        if header is None:
            return comments_data

        # Get header index (list.index raises ValueError for a missing column)
        user_index = header.index('User')
        comment_index = header.index('Comment')
        comment_id_index = header.index('Comment_ID')
        user_tagging_index = header.index('User_Tagging')
        user_tagging_link_index = header.index('User_Tagging_Link')

        for row in csv_reader:
            # Blank lines come back from csv.reader as empty lists; skip them.
            if not row:
                continue

            user_tagging = row[user_tagging_index]
            user_tagging_link = row[user_tagging_link_index]

            # Tagging data is dropped entirely when the link cell contains 'https'.
            # NOTE(review): presumably this discards tags that point at full
            # external URLs rather than user profiles -- confirm with the scraper.
            if 'https' in user_tagging_link:
                user_tagging = ''
                user_tagging_link = ''

            content = [
                row[user_index],
                row[comment_index],
                row[comment_id_index],
                user_tagging,
                user_tagging_link,
            ]

            # Removes possible duplicates
            row_key = tuple(content)
            if row_key not in seen_rows:
                seen_rows.add(row_key)
                comments_data.append(content)

    return comments_data

def write_comments_csv(filepath, collated_comments_data, encoding=None):
    """Write the collated comments of all posts into one CSV file.

    Args:
        filepath: Destination path; an existing file is overwritten.
        collated_comments_data: Mapping of post id -> list of comment rows,
            where each row is indexable and holds, in order, user, comment,
            comment id, user tagging and user tagging link.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, as before.

    Raises:
        IndexError: If a comment row has fewer than five elements.
    """
    print('Writing ' + os.path.basename(filepath) + '...')
    print('')

    # newline='' stops the text layer from translating the '\r\n' the csv
    # writer emits, which would otherwise become '\r\r\n' (blank rows) on Windows.
    with open(filepath, mode='w', newline='', encoding=encoding) as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        # Write Header
        csv_writer.writerow(['Post_ID', 'User', 'Comment', 'Comment_ID', 'User_Tagging', 'User_Tagging_Link'])

        # Write Content: one output row per comment, prefixed with its post id
        for post_id, comments in collated_comments_data.items():
            for comment in comments:
                csv_writer.writerow([post_id] + [comment[i] for i in range(5)])

In [10]:
# For every configured page: read each per-post comments file, collate the
# rows by post id, then write them to a single '<page>_comments.txt' file.
for page in page_list:
    collated_comments_data = {}
    comments_folder = ROOT_DATA_FOLDER + page + '/Comments/'

    # Loads files in comment folders. os.listdir returns entries in arbitrary
    # order, so sort them to make the collated output reproducible between runs.
    comment_folder_files = sorted(os.listdir(comments_folder))

    print('Reading ' + page + ' folder...')

    # Iterate through each post's comments file
    for comment_filename in comment_folder_files:

        # Read only .txt files (match the extension, not any '.txt' substring)
        if not comment_filename.endswith('.txt'):
            continue

        # The post id is the third '_'-separated field of the file name,
        # with everything from the first '.' onwards removed.
        name_parts = comment_filename.split('_')
        if len(name_parts) < 3:
            print('Skipping ' + comment_filename + ': no post id in file name')
            continue
        post_id = name_parts[2].split('.')[0]

        # Read and collate comments file
        collated_comments_data[post_id] = read_comments_csv(comments_folder + comment_filename)

    total_comments = sum(len(comments) for comments in collated_comments_data.values())
    print('Compiled', len(collated_comments_data), 'post(s) with a total of', total_comments, 'comment(s)')

    # Write collated comments into one large file
    write_comments_csv(ROOT_DATA_FOLDER + page + '/' + page + '_comments.txt', collated_comments_data)

Reading DollarsAndSensefolder...
Compiled 167 post(s) with a total of 596 comment(s)
Writing DollarsAndSense_comments.txt...

Reading MortgageConsultancyfolder...
Compiled 21 post(s) with a total of 28 comment(s)
Writing MortgageConsultancy_comments.txt...

Reading Seedlyfolder...
Compiled 300 post(s) with a total of 701 comment(s)
Writing Seedly_comments.txt...

Reading WokeManSalaryfolder...
Compiled 28 post(s) with a total of 1815 comment(s)
Writing WokeManSalary_comments.txt...

