# In [None]:
### Trustpilot data cleaning module 
### Author: Hakim Khalafi

import re
import os
import io
import csv
import string
from pathlib import Path

# In [None]:
## Function definitions

def isolate_punctuation(s):
    """Return *s* with punctuation split off as separate tokens.

    Each of the characters . , ! ? ( ) ; : " is padded with a space on
    both sides, after which all whitespace runs are collapsed to a single
    space and leading/trailing whitespace is dropped.
    """
    punctuation = re.compile('([.,!?();:"])')
    # Pad every punctuation mark so it becomes a token of its own
    padded = punctuation.sub(lambda match: ' ' + match.group(1) + ' ', s)
    # split() without arguments discards empty pieces, so rejoining
    # normalises every whitespace run to one space
    tokens = padded.split()
    return ' '.join(tokens)

def make_dirs(directories):
    """Create every directory in *directories*, including missing parents.

    Directories that already exist are left untouched.
    """
    for target in map(Path, directories):
        target.mkdir(parents=True, exist_ok=True)

# In [None]:
## Configurations

# Folder the notebook is run from (the resolved current working directory)
script_folder = os.path.realpath('.')

# Data file(s) generated from scrape step; each entry is opened as-is by the
# clean step, so relative names are resolved against the working directory
data_files = ['dataSkype.csv'] 
# Alternative input set (swap with the line above to use it):
#data_files = ['dataO2.csv','dataSkype.csv','dataThree.csv','dataVodafone.csv']

# Folder to save our cleaned data for analyse step
data_outfolder = 'skype-binary-classification'
# Alternative output folder name:
#data_outfolder = 'telephony12k-5star'
# Anchor the output folder at script_folder and make sure it exists
data_outfolder = os.path.join(script_folder, data_outfolder)
make_dirs([data_outfolder])

# True: classify into Negative/Positive sentiment; False: into 1-5 stars
binary_classification = True

# In [None]:
## Clean 

# Sub-folders of data_outfolder that receive the cleaned reviews: one per
# sentiment in binary mode, otherwise one per star rating.
if binary_classification:
    label_folders = ['neg', 'pos']
else:
    label_folders = ['1', '2', '3', '4', '5']

# Create the label folders once up front rather than once per CSV row.
make_dirs([os.path.join(data_outfolder, s) for s in label_folders])

for file_iter, datafile in enumerate(data_files):
    print('Cleaning ' + datafile)
    with open(datafile, newline='', encoding='utf8') as csv_in:

        # Rows are read as tab-separated fields: title, body, rating
        datareader = csv.reader(csv_in, delimiter='\t')

        for idx, row in enumerate(datareader):

            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on row[0]. idx still counts the blank row,
            # so output file names stay tied to the CSV row position.
            if not row:
                continue

            # Title of comment
            title = isolate_punctuation(row[0])

            # Body of comment
            body = isolate_punctuation(row[1])

            # The rating
            rating = int(row[2])

            if binary_classification:
                # Ratings 1 and 2 become "negative" and ratings 4 and 5
                # become "positive"; rating 3 is neither and is not written.
                if rating < 3:
                    label = 'neg'
                elif rating > 3:
                    label = 'pos'
                else:
                    continue
            else:
                # One folder per star rating
                label = str(rating)

            # Merge title and body into one file
            toWrite = title + '\n' + body + '\n'

            # Filename, taking into account which csv source
            filename = os.path.join(data_outfolder, label, str(file_iter) + "-" + str(idx) + ".txt")

            with io.open(filename, "w", encoding="utf-8") as f:
                f.write(toWrite)

print('Finished cleaning data!')