In [None]:
## This script will take the input file and clean the data to remove https links, # and \r and non-english comments

In [1]:
#import statements
import pandas as pd
import numpy as np
import os
import re
import time  
import ast
from bs4 import BeautifulSoup
import requests, json
from polyglot.detect import Detector 
import cld2

In [2]:
#read the training data from the csv file
header = ['label','comment','parent_comment']
data = pd.read_table('train-balanced.csv',
                    sep='\t', 
                    names=header,
                    usecols=[0,1,9],
                    dtype={'label':int,'comment':str,'parent_comment':str},
                    keep_default_na=False)

In [3]:
#read the test data from the csv file
header = ['label','comment', 'parent_comment']
test_data = pd.read_table('test-balanced.csv',
                    sep='\t', 
                    names=header,
                    usecols=[0,1,9],
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [4]:
print("Test data shape : ", test_data.shape)
print("Train data shape : ", data.shape)

Test data shape :  (251608, 3)
Train data shape :  (1010826, 3)


In [5]:
# Reading the slang dictionary that is already created
f = open("Slangdictionary.txt","r")
res1=f.read()
f.close()
slangdict = ast.literal_eval(res1)

In [6]:
#helper function to clean the comments
def comment_clean(user_comment):
    # remove trailing \r and \n    
    user_comment.rstrip('\r\n')
    
    #remove the # from hashtag
    if '#' in user_comment:
        hash_tag = re.search('#',user_comment)
        if hash_tag is not None:
            user_comment = user_comment.replace(hash_tag.group(0),' ')
    #remove the redit tags(r/) from comment
    if 'r/' in user_comment:
        r_tag = re.search('r/',user_comment)
        if r_tag is not None:
            user_comment = user_comment.replace(r_tag.group(0),' ')
    #remove the URL links from comments  
    if 'HTTP' in user_comment:
        # url of the form [link name](http://url)
        url_link = re.search('\[(.*)\(HTTP(.*)\)', user_comment)
        if url_link is not None:
            user_comment = user_comment.replace(url_link.group(0),' ')
    if 'http' in user_comment:
        # url of the form [link name](http://url)
        url_link = re.search('\[(.*)\(http(.*)\)', user_comment)
        if url_link is not None:
            user_comment = user_comment.replace(url_link.group(0),' ')
        else:
            #url of the form http:/
            url_link = re.search('http(.*)', user_comment)
            if url_link is not None:
                user_comment = user_comment.replace(url_link.group(0),' ')                
      
    
    # Check if the comment has exactly 2 stars
    if user_comment.count('*')==2:
        boldwords = re.search(r"\*(.*?)\*",user_comment)
        #print(boldwords.group(0))
        # Check if the comments have any other text other than **
        if boldwords.group(0) != "**":
            Wordstocapitalize = re.findall(r"\*(.*?)\*",boldwords.group(0))
            Wordstocapitalize = "".join( Wordstocapitalize)
            # Replace the user comment with capitalized words
            user_comment = user_comment.replace(boldwords.group(0),Wordstocapitalize.upper())
    # replace the slangs
    comment_words = re.sub(r"[^a-zA-Z0-9\s\']","",user_comment)         
    comment_words=comment_words.split()
    for word in comment_words:
        if word.upper() in slangdict.keys():
            user_comment = user_comment.replace(word.upper(),slangdict[word.upper()])
        elif word in slangdict.keys():
            user_comment = user_comment.replace(word,slangdict[word]) 
        
    # remove numbers from comments to pass it through the langauge detector
    user_comment_not_num = re.sub(r'\d+', '', user_comment) 
    
    # replace non english comments with empty string
    try:
        isReliable, textBytesFound, details = cld2.detect(user_comment_not_num)
    except:
        try_text = ''.join(x for x in user_comment_not_num if x.isprintable())
        isReliable, textBytesFound, details = cld2.detect(try_text)
    cld_match = details[0][0]
    if not (cld_match == 'ENGLISH'):
        poly_match = Detector(user_comment_not_num, quiet=True).language.name
        if (poly_match != 'English'):
            user_comment = ' '               
    return user_comment           

In [7]:
#clean each comment and parent comment 
data[['comment','parent_comment']] = data[['comment','parent_comment']].applymap(comment_clean)
# remove data with empty comments
valid_comment = data['comment'] != ' '
data = data[valid_comment]

In [8]:
#write the cleaned train data into a csv file
data.to_csv('clean_data_train_balanced.csv',
           sep= '|',
           index=False)

In [9]:
#clean the test data
test_data[['comment','parent_comment']] = test_data[['comment','parent_comment']].applymap(comment_clean)
# remove data with empty comments
valid_comment = test_data['comment'] != ' '
test_data = test_data[valid_comment]


In [10]:
# shape of data frames after cleaning
print("After cleaning Test data shape : ", test_data.shape)
print("After cleaning Train data shape : ", data.shape)

After cleaning Test data shape :  (243784, 3)
After cleaning Train data shape :  (978934, 3)


In [11]:
#write the cleaned test data into a csv file
test_data.to_csv('clean_data_test_balanced_Wparent.csv',
           sep= '|',
           index=False)