In [20]:
import ast
import functools
import os
import re
import string

import nltk
import pandas as pd
import preprocessor as p
from googletrans import Translator
from langdetect import detect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob

In [11]:
# Load the source dataset (CSV, path relative to the notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../Dataset/New_Dataset.csv')

In [12]:
# Text emoticons that express a positive mood.
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Text emoticons that express a negative mood.
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# One character class matching runs of emoji code points.
# NOTE(review): the last range (U+24C2-U+1F251) is very broad; it also spans
# non-emoji characters such as CJK ideographs (U+4E00...). Confirm that is intended.
emoji_pattern = re.compile("["
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "\U00002702-\U000027B0"
                           "\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

# Every known emoticon, happy and sad. The union must involve BOTH sets:
# unioning the happy set with itself silently leaves the sad emoticons out.
emoticons = emoticons_happy | emoticons_sad

##### Clean Tweets

In [1]:
# Clean tweets using slang and stop-word lists.
# slang and stop words source: https://github.com/louisowen6/NLP_bahasa_resources
@functools.lru_cache(maxsize=None)
def _load_cleaning_resources():
    """Read the stop-word and slang resources from disk once and cache them.

    clean_text() is applied to every row of the dataset, so re-opening and
    re-parsing both files on each call is wasted work; the cache makes it
    happen a single time per kernel session.

    Returns:
        tuple: ``(stop_words, slang_words)``. ``stop_words`` is a frozenset
        with one entry per non-blank line of ``combined_stop_words.txt``
        (surrounding whitespace stripped). ``slang_words`` is the Python
        literal stored in ``update_combined_slang_words.txt``; clean_text()
        uses it as a mapping from a slang token to its replacement.

    Raises:
        OSError: if either resource file cannot be opened.
        ValueError, SyntaxError: if the slang file is not a valid literal.
    """
    # `with` guarantees both handles are closed even when parsing fails.
    with open('cleaning_source/combined_stop_words.txt', 'r', encoding='utf-8') as stop_file:
        stop_words = frozenset(
            line.strip() for line in stop_file.read().split('\n') if line.strip()
        )
    with open('cleaning_source/update_combined_slang_words.txt', 'r', encoding='utf-8') as slang_file:
        slang_words = ast.literal_eval(slang_file.read())
    return stop_words, slang_words


def clean_text(text):
    """Normalise one tweet into a space-separated string of cleaned tokens.

    Steps, in order: lower-case; strip URLs, hashtags and @mentions; drop
    colons; replace every non-letter character with a space; tokenize;
    replace slang tokens; drop emoticon, punctuation and stop-word tokens.

    Args:
        text: The raw tweet. Missing values (None/NaN) are treated as an
            empty tweet; any other non-string value is converted with str().

    Returns:
        str: The remaining tokens, lower-cased and joined by single spaces
        (an empty string when nothing is left).
    """
    if not isinstance(text, str):
        # A DataFrame column with missing rows yields NaN floats, which have
        # no .lower(); map them to an empty tweet instead of crashing.
        text = '' if pd.isna(text) else str(text)

    stop_words, slang_words = _load_cleaning_resources()

    text = text.lower()
    # Remove url
    text = re.sub(r'https?://[^\s]+', '', text)
    # Remove hashtag
    text = re.sub(r'#\w+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)

    text = re.sub(r':', '', text)
    text = re.sub(r'‚Ä¶', '', text)
    # Replace consecutive non-ASCII characters (this includes emoji) with a space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Replace everything that is not a letter (punctuation, digits) with a space
    text = re.sub('[^a-zA-Z]', ' ', text)

    # From here on the text holds only ASCII letters and spaces, so a later
    # pass looking for emoji, '&'-style tags or non-ASCII symbols could never
    # match; the only remaining job of this step is to collapse space runs.
    text = re.sub("(\\d|\\W)+", " ", text)

    # Swap slang tokens for their replacement. Each position is mapped
    # independently: looking the token up again with list.index() while
    # mutating the list can rewrite the wrong position when a replacement
    # equals another token of the tweet.
    word_tokens = [slang_words.get(w, w) for w in word_tokenize(text)]

    # Set membership, so a token is compared against whole punctuation marks
    # rather than substring-matched against the punctuation string.
    punctuation = set(string.punctuation)

    # Check tokens against emoticons, punctuation and stop words
    return ' '.join(
        w.lower()
        for w in word_tokens
        if w not in emoticons and w not in punctuation and w not in stop_words
    )

In [26]:
# Clean every entry of the 'Text' column and keep the result in a new column.
df['Clean_Text'] = df['Text'].apply(clean_text)

In [17]:
# Persist the frame, including the new 'Clean_Text' column, next to the notebook.
# The row index is written as the first (unnamed) CSV column.
df.to_csv(path_or_buf='Clean_Dataset.csv', index=True)