In [1]:
from nltk.corpus import stopwords as nltk_stopwords
import nltk;import pandas as pd;import nlp_id;import os
import string;import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
# Load the raw tab-separated dataset.
df = pd.read_csv('./data/tsv/data.tsv', sep='\t')
# Keep only the columns used downstream. The explicit copy makes `data`
# independent of `df`, so later column assignments on `data` do not write
# into a slice of `df` (the SettingWithCopyWarning pattern).
data = df[['content', 'username']].copy()

In [3]:
class StopWord(nlp_id.StopWord):
    """Stop-word list extending ``nlp_id.StopWord`` with extra sources.

    After construction ``self.stopwords`` is a ``set`` holding the union of:
      * the stop words set up by ``nlp_id.StopWord.__init__``,
      * the entries (one per line) of a project-specific text file, and
      * NLTK's English and Indonesian stop-word lists.
    """

    def __init__(self, stopword_path=None):
        """Build the combined stop-word set.

        Args:
            stopword_path: Path to a UTF-8 text file with one additional stop
                word per line. Defaults to ``<cwd>/data/pp_stopwords``.

        Raises:
            OSError: If the stop-word file cannot be opened.
        """
        # The original used dirname(realpath(__name__)); since __name__ is a
        # bare module name, that resolves to the current working directory.
        # Spell that out directly.
        self.current_dir = os.path.realpath(os.getcwd())
        if not stopword_path:
            stopword_path = os.path.join(self.current_dir, "data", "pp_stopwords")

        # The base initializer provides the initial ``self.stopwords``.
        super(StopWord, self).__init__()

        # Explicit encoding keeps the result independent of the platform
        # default; stripping and dropping blanks keeps '' (from a trailing
        # newline) and whitespace-padded entries out of the set.
        with open(stopword_path, encoding="utf-8") as f:
            additional = {line.strip() for line in f}
        additional.discard("")

        self.stopwords = (
            set(self.stopwords)
            | additional
            | set(nltk_stopwords.words("english"))
            | set(nltk_stopwords.words("indonesian"))
        )

In [33]:
class NormalizeWord:
    """Token normalizer: collapses repeated vowel groups, then stems.

    The normalization table read in ``__init__`` is stored in ``norms_dict``
    but is not consulted by ``normalize`` yet.
    """

    # A group of two or more vowels that is immediately repeated at least
    # once (e.g. "aaaa", "aiai"). A plain doubled vowel such as the "aa" in
    # "aamiin" does not match.
    _REPEATED_VOWELS = re.compile(r'([aiueo]{2,})\1+')

    def __init__(self, normalize_path=None):
        """Load the normalization table and create the stemmer.

        Args:
            normalize_path: Path to a UTF-8 text file with one
                ``key:variant1,variant2,...`` entry per line. Blank lines are
                ignored; only the first ``:`` on a line separates key from
                variants. Defaults to ``<cwd>/data/pp_normalize``.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If a non-blank line contains no ``:`` separator.
        """
        # The original used dirname(realpath(__name__)); since __name__ is a
        # bare module name, that resolves to the current working directory.
        self.current_dir = os.path.realpath(os.getcwd())
        # key -> de-duplicated list of variants, in first-seen order.
        self.norms_dict = {}
        # Not populated or read anywhere in this class.
        self.multiple_character = {}
        self.stemmer = StemmerFactory().create_stemmer()

        if not normalize_path:
            normalize_path = os.path.join(self.current_dir, "data", "pp_normalize")

        with open(normalize_path, encoding="utf-8") as f:
            for lineno, row in enumerate(f, start=1):
                row = row.strip()
                if not row:
                    # A trailing newline or spacer line carries no mapping;
                    # previously this crashed the tuple unpacking.
                    continue
                key, sep, raw_variants = row.partition(":")
                if not sep:
                    raise ValueError(
                        f"{normalize_path}:{lineno}: expected 'key:val1,val2,...', got {row!r}"
                    )
                key = key.strip()
                variants = [v.strip() for v in raw_variants.split(",") if v.strip()]
                existing = self.norms_dict.get(key, [])
                # dict.fromkeys de-duplicates while keeping a stable order.
                self.norms_dict[key] = list(dict.fromkeys([*existing, *variants]))

    def normalize(self, content: str):
        """Collapse repeated vowel groups in ``content`` and stem the result.

        Each match of a repeated vowel group is replaced by its first
        character (e.g. "haaaai" -> "hai") before the text is passed to
        ``self.stemmer.stem``.

        Args:
            content: The token (or text) to normalize.

        Returns:
            Whatever ``self.stemmer.stem`` returns for the collapsed text.
        """
        # re.sub returns the input unchanged when nothing matches, so no
        # separate findall pre-check is needed.
        content = self._REPEATED_VOWELS.sub(lambda m: m.group(0)[0], content)

        # NOTE: mapping variants to their canonical key via ``norms_dict`` is
        # not applied here; the table is only loaded.
        return self.stemmer.stem(content)

In [34]:
# Shared helper instances used by the preprocessing cells below.
# StopWord() and NormalizeWord() are called without a path, so each reads its
# default file under ./data relative to the working directory.
stopwords = StopWord()
# NOTE(review): `tokenizer` is not referenced by the visible cells — confirm it is still needed.
tokenizer = nlp_id.Tokenizer()
normalizer = NormalizeWord()

In [47]:
def preprocess(content: str):
    """Turn one raw text into a list of cleaned, normalized tokens.

    Steps: strip links, tokenize with NLTK, lowercase, drop punctuation and
    non-word characters, remove stop words via the module-level ``stopwords``
    helper, drop tokens that start with a digit, and normalize the rest with
    the module-level ``normalizer``.

    Args:
        content: The raw text. ``None`` and float NaN (missing cells) are
            accepted and treated as empty.

    Returns:
        A list of normalized tokens; empty for missing or link-only input.
    """
    # Missing cells typically reach here as None or NaN; NaN is the only
    # float that is unequal to itself. Other non-string types still raise,
    # so accidentally re-processing already-tokenized rows is not hidden.
    if content is None or (isinstance(content, float) and content != content):
        return []

    text = re.sub(r"http\S+", "", content).strip()  # remove links
    if not text:
        return []

    tokens = nltk.tokenize.word_tokenize(text)  # tokenizing
    # Lowercase; `in string.punctuation` is a substring test, so this drops
    # tokens that are a single punctuation mark (or a contiguous run of it).
    tokens = [token.lower() for token in tokens if token not in string.punctuation]
    # Keep tokens containing at least one word character, stripped of the rest.
    tokens = [re.sub(r'\W', "", token) for token in tokens if re.search(r"\w", token)]
    # The stop-word helper works on a string, so join, filter, and split again.
    without_stopwords = stopwords.remove_stopword(" ".join(tokens))
    tokens = [tok for tok in without_stopwords.split(" ") if tok != '']
    # Only tokens whose first character is not a digit are normalized and kept.
    return [normalizer.normalize(tok) for tok in tokens if re.match(r'\D', tok)]

In [48]:
# Spot-check the preprocessing pipeline on a few raw sample tweets.
dest = [
    'Gak sabar nonton coldplay 🥹🥲 @woootamelon semoga kita berdua kebagian tiketnya ya Allah',
    '@IDWantsColdplay @coldplay Bismillah menang war tiket AAMIIN',
    'Salah satu tanda @coldplay ke Indonesia untuk Tur dunia yaitu adanya sebuah Videotron di sejumlah gedung di Jakarta pada tanggal 5-6 Mei ini. Tapi pihak @coldplay akan konfirmasi kedatangan mereka ke Asia tanggal 9 Mei ini. ready Coldplayer https://t.co/zPnaMQ0EZB',
]
for sample_text in dest:
    cleaned_tokens = preprocess(sample_text)
    print(cleaned_tokens)

['gak', 'sabar', 'nonton', 'coldplay', 'woootamelon', 'moga', 'dua', 'bagi', 'tiket', 'allah']
['idwantscoldplay', 'coldplay', 'bismillah', 'menang', 'war', 'tiket', 'aamiin']
['salah', 'tanda', 'coldplay', 'indonesia', 'tur', 'dunia', 'videotron', 'gedung', 'jakarta', 'tanggal', 'mei', 'coldplay', 'konfirmasi', 'datang', 'asia', 'tanggal', 'mei', 'ready', 'coldplayer']


In [None]:
# Replace each row's raw text with its list of cleaned tokens. `assign`
# returns a new frame instead of writing into `data` in place, which avoids
# the chained-assignment warning when `data` is a slice of another frame.
# NOTE: this cell is not idempotent — run it once on the raw text only.
data = data.assign(content=data['content'].apply(preprocess))

In [8]:
# Persist the processed frame as a tab-separated file, without the index column.
# NOTE(review): if `content` holds Python lists at this point (as returned by
# preprocess), they are presumably written as their string representation —
# confirm downstream readers parse that format.
data.to_csv("./data/tsv/cleaned_content.tsv", index=False, sep="\t")