In [1]:
import numpy as np
import pandas as pd

_SW_URL = "https://raw.githubusercontent.com/masdevid/ID-Stopwords/master/id.stopwords.02.01.2016.txt"
_POS_URL = "https://raw.githubusercontent.com/fajri91/InSet/master/positive.tsv"
_NEG_URL = "https://raw.githubusercontent.com/fajri91/InSet/master/negative.tsv"

_SW = pd.read_csv(_SW_URL, header = None, names = ['word'])
_POS = pd.read_csv(_POS_URL, sep = '\t')
_NEG = pd.read_csv(_NEG_URL, sep = '\t')

In [2]:
# Save stopwords to csv
_SW.to_csv('masdevid-stopwords.csv', index=False)

In [2]:
reviews_df = pd.read_csv("mypertamina-review.csv")
reviews_df

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
0,e3ec75ec-675b-4328-bed5-e2f3dbfe0a91,Radhy Aiman,Ini salah masa buka hp dipertamina,1,0,3.6.2,2022-07-04 17:18:54,,
1,a6923e84-bef9-4c86-8b88-ae6b8d98940b,rara mulan,Mau menyusahkan rakyat ya....?,1,0,3.6.2,2022-07-04 17:18:51,,
2,384c929f-6554-4723-bf83-355656281321,Indramayu TV Official,"Sering nge bug, dan grafiknya juga buriq kayak...",1,0,3.6.2,2022-07-04 17:18:50,,
3,c96d5d1d-3c2f-408b-8f42-2cf2c3e411ea,M.fahri29 29,Uh,1,0,3.6.2,2022-07-04 17:18:44,,
4,512d0b16-8fd4-42b4-b648-6b22c28bc422,Soko Hengky,Ribet,1,0,3.6.2,2022-07-04 17:18:36,,
...,...,...,...,...,...,...,...,...,...
122989,273b017d-1b0a-4b8a-961a-faa7629f40f2,Pengguna Google,Mantap...,5,1,,2017-08-10 21:19:14,Terima kasih sobat Sukrons KRN atas apresiasin...,2018-09-14 13:13:14
122990,7c86bfc3-7d7f-425d-98d9-bde583f2f6d9,Pengguna Google,#bringbackRioHaryantoF12018 @pertaminaracingid,3,0,,2017-08-10 09:50:30,Terima kasih atas masukannya sobat bukhari yul...,2018-09-17 04:29:16
122991,45e977af-e705-4d02-9ad4-4c3d357e0b1a,Pengguna Google,Downloader ke 100... Nice apps... smg ada prog...,5,1,1.0.2,2017-08-10 08:43:30,Terima kasih sobat Dudy Effendi atas attention...,2018-09-14 13:06:49
122992,592690d7-ecdb-4796-ae06-6764c70b3840,Pengguna Google,Good start.. be responsive toward feedback and...,5,0,1.0,2017-08-09 17:58:21,Hai sobat Ilmianto Boediman. Terima kasih atas...,2018-10-03 15:03:26


In [3]:
# Define various auxiliary helpers for preprocessing and labeling

import string

# Prepare a translation table to remove punctuations and numbers
unwanted_chars = string.punctuation + string.digits
del_trans_table = str.maketrans({k: None for k in unwanted_chars})

def preprocess(text):
    """Preprocess a single string of text"""
    
    result = text
    
    # Normalize (case-folding)
    result = result.lower()
    
    # Clean up nonstandard characters
    result = result.encode('ascii', 'ignore').decode()
    
    # Clean up punctuations
    result = result.translate(del_trans_table)
    
    # Tokenize by whitespace, drop stopwords, and merge back to a sentence
    result = ' '.join([word for word in result.split() if word not in _SW])
    
    return result

# Prepare a sentiment weight lexicon dictionary for labeling
lexicon_dict = pd.concat([_POS, _NEG]) \
    .reset_index(drop = True) \
    .set_index('word') \
    .to_dict()['weight']

def weight_sentiment(text):
    """Label sentiment of a single string of text"""
    words = text.split()
    
    sentiment = sum([lexicon_dict.get(word, 0) for word in words])
    
    return sentiment

In [4]:
# There are reviews with missing content. Not sure what it means though.
reviews_df[reviews_df['content'].isna()]

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
78312,f456d472-366e-437a-8994-0463f53558c5,yadi suardi,,1,0,3.6.2,2022-06-30 08:43:39,,
93198,2bbdd27e-6a43-413a-9b29-ca71d37af745,Noor Rachman,,5,0,3.5.1,2022-06-11 10:11:05,,


In [7]:
from tqdm.notebook import tqdm
tqdm.pandas()

# Establish initial dataframe from raw
ready_df = reviews_df[['reviewId','content']].dropna()
ready_df = ready_df.rename(columns = {'content': 'raw'})

# Perform preprocessing
ready_df['preprocessed'] = ready_df['raw'].progress_apply(preprocess)

# Perform labeling
ready_df['weight'] = ready_df['preprocessed'].progress_apply(weight_sentiment)

# Classify into pos/neg
ready_df['label'] = ["positive" if weight > 0 else
                     "negative" if weight < 0 else "neutral"
                     for weight in ready_df['weight']]

# Remove neutral sentiment
ready_df = ready_df[ready_df['label'] != 'neutral']

# Show dataframe
ready_df

  0%|          | 0/122992 [00:00<?, ?it/s]

  0%|          | 0/122992 [00:00<?, ?it/s]

Unnamed: 0,reviewId,raw,preprocessed,weight,label
0,e3ec75ec-675b-4328-bed5-e2f3dbfe0a91,Ini salah masa buka hp dipertamina,ini salah masa buka hp dipertamina,-7,negative
1,a6923e84-bef9-4c86-8b88-ae6b8d98940b,Mau menyusahkan rakyat ya....?,mau menyusahkan rakyat ya,-2,negative
2,384c929f-6554-4723-bf83-355656281321,"Sering nge bug, dan grafiknya juga buriq kayak...",sering nge bug dan grafiknya juga buriq kayak ...,-20,negative
4,512d0b16-8fd4-42b4-b648-6b22c28bc422,Ribet,ribet,-4,negative
5,5c3a91de-8f5f-4202-af8c-bdb4a7f92e59,Masa iya buka hp dlu buat ke apknya sedangkan ...,masa iya buka hp dlu buat ke apknya sedangkan ...,-10,negative
...,...,...,...,...,...
122988,e330c66a-3d7c-4884-9b65-c38ce0b280cf,Nice mulai download aplikasi sampai pengisian ...,nice mulai download aplikasi sampai pengisian ...,-11,negative
122989,273b017d-1b0a-4b8a-961a-faa7629f40f2,Mantap...,mantap,5,positive
122991,45e977af-e705-4d02-9ad4-4c3d357e0b1a,Downloader ke 100... Nice apps... smg ada prog...,downloader ke nice apps smg ada program loyalt...,5,positive
122992,592690d7-ecdb-4796-ae06-6764c70b3840,Good start.. be responsive toward feedback and...,good start be responsive toward feedback and p...,5,positive


In [9]:
ready_df.to_csv('review-label.csv', index=False)