Pra Pengolahan Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


--- Case Folding ---  

In [3]:
# ------ Case Folding --------
# Mengunakan fungsi Series.str.lower() untuk mengubah hufur kapital menjadi huruf kecil pada Pandas
df['content'] = df['content'].str.lower()


print('Case Folding Result : \n')
print(df['content'].head())

#save to csv
df.to_csv('tweet_emotions_caseFolding.csv',index=False)

Case Folding Result : 

0    @tiffanylue i know  i was listenin to bad habi...
1    layin n bed with a headache  ughhhh...waitin o...
2                  funeral ceremony...gloomy friday...
3                 wants to hang out with friends soon!
4    @dannycastillo we want to trade with someone w...
Name: content, dtype: object


--- Tokenizing ---

In [4]:
import nltk # Library nltk
import string # Library string
# impor modul regular expression 
import re # Library regex 

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [5]:
# ------ Operasi Tokenizing ---------
# ------ tokenizing per kata ---------

def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
df['content'] = df['content'].apply(remove_tweet_special)


In [6]:
#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

df['content'] = df['content'].apply(remove_number)


#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

df['content'] = df['content'].apply(remove_punctuation)


#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

df['content'] = df['content'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

df['content'] = df['content'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

df['content'] = df['content'].apply(remove_singl_char)

# NLTK word rokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['tweet_tokens'] = df['content'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(df['tweet_tokens'].head())
#save to csv
df.to_csv('tweet_emotions_tokenize.csv',index=False)

Tokenizing Result : 

0    [know, was, listenin, to, bad, habit, earlier,...
1    [layin, bed, with, headache, ughhhhwaitin, on,...
2                    [funeral, ceremonygloomy, friday]
3          [wants, to, hang, out, with, friends, soon]
4    [we, want, to, trade, with, someone, who, has,...
Name: tweet_tokens, dtype: object


--- Filtering (Stopwprd Removal) ---

In [7]:
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('english')

In [8]:
# read txt stopword using pandas
txt_stopword = pd.read_csv("tweet_emotions.csv", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# convert list to dictionary
list_stopwords = set(list_stopwords)

#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

df['tweet_tokens_WSW'] = df['tweet_tokens'].apply(stopwords_removal) 

print('Filtering Result : \n') 
print(df['tweet_tokens_WSW'].head())

#save to csv
df.to_csv('tweet_emotions_Filtering.csv',index=False)



Filtering Result : 

0    [know, listenin, bad, habit, earlier, started,...
1           [layin, bed, headache, ughhhhwaitin, call]
2                    [funeral, ceremonygloomy, friday]
3                         [wants, hang, friends, soon]
4        [want, trade, someone, houston, tickets, one]
Name: tweet_tokens_WSW, dtype: object


--- Stemming ---

In [10]:
from nltk.stem import PorterStemmer

df = pd.read_csv('tweet_emotions_tokenize.csv')

stemmer=PorterStemmer()

# #change to string
# df['tweet_tokens_WSW'] = df['tweet_tokens_WSW'].astype(str)


sentences = df['tweet_tokens'].tolist()

for i in range(len(sentences)):
    words=word_tokenize(sentences[i])
    #List comprehension
    words=[stemmer.stem(word) for word in words]
    sentences[i]=' '.join(words)
   
print(sentences)

#save to csv
d = {'col1': df['sentiment'], 'col2': sentences}
sentences = pd.DataFrame(d)
sentences.to_csv('tweet_emotions_stemming.csv',index=False)

["[ 'know ' , 'wa ' , 'listenin ' , 'to ' , 'bad ' , 'habit ' , 'earlier ' , 'and ' , 'start ' , 'freakin ' , 'at ' , 'hi ' , 'part ' ]", "[ 'layin ' , 'bed ' , 'with ' , 'headach ' , 'ughhhhwaitin ' , 'on ' , 'your ' , 'call ' ]", "[ 'funer ' , 'ceremonygloomi ' , 'friday ' ]", "[ 'want ' , 'to ' , 'hang ' , 'out ' , 'with ' , 'friend ' , 'soon ' ]", "[ 'we ' , 'want ' , 'to ' , 'trade ' , 'with ' , 'someon ' , 'who ' , 'ha ' , 'houston ' , 'ticket ' , 'but ' , 'no ' , 'one ' , 'will ' ]", "[ 'reping ' , 'whi ' , 'didnt ' , 'you ' , 'go ' , 'to ' , 'prom ' , 'bc ' , 'mi ' , 'bf ' , 'didnt ' , 'like ' , 'mi ' , 'friend ' ]", "[ 'should ' , 'be ' , 'sleep ' , 'but ' , 'im ' , 'not ' , 'think ' , 'about ' , 'an ' , 'old ' , 'friend ' , 'who ' , 'want ' , 'but ' , 'he ' , 'marri ' , 'now ' , 'damn ' , 'amp ' , 'he ' , 'want ' , 'me ' , 'scandal ' ]", "[ 'hmmm ' , 'i ' , 'down ' ]", "[ 'charlen ' , 'mi ' , 'love ' , 'miss ' , 'you ' ]", "[ 'im ' , 'sorri ' , 'at ' , 'least ' , 'it ' , 'fri