Pre-Processing Data

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Load the tweet-emotion dataset and preview the first rows.
# The preview shows an 'Unnamed: 0' column, i.e. the CSV carries a column
# without a header name that is read in as ordinary data.
df = pd.read_csv('tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


--- Case Folding ---  

In [10]:
# ------ Case Folding --------
# Use pandas' Series.str.lower() to turn upper-case letters into lower-case.
df['content'] = df['content'].str.lower()


print('Case Folding Result : \n')
print(df['content'].head())

# Copy the case-folded text into a new 'Case_Folding' column and save to CSV.
# NOTE(review): this writes to 'tweet_emotions.csv', the same file the loading
# cell reads, so the input file is overwritten with the modified frame.
df['Case_Folding'] = df['content']
df.to_csv('tweet_emotions.csv', index=False)

Case Folding Result : 

0    tiffanylue  know  was listenin to bad habit ea...
1    layin  bed with  headache ughhhhwaitin on your...
2                        funeral ceremonygloomy friday
3                  wants to hang out with friends soon
4    dannycastillo we want to trade with someone wh...
Name: content, dtype: object


--- Tokenizing ---

In [7]:
import nltk # Library nltk
import string # Library string
# impor modul regular expression 
import re # Library regex 

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

#remove number
def remove_number(text):
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r"\d+")
    return digit_runs.sub("", text)

# Strip digits from every tweet, overwriting the 'content' column.
df['content'] = df['content'].apply(remove_number)


#remove punctuation
def remove_punctuation(text):
    """Return ``text`` with all characters of ``string.punctuation`` removed."""
    # Map each punctuation code point to None so translate() drops it.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# Drop punctuation from every tweet, overwriting the 'content' column.
df['content'] = df['content'].apply(remove_punctuation)


#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    """Trim whitespace from the left and right ends of ``text``."""
    trimmed = text.lstrip().rstrip()
    return trimmed

# Trim the ends of every tweet, overwriting the 'content' column.
df['content'] = df['content'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing runs are collapsed too (to one space each); they are
    not removed.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence, which
    # newer Python versions warn about and plan to reject.
    return re.sub(r'\s+', ' ', text)

# Collapse whitespace runs in every tweet, overwriting the 'content' column.
df['content'] = df['content'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    """Delete every stand-alone ASCII letter from ``text``.

    Only the letter itself is removed; the whitespace around it is kept.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Deleting single-letter words leaves doubled spaces (and possibly leading or
# trailing ones) behind, so normalise whitespace again after this step.
df['content'] = (
    df['content']
    .apply(remove_singl_char)
    .apply(remove_whitespace_multiple)
    .apply(remove_whitespace_LT)
)

# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize each cleaned tweet and keep the result in a new 'tweet_tokens' column.
df['tweet_tokens'] = df['content'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(df['tweet_tokens'].head())
# Save the frame, including the token column, to CSV.
# NOTE(review): 'tweet_tokens' holds Python lists; confirm how readers of this
# file turn the CSV text back into lists.
df.to_csv('tweet_emotions_tokenize.csv',index=False)

Tokenizing Result : 

0    [tiffanylue, know, was, listenin, to, bad, hab...
1    [layin, bed, with, headache, ughhhhwaitin, on,...
2                    [funeral, ceremonygloomy, friday]
3          [wants, to, hang, out, with, friends, soon]
4    [dannycastillo, we, want, to, trade, with, som...
Name: tweet_tokens, dtype: object


--- Filtering (Stopword Removal) ---

In [8]:
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# Start from NLTK's English stop-word list.
list_stopwords = stopwords.words('english')

# Project-specific additions go here as lower-case tokens.
# The earlier version read 'tweet_emotions.csv' -- the tweet dataset itself,
# with header=None so its header row counted as data -- and split the first
# entry into "extra stop words". That file is not a stop-word list, so the
# read is dropped in favour of an explicit list.
extra_stopwords = []
list_stopwords.extend(extra_stopwords)

# Convert the list to a set so membership checks during filtering are fast.
list_stopwords = set(list_stopwords)

#remove stopword pada list token
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    The relative order of the surviving tokens is unchanged.
    """
    kept = []
    for token in words:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

# Filter stop words out of every token list; the result goes into the new
# 'tweet_tokens_WSW' column.
df['tweet_tokens_WSW'] = df['tweet_tokens'].apply(stopwords_removal) 

print('Filtering Result : \n') 
print(df['tweet_tokens_WSW'].head())

# Save the frame, including both token columns, to CSV.
df.to_csv('tweet_emotions_Filtering.csv',index=False)


Filtering Result : 

0    [tiffanylue, know, listenin, bad, habit, earli...
1           [layin, bed, headache, ughhhhwaitin, call]
2                    [funeral, ceremonygloomy, friday]
3                         [wants, hang, friends, soon]
4    [dannycastillo, want, trade, someone, houston,...
Name: tweet_tokens_WSW, dtype: object


--- Stemming ---

In [9]:
from nltk.stem import PorterStemmer

import ast

df = pd.read_csv('tweet_emotions_tokenize.csv')

stemmer=PorterStemmer()


def _as_token_list(value):
    """Return the token list stored in one 'tweet_tokens' cell.

    After the CSV round trip a cell holds the string form of a list (e.g.
    "['wants', 'to']"). Parse it back into a list instead of running the
    tokenizer over it, which would turn brackets, quotes and commas into
    tokens. Values that are already sequences are passed through as a list.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    return list(value)


# Stem every token, then join each tweet back into one space-separated string.
sentences = [
    ' '.join(stemmer.stem(word) for word in _as_token_list(tokens))
    for tokens in df['tweet_tokens']
]
   
# print(sentences)

# #save to csv
# d = {'col1': df['sentiment'], 'col2': sentences}
# sentences = pd.DataFrame(d)
# sentences.to_csv('tweet_emotions_stemming.csv',index=False)

# Only Save CSV String 

Error: need to escape, but no escapechar set

------Clustering--------

In [None]:
#Clustering data ke dalam beberapa kategori atau cluster yaitu komen positif, negatif, dan netral
#K-Means Clustering
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load the stemmed tweets.
df = pd.read_csv('tweet_emotions_stemming.csv')

# Documents as an array of strings. Missing cells come back from read_csv as
# NaN, which the vectorizer cannot process, so replace them with ''.
sentences = df['col2'].fillna('').astype(str).values

# Turn the documents into TF-IDF vectors.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)

# K-Means clustering. A fixed random_state keeps the cluster numbering the
# same from run to run.
true_k = 3
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)
model.fit(X)

# get_feature_names() is gone from recent scikit-learn releases; prefer
# get_feature_names_out() and fall back to the old name where it is missing.
# Both lookups are done once here instead of once per printed term.
get_terms = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
terms = get_terms()
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

for i in range(true_k):
    print("Cluster %d:" % i)
    # Ten highest-weighted terms of this cluster's centroid.
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])


print("\nPrediction")
Y = vectorizer.transform(["wants to hang out with friends"])
prediction = model.predict(Y)
# NOTE(review): K-Means is unsupervised, so cluster indices carry no inherent
# sentiment meaning; the index -> label mapping below is an assumption that
# should be checked against the top terms printed above.
predicted_cluster = prediction[0]
if predicted_cluster == 0:
    print("Negatif")
elif predicted_cluster == 1:
    print("Netral")
else:
    print("Positif")

Cluster 0:
 mi
 im
 just
 work
 good
 wa
 day
 thi
 love
 got
Cluster 1:
 mother
 day
 happi
 mom
 mommi
 war
 love
 mi
 star
 mum
Cluster 2:
 like
 feel
 look
 mi
 dont
 im
 thi
 wa
 just
 sound

Prediction
Negatif


------Labeling-------

In [None]:
# Spam And Ham Classification Using Naive Bayes
# NOTE(review): only the data load is present here, and the file holds stemmed
# emotion tweets rather than spam/ham data -- confirm the intended task.
df = pd.read_csv('tweet_emotions_stemming.csv')


-----Classification------

-----Predict-----

---Evaluasi-----