# Sentiment Analysis terhadap Vaksinasi COVID-19 di Indonesia: Combining Training Data
<h2>Tim Yaudahlah</h2>


---

Kaenova Mahendra Auditama<sup>1</sup><br>
Fendi Irfan Amorokhman<sup>2</sup><br>
Ananda Affan Fattahila<sup>3</sup><br>
<sup>1</sup><a href="mailto:kaenova@student.telkomuniversity.ac.id">kaenova@student.telkomuniversity.ac.id</a><br>
<sup>2</sup><a href="mailto:fendiirfan@student.telkomuniversity.ac.id">fendiirfan@student.telkomuniversity.ac.id</a><br>
<sup>3</sup><a href="mailto:affanfattahila@student.telkomuniversity.ac.id">affanfattahila@student.telkomuniversity.ac.id</a><br>
Informatics Engineering, Telkom University, Indonesia<br>
2021

---
Reference: 
```
@inproceedings{purwarianti2019improving,
  title={Improving Bi-LSTM Performance for Indonesian Sentiment Analysis Using Paragraph Vector},
  author={Ayu Purwarianti and Ida Ayu Putu Ari Crisdayanti},
  booktitle={Proceedings of the 2019 International Conference of Advanced Informatics: Concepts, Theory and Applications (ICAICTA)},
  pages={1--5},
  year={2019},
  organization={IEEE}
}
```
```
https://github.com/ridife/dataset-idsa
```

In [1]:
# Optional: mounts Google Drive inside the Colab runtime so that the
# '/content/drive/...' paths used by the loading cells resolve.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
import pandas as pd

In [None]:
# First raw corpus: a CSV read with pandas defaults (comma-separated, header row).
smsa_csv_path = '/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/data/training/raw/IndoNLU_SMSA_DOC-SENTIMENT_PROSA.csv'
df1 = pd.read_csv(smsa_csv_path)
df1

Unnamed: 0,tweet,labels
0,mohon ulama lurus dan k212 mmbri hujjah partai...,0
1,lokasi strategis di jalan sumatera bandung . t...,1
2,betapa bahagia nya diri ini saat unboxing pake...,1
3,duh . jadi mahasiswa jangan sombong dong . kas...,-1
4,"makanan beragam , harga makanan di food stall ...",1
...,...,...
10994,tidak kecewa,1
10995,enak rasa masakan nya apalagi kepiting yang me...,1
10996,hormati partai-partai yang telah berkoalisi,0
10997,"pagi pagi di tol pasteur sudah macet parah , b...",-1


In [None]:
# Mapping from the textual sentiment labels in this file to the integer
# encoding (-1 / 0 / 1) used for the 'labels' column.
replace = {'labels':{'neutral':0, 'negative':-1, 'positive':1}}

# The TSV is read without a header row, so the positional columns 0 and 1 are
# renamed to 'tweet' and 'labels' before the label strings are replaced.
df2 = (
    pd.read_csv('/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/data/training/raw/IndoNLU_smsa_doc-sentiment-prosa_valid_preprocess.tsv', sep='\t', header=None)
    .rename(columns={0: 'tweet', 1:'labels'})
    .replace(replace)
)
df2

Unnamed: 0,tweet,labels
0,"meski masa kampanye sudah selesai , bukan bera...",0
1,tidak enak,-1
2,restoran ini menawarkan makanan sunda . kami m...,1
3,lokasi di alun alun masakan padang ini cukup t...,1
4,betapa bejad kader gerindra yang anggota dprd ...,-1
...,...,...
1255,"film tncfu , tidak cocok untuk penonton yang t...",-1
1256,"indihome ini mahal loh bayar nya . hanya , pen...",-1
1257,"be de gea , cowok cupu yang takut dengan pacar...",-1
1258,valen yang sangat tidak berkualitas . konentat...,-1


In [None]:
# Tab-separated Twitter dataset; its 'sentimen' and 'Tweet' columns are renamed
# to the 'labels' / 'tweet' names used by the other frames.
df3 = (
    pd.read_csv('/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/data/training/raw/ridlife-dataset-idsa-Indonesian Sentiment Twitter Dataset Labeled.csv', sep='\t')
    .rename(columns={'sentimen': 'labels', 'Tweet':'tweet'})
)
df3

Unnamed: 0,labels,tweet
0,-1,lagu bosan apa yang aku save ni huhuhuhuhuhuhu...
1,-1,kita lanjutkan saja diam ini hingga kau dan ak...
2,1,doa rezeki tak putus inna haa zaa larizquna ma...
3,1,makasih loh ntar kita bagi hasil aku 99 9 sisa...
4,-1,aku tak faham betul jenis orang malaysia yang ...
...,...,...
10801,1,Jangan membandingkan kehidupanmu dengan kehidu...
10802,0,Sini uname lu ntar gua follow
10803,1,Apapun yg telah kamu lakukan apapun kesalahanm...
10804,1,3 cara untuk ingat semula apa yang kita dah ha...


In [None]:
# Fourth raw source, read with pandas defaults.
# NOTE(review): presumably already has 'tweet' and 'labels' columns, since it
# is concatenated with the other frames without renaming — confirm.
affan_csv_path = '/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/data/training/raw/affan.csv'
df4 = pd.read_csv(affan_csv_path)

In [None]:
# Stack the four sources row-wise; ignore_index=True discards the per-source
# indices and renumbers the rows 0..n-1.
source_frames = [df1, df2, df3, df4]
combined_df = pd.concat(source_frames, ignore_index=True)
combined_df

Unnamed: 0,tweet,labels
0,mohon ulama lurus dan k212 mmbri hujjah partai...,0
1,lokasi strategis di jalan sumatera bandung . t...,1
2,betapa bahagia nya diri ini saat unboxing pake...,1
3,duh . jadi mahasiswa jangan sombong dong . kas...,-1
4,"makanan beragam , harga makanan di food stall ...",1
...,...,...
23915,melanjutkan kewaspadaan ditingkatkan protokol ...,1
23916,melanjutkan kewaspadaan ditingkatkan protokol ...,1
23917,kewaspadaan kdu ditingkatkan protokol kesehata...,1
23918,kewaspadaan kudu ditingkatkan protokol kesehat...,1


In [None]:
# Keep only rows whose label is not 0 (0 is the code 'neutral' is mapped to
# when df2 is loaded), then list the label values that remain.
is_not_neutral = combined_df['labels'] != 0
combined_df = combined_df.loc[is_not_neutral]
combined_df['labels'].unique()

array([ 1, -1])

In [None]:
# Number of rows labelled positive (1).
is_positive = combined_df['labels'] == 1
num_positive = int(is_positive.sum())
print(num_positive)

10135


In [None]:
# Number of rows labelled negative (-1).
is_negative = combined_df['labels'] == -1
num_minus = int(is_negative.sum())
print(num_minus)

7106


In [None]:
from sklearn.utils import shuffle

# Fixed seed: without it every run shuffles differently, so the undersampled
# subset (and any file written from it) changes on each Restart & Run All.
SEED = 42

# Shuffle first so the head-slices below take a random subset of each class
# instead of the first rows in source order.
combined_df = shuffle(combined_df, random_state=SEED)

# Undersample to the size of the smaller class so both labels end up with the
# same number of rows.
n_per_class = min(num_minus, num_positive)
new_df = pd.concat([
    combined_df[combined_df['labels'] == 1][:n_per_class],
    combined_df[combined_df['labels'] == -1][:n_per_class],
])

# The concat stacks all positives before all negatives; shuffle again to mix them.
new_df = shuffle(new_df, random_state=SEED)

# Sanity check: the two counts printed below should be equal.
print(len(new_df[new_df['labels']==-1]))
print(len(new_df[new_df['labels']==1]))

7106
7106


In [None]:
# Persist the class-balanced training set.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; readers get it back as 'Unnamed: 0' unless they pass index_col=0.
balanced_csv_path = '/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/data/training/combined_balance_ridlife_indonlu_affan.csv'
new_df.to_csv(balanced_csv_path)

In [None]:
# Persist the full (not class-balanced) training set.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; readers get it back as 'Unnamed: 0' unless they pass index_col=0.
nonbalanced_csv_path = '/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/data/training/combined_nonbalance_ridlife_indonlu_affan.csv'
combined_df.to_csv(nonbalanced_csv_path)

In [3]:
# Reload the balanced training set and the validation set from Drive.
training_csv_path = '/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/data/training/combined_balance_ridlife_indonlu_affan.csv'
validation_csv_path = '/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/data/validation/validation_test.csv'

df_training = pd.read_csv(training_csv_path)
df_validation = pd.read_csv(validation_csv_path)

In [4]:
# Overlap check: the inner join on 'tweet' keeps only tweets whose text appears
# in both frames, so an empty result means no exact-text overlap between the
# training and validation sets.
df_training.merge(df_validation, on='tweet', how='inner')

Unnamed: 0.1,Unnamed: 0,tweet,labels_x,labels_y
