# Importing Data

## Dataset
https://huggingface.co/datasets/pythainlp/wisesight_sentiment

In [None]:
import pandas as pd

# Parquet shard for each split, relative to the dataset repository root.
splits = {
    'train': 'wisesight_sentiment/train-00000-of-00001.parquet',
    'validation': 'wisesight_sentiment/validation-00000-of-00001.parquet',
    'test': 'wisesight_sentiment/test-00000-of-00001.parquet',
}

# Only the training split is loaded. Integer codes 0/1/2 are renamed to
# pos/neu/neg, then every row whose category is still the integer 3 is removed.
df = (
    pd.read_parquet("hf://datasets/pythainlp/wisesight_sentiment/" + splits["train"])
    .assign(category=lambda frame: frame['category'].replace({0: "pos", 1: "neu", 2: "neg"}))
    .loc[lambda frame: frame['category'] != 3]
)
df

In [None]:
# Write the filtered training split to disk without the index column;
# 'utf-8-sig' prepends a BOM to the file.
df.to_csv(
    r"datasets\wisesight_sentiment.csv",
    encoding='utf-8-sig',
    index=False,
)

## Transformer
https://huggingface.co/poom-sci/WangchanBERTa-finetuned-sentiment

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the sequence-classification model are loaded from the
# same Hugging Face Hub checkpoint.
checkpoint = "poom-sci/WangchanBERTa-finetuned-sentiment"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Testing

In [None]:
import pandas as pd

## Opening DFs

In [None]:
# cleaning
def cleaning(df):
    """Drop 'q' rows, downsample 'neu' rows to one third, and drop NaN rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'sentiment' column. The index may contain duplicate
        labels (e.g. the result of ``pd.concat`` without ``ignore_index``).

    Returns
    -------
    pandas.DataFrame
        A new frame, in the original row order, containing
        * every row whose sentiment is neither 'q' nor 'neu', and
        * a fixed-seed (``random_state=42``) sample of one third of the
          'neu' rows,
        minus any row that has a missing value in any column.
        The input frame is not modified.
    """
    df = df[df['sentiment'] != 'q']

    # Boolean mask of the neutral rows as a plain NumPy array (missing
    # sentiments count as "not neutral").
    is_neu = (df['sentiment'] == 'neu').to_numpy(dtype=bool, na_value=False)

    # Sample row *positions* instead of index labels: this selects the same
    # rows as sampling the neutral sub-frame directly, but keeps working when
    # the index has duplicate labels and avoids writing into a filtered slice.
    neu_positions = pd.Series(range(len(df)))[is_neu]
    kept_positions = neu_positions.sample(frac=1/3, random_state=42).to_numpy()

    # Keep every non-neutral row plus the sampled neutral ones.
    keep = ~is_neu
    keep[kept_positions] = True

    return df[keep].dropna()

# new_df = cleaning(df3)

In [64]:
def _read_label_first_csv(path):
    """Read a CSV whose first two columns are (sentiment, text).

    The file's first line is consumed as the header (instead of being parsed
    as a data row and sliced off afterwards). Only the first two columns are
    kept; they are swapped and renamed so the result has the columns
    ['text', 'sentiment'].
    """
    frame = pd.read_csv(path, header=0).iloc[:, [1, 0]]
    frame.columns = ['text', 'sentiment']
    return frame


# The file's header line is replaced by the names given here (header=0 + names).
df1 = pd.read_csv(r'datasets\wisesight_sentiment.csv', names=['text', 'sentiment'], header=0)

# These two files store the label in column 0 and the text in column 1.
df2 = _read_label_first_csv(r'datasets\all_aug.csv')
df3 = _read_label_first_csv(r'datasets\all_df.csv')

In [65]:
# All three files are read the same way: tab-separated, no header line,
# two columns named 'text' and 'sentiment'.
tsv_layout = dict(sep='\t', names=['text', 'sentiment'], header=None)

amy = pd.read_csv(r'datasets\general_amy.csv', **tsv_layout)
review = pd.read_csv(r'datasets\review_shopping.csv', **tsv_layout)
tcas = pd.read_csv(r'datasets\tcas61.csv', **tsv_layout)

# Stack the three frames row-wise and show the label distribution.
lst = [amy, review, tcas]
df4 = pd.concat(lst)
df4['sentiment'].value_counts()

sentiment
neg    193
pos    148
Name: count, dtype: int64

## Concatenate DFs

In [66]:
pdLst = [df1, df2, df3, df4]

# Stack all sources, then: remove exact duplicate rows, remove rows labelled
# 'q', and remove rows with any missing value.
new_df = (
    pd.concat(pdLst)
    .drop_duplicates()
    .loc[lambda frame: frame['sentiment'] != 'q']
    .dropna()
)

new_df['sentiment'].value_counts()

sentiment
neu    14874
neg     6947
pos     4700
Name: count, dtype: int64

In [67]:
# Keep a fixed-seed half of the neutral rows and every non-neutral row.
is_neutral = new_df['sentiment'] == 'neu'
neu_sample = new_df[is_neutral].sample(frac=1/2, random_state=42)

# Recombine, shuffle all rows with the same seed, and renumber the index.
new_df = (
    pd.concat([new_df[~is_neutral], neu_sample])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(new_df['sentiment'].value_counts())

sentiment
neu    7437
neg    6947
pos    4700
Name: count, dtype: int64


## Save to CSV

In [None]:
# Label distribution of the concatenated tab-separated sources.
df4['sentiment'].value_counts()

In [None]:
# Same distribution as plain text on stdout.
print(df4['sentiment'].value_counts())

In [68]:
# Write the merged, rebalanced and shuffled frame to disk without the index
# column; 'utf-8-sig' prepends a BOM to the file.
new_df.to_csv(
    r"datasets\big_sample.csv",
    encoding='utf-8-sig',
    index=False,
)