# Training the data

## Separate the one with keyword and the one that dont

1. change from xlsx to csv
2. remove line break in each cell 
3. format dots and commas to have spaces (for tokenizer purpose)

In [29]:
import pandas as pd

# Load the provided Excel file
file_path = 'excel_files/tweets_google_translated_fixmat.xlsx'
df = pd.read_excel(file_path)

# Remove new lines in 'tweet-body-translated-corrected' column and replace them with space
df['tweet-body-translated-corrected'] = df['tweet-body-translated-corrected'].replace('\n', ' ', regex=True)

# Regular expression to ensure spaces before and after dots and commas
df['tweet-body-translated-corrected'] = df['tweet-body-translated-corrected'].str.replace(r"(\.)", r" . ", regex=True)
df['tweet-body-translated-corrected'] = df['tweet-body-translated-corrected'].str.replace(r"(,)", r" , ", regex=True)

# Remove potential multiple spaces caused by the replacements
df['tweet-body-translated-corrected'] = df['tweet-body-translated-corrected'].str.replace(r"\s+", " ", regex=True)

# Select only the 'tweet-body-translated-corrected' column without header
df_translated = df['tweet-body-translated-corrected']

# Save the modified data to a CSV file
csv_file_path = 'excel_files/tweets_raw.csv'
df_translated.to_csv(csv_file_path, index=False, header=False)

Find the row with the filter words. mostly keywords I want to emphasize

In [30]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

# Initialize the Porter Stemmer
ps = PorterStemmer()

# Load the dataset
file_path = 'excel_files/tweets_raw.csv'
df = pd.read_csv(file_path)

# Specify the filter words
filter_words = ['direct', 'actor','fattah','amin', 'beto','cinematography', 'visuals'
                'perform','cast','act', 'shots','imagery', 'story', 'plot',
                'narrative', 'script', 'storytelling', 'special effects', "CGI", 
                "visual effects", "effects", 'music', 'sound', 'soundtrack', 'score',
                'audio', 'emotional', 'touching', 'moving', 'feelings', 'historical accuracy',
                'history', 'accurate', 'cultural representation', 'culture', 'production', 
                'costumes', 'costume design', 'set design', 'set', 'design', 'props', 'props design',
                'production', 'quality', 'experience', 'impression']

# Function to stem the words in a sentence
def stem_sentence(sentence):
    words = word_tokenize(sentence)
    stemmed_words = [ps.stem(word) for word in words]
    return ' '.join(stemmed_words)

# Update filter words with their stems
stemmed_filter_words = [ps.stem(word) for word in filter_words]

# Function to check if any stemmed filter word is in the stemmed sentence
def contains_stemmed_word(sentence, stemmed_filter_words):
    stemmed_sentence = stem_sentence(sentence)
    return any(re.search(r'\b' + re.escape(word) + r'\b', stemmed_sentence) for word in stemmed_filter_words)

# Apply stemming filter to the dataframe
df_with_filter = df[df.apply(lambda row: row.astype(str).apply(lambda x: contains_stemmed_word(x, stemmed_filter_words)).any(), axis=1)]
df_without_filter = df[~df.apply(lambda row: row.astype(str).apply(lambda x: contains_stemmed_word(x, stemmed_filter_words)).any(), axis=1)]

# Save the filtered dataframes to new CSV files
filtered_file_path = 'excel_files/filtered_rows.csv'
non_filtered_file_path = 'excel_files/non_filtered_rows.csv'

df_with_filter.to_csv(filtered_file_path, index=False)
df_without_filter.to_csv(non_filtered_file_path, index=False)

In [31]:
# To make it easier to find the aspect for each row
def find_stemmed_words(sentence, stemmed_filter_words):
    stemmed_sentence = stem_sentence(sentence)
    # Find all filter words in the sentence
    found_words = [word for word in stemmed_filter_words if re.search(r'\b' + re.escape(word) + r'\b', stemmed_sentence)]
    return ', '.join(found_words)

# Apply the modified function to the dataframe
df_with_filter['Matched_Words'] = df.apply(lambda row: ', '.join(row.astype(str).apply(lambda x: find_stemmed_words(x, stemmed_filter_words)).unique()), axis=1)

# Filter out rows with no matched words (empty strings in the new column)
df_with_filter = df_with_filter[df_with_filter['Matched_Words'] != '']

# Save the modified dataframe to a new CSV file
filtered_file_path_with_matched = 'excel_files/filtered_rows_with_matched.csv'
df_with_filter.to_csv(filtered_file_path_with_matched, index=False)

filtered_file_path_with_matched

'filtered_rows_with_matched.csv'

In [35]:
import pandas as pd

file_path = 'custom.apc.train.txt'
with open(file_path, 'r') as file:
    lines = file.readlines()

# Lists to store valid and invalid lines
valid_lines = []
invalid_sentences = []

# Possible polarities
polarities = ["Positive", "Negative", "Neutral"]

# Process the file, adjusting for removed lines
for i in range(0, len(lines)):
    if i + 2 < len(lines):  # Check if there are enough lines left for a complete set
        sentence = lines[i].strip()
        aspect = lines[i + 1].strip() if i + 1 < len(lines) else ""
        polarity = lines[i + 2].strip() if i + 2 < len(lines) else ""

        # Check if the sentence does not contain $T$ and is not an aspect or polarity
        if '$T$' in sentence and aspect and polarity in polarities:
            valid_lines.extend([sentence + "\n", aspect + "\n", polarity + "\n"])
        elif '$T$' not in sentence and sentence not in polarities and len(sentence.split()) > 3:
            invalid_sentences.append(sentence)

# Writing the valid lines to a text file
valid_file_path = 'valid_lines.txt'
with open(valid_file_path, 'w') as valid_file:
    valid_file.writelines(valid_lines)

# Writing the invalid sentences to a CSV file
invalid_df = pd.DataFrame({'Invalid Sentences': invalid_sentences})
invalid_csv_path = 'invalid_sentences.csv'
invalid_df.to_csv(invalid_csv_path, index=False)

Auto-anotate the one without keywords

In [38]:
from pyabsa import make_ABSA_dataset 
# refer to the comments in this function for detailed usage
make_ABSA_dataset(dataset_name_or_path=r'C:\Users\Haziq\Python Notebooks\FYP\excel_files\non_filtered_rows.csv', checkpoint='english')

[2024-01-12 22:22:31] (2.4.0) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.0 (this version)[0m **********
[2024-01-12 22:22:31] (2.4.0) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.0 (this version)[0m **********
[2024-01-12 22:22:31] (2.4.0) [32mDownloading checkpoint:english [0m
[2024-01-12 22:22:31] (2.4.0) [31mNotice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets[0m
[2024-01-12 22:22:31] (2.4.0) Checkpoint already downloaded, skip
[2024-01-12 22:22:32] (2.4.0) Load aspect extractor from checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43
[2024-01-12 22:22:32] (2.4.0) config: checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43\fast_lcf_atepc.config
[2024-01-12 22:22:32] (2.4.0) state_dict: checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_at

preparing ate inference dataloader: 100%|██████████| 1167/1167 [00:03<00:00, 386.19it/s]
extracting aspect terms: 100%|██████████| 37/37 [00:35<00:00,  1.04it/s]
preparing apc inference dataloader: 100%|██████████| 735/735 [00:02<00:00, 323.68it/s]
classifying aspect sentiments: 100%|██████████| 23/23 [00:22<00:00,  1.03it/s]

[2024-01-12 22:23:40] (2.4.0) The results of aspect term extraction have been saved in c:\Users\Haziq\Python Notebooks\FYP\Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2024-01-12 22:23:40] (2.4.0) Example 0: " Invite the people to boycott the upcoming box office movies that will be released soon . Don ' t forget that the movie industry is also their biggest contributor , what ' s more , the movie Wonder Woman clearly shows the perpetrators of the Jewish race . Try to see the movie in the Mat Kilau country box office in our country alone , how much has it collected . "
[2024-01-12 22:23:40] (2.4.0) Example 1: " JUST LAST YEAR THE MOVIE MAT KILAU OI CAME OUT ! LIKE OR HATE THAT MOVIE IT ' S A MOVIE LITERALLY ABOUT OUR ANCESTORS FIGHTING BRITISH COLONIZERS , APA BEBAL NAK NEUTRAL2 WHEN A LITERAL GENOCIDE IS HAPPENING ! THOSE WHO SUPPORT THE COLONIST AGAIN CAN REMOVE IT ! PTUI ! "
[2024-01-12 22:23:40] (2.4.0) Example 2: Watched Mat Kilau for Friday night 




In [15]:
from pyabsa.utils import convert_apc_set_to_atepc_set
from pyabsa import ABSADatasetList

convert_apc_set_to_atepc_set(
    # "integrated_datasets/apc_datasets/100.CustomDataset/custom.apc.train.txt",
    # "excel_files/non_filtered_rows.csv.apc"
    # 'custom.apc.train.txt'
    # 'valid_lines.txt',
    'excel_files/filtered_rows.csv.apc'
)  # for custom datasets, absolute path recommended for this function

[2024-01-13 19:38:01] (2.4.0) To ensure your conversion is successful, make sure the dataset name contain "apc" and "dataset" string 
[2024-01-13 19:38:01] (2.4.0) Find datasets files at excel_files/filtered_rows.csv.apc:
[2024-01-13 19:38:01] (2.4.0) coverting excel_files/filtered_rows.csv.apc to excel_files/filtered_rows.csv.apc.atepc
[2024-01-13 19:38:01] (2.4.0) Ignore Error: Invite the people to boycott the upcoming box office movies that will be released soon . Don ' t forget that the movie industry is also their biggest contributor , what ' s more , the movie Wonder Woman clearly shows the perpetrators of the Jewish race . Try to see the movie in the Mat Kilau country box office in our country alone , how much has it collected .
[2024-01-13 19:38:01] (2.4.0) Ignore Error: MALAYSIANS remember when we were all obsessed over Mat Kilau ( themovie ) ? especially the girls when we thought some of the 6 casts were
[2024-01-13 19:38:01] (2.4.0) Ignore Error: Negative
[2024-01-13 19:38:0

## Training

In [10]:
from pyabsa import ModelSaveOption, DeviceTypeOption
from pyabsa import AspectTermExtraction as ATEPC
import warnings

config = (
    ATEPC.ATEPCConfigManager.get_atepc_config_english()
)  # this config contains 'pretrained_bert', it is based on pretrained models

warnings.filterwarnings("ignore")

config.batch_size = 16
config.patience = 2
config.log_step = -1
config.seed = [1]
config.verbose = False  # If verbose == True, PyABSA will output the model strcture and seversal processed data examples
config.notice = (
    "This is an training example for aspect term extraction"  # for memos usage
)

dataset = "100.CustomDataset"
# dataset = "129.Kaggle"

In [13]:
config.model = ATEPC.ATEPCModelList.FAST_LCF_ATEPC  # improved version of LCF-ATEPC

trainer = ATEPC.ATEPCTrainer(
    config=config,
    dataset=dataset,
    from_checkpoint="",  # if you want to resume training from our pretrained checkpoints, you can pass the checkpoint name here
    auto_device=DeviceTypeOption.AUTO,  # use cuda if available
    checkpoint_save_mode=ModelSaveOption.SAVE_MODEL_STATE_DICT,  # save state dict only instead of the whole model
    path_to_save="state_dict_model_FACT_LCF_ATEPC",  # save model state dict to this path
    load_aug=False,  # there are some augmentation dataset for integrated datasets, you use them by setting load_aug=True to improve performance
)

[2024-01-13 19:35:08] (2.4.0) Set Model Device: cuda:0
[2024-01-13 19:35:08] (2.4.0) Device Name: NVIDIA GeForce RTX 3060 Ti
2024-01-13 19:35:08,742 INFO: PyABSA version: 2.4.0
2024-01-13 19:35:08,743 INFO: Transformers version: 4.36.2
2024-01-13 19:35:08,744 INFO: Torch version: 2.1.2+cuda11.8
2024-01-13 19:35:08,745 INFO: Device: NVIDIA GeForce RTX 3060 Ti
2024-01-13 19:35:08,803 INFO: Searching dataset 100.CustomDataset in local disk


ValueError: Task ATEPC is not supported for auto-augment

In [3]:
config.model = ATEPC.ATEPCModelList.LCF_ATEPC

trainer = ATEPC.ATEPCTrainer(
    config=config,
    dataset=dataset,
    from_checkpoint=None,  # if you want to resume training from our pretrained checkpoints, you can pass the checkpoint name here
    auto_device=DeviceTypeOption.AUTO,  # use cuda if available
    checkpoint_save_mode=ModelSaveOption.SAVE_MODEL_STATE_DICT,  # save state dict only instead of the whole model
    path_to_save="state_dict_model_FACT_LCF_ATEPC",  # save model state dict to this path
    load_aug=False,  # there are some augmentation dataset for integrated datasets, you use them by setting load_aug=True to improve performance
)

[2024-01-13 18:09:16] (2.4.0) Set Model Device: cuda:0
[2024-01-13 18:09:16] (2.4.0) Device Name: NVIDIA GeForce RTX 3060 Ti
2024-01-13 18:09:17,166 INFO: PyABSA version: 2.4.0
2024-01-13 18:09:17,166 INFO: Transformers version: 4.36.2
2024-01-13 18:09:17,167 INFO: Torch version: 2.1.2+cuda11.8
2024-01-13 18:09:17,167 INFO: Device: NVIDIA GeForce RTX 3060 Ti
2024-01-13 18:09:17,177 INFO: Searching dataset 100.CustomDataset in local disk
2024-01-13 18:09:17,526 INFO: You can set load_aug=True in a trainer to augment your dataset (English only yet) and improve performance.
2024-01-13 18:09:17,527 INFO: Please use a new folder to perform new text augment if the former augment in integrated_datasets\atepc_datasets\100.CustomDataset errored unexpectedly


convert examples to features: 100%|██████████| 852/852 [00:02<00:00, 290.13it/s]

2024-01-13 18:09:22,600 INFO: Dataset Label Details: {'Positive': 210, 'Negative': 390, 'Neutral': 252, 'Sum': 852}



convert examples to features: 100%|██████████| 234/234 [00:00<00:00, 277.02it/s]

2024-01-13 18:09:24,092 INFO: Dataset Label Details: {'Positive': 58, 'Negative': 96, 'Neutral': 80, 'Sum': 234}





2024-01-13 18:09:25,177 INFO: Save cache dataset to lcf_atepc.custom_dataset.dataset.d8bd1758139a510e5fa3e214dcbabbacfc940e289b62afe0c7f08167e5dc12a8.cache
2024-01-13 18:09:26,152 INFO: cuda memory allocated:764963840
2024-01-13 18:09:26,153 INFO: ABSADatasetsVersion:None	-->	Calling Count:0
2024-01-13 18:09:26,153 INFO: IOB_label_to_index:{'B-ASP': 1, 'I-ASP': 2, 'O': 3, '[CLS]': 4, '[SEP]': 5}	-->	Calling Count:1
2024-01-13 18:09:26,155 INFO: MV:<metric_visualizer.metric_visualizer.MetricVisualizer object at 0x000002829409C550>	-->	Calling Count:0
2024-01-13 18:09:26,155 INFO: PyABSAVersion:2.4.0	-->	Calling Count:1
2024-01-13 18:09:26,157 INFO: SRD:3	-->	Calling Count:2172
2024-01-13 18:09:26,158 INFO: TorchVersion:2.1.2+cuda11.8	-->	Calling Count:1
2024-01-13 18:09:26,158 INFO: TransformersVersion:4.36.2	-->	Calling Count:1
2024-01-13 18:09:26,159 INFO: auto_device:True	-->	Calling Count:3
2024-01-13 18:09:26,159 INFO: batch_size:16	-->	Calling Count:4
2024-01-13 18:09:26,160 INFO:

  0%|          | 0/54 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch:  0| loss_apc:1.0194 | loss_ate:0.2012 |: 100%|██████████| 54/54 [04:17<00:00,  4.78s/it,  APC_ACC: 44.87(max:44.87) | APC_F1: 29.76(max:29.76) | ATE_F1: 20.96(max:20.96)]
Epoch:  1| loss_apc:1.0154 | loss_ate:0.1272 |: 100%|██████████| 54/54 [04:12<00:00,  4.68s/it,  APC_ACC: 43.59(max:44.87) | APC_F1: 30.10(max:30.10) | ATE_F1: 56.23(max:56.23)]
Epoch:  2| loss_apc:1.0022 | loss_ate:0.0791 |: 100%|██████████| 54/54 [04:15<00:00,  4.74s/it,  APC_ACC: 50.00(max:50.00) | APC_F1: 36.08(max:36.08) | ATE_F1: 61.78(max:61.78)]
Epoch:  3| loss_apc:0.7035 | loss_ate:0.1665 |: 100%|██████████| 54/54 [03:35<00:00,  4.00s/it,  APC_ACC: 52.14(max:52.14) | APC_F1: 48.29(max:48.29) | ATE_F1: 62.34(max:62.34)]
Epoch:  4| loss_apc:0.2442 | loss_ate:0.0722 |: 1

2024-01-13 18:36:43,268 INFO: 
------------------------------------------------------------------ Raw Metric Records ------------------------------------------------------------------
╒════════════════════════════════╤════════════════════════════════════════════════════╤══════════╤═══════════╤══════════╤═══════╤═══════╤═══════╤═══════╕
│ Metric                         │ Trial                                              │ Values   │  Average  │  Median  │  Std  │  IQR  │  Min  │  Max  │
╞════════════════════════════════╪════════════════════════════════════════════════════╪══════════╪═══════════╪══════════╪═══════╪═══════╪═══════╪═══════╡
│ Max-APC-Test-Acc w/o Valid Set │ lcf_atepc-custom_dataset-microsoft/deberta-v3-base │ [66.24]  │   66.24   │  66.24   │   0   │   0   │ 66.24 │ 66.24 │
├────────────────────────────────┼────────────────────────────────────────────────────┼──────────┼───────────┼──────────┼───────┼───────┼───────┼───────┤
│ Max-APC-Test-F1 w/o Valid Set  │ lcf_atepc-c

In [8]:
# train using lcf-atepc-bert
config.model = ATEPC.ATEPCModelList.BERT_BASE_ATEPC

trainer = ATEPC.ATEPCTrainer(
    config=config,
    dataset=dataset,
    from_checkpoint=None,  # if you want to resume training from our pretrained checkpoints, you can pass the checkpoint name here
    auto_device=DeviceTypeOption.AUTO,  # use cuda if available
    checkpoint_save_mode=ModelSaveOption.SAVE_MODEL_STATE_DICT,  # save state dict only instead of the whole model
    path_to_save="state_dict_model_LCF_ATEPC_BERT",  # save model state dict to this path
    load_aug=False,  # there are some augmentation dataset for integrated datasets, you use them by setting load_aug=True to improve performance
)

[2024-01-13 18:51:26] (2.4.0) Set Model Device: cuda:0
[2024-01-13 18:51:26] (2.4.0) Device Name: NVIDIA GeForce RTX 3060 Ti
2024-01-13 18:51:27,103 INFO: PyABSA version: 2.4.0
2024-01-13 18:51:27,103 INFO: Transformers version: 4.36.2
2024-01-13 18:51:27,104 INFO: Torch version: 2.1.2+cuda11.8
2024-01-13 18:51:27,104 INFO: Device: NVIDIA GeForce RTX 3060 Ti
2024-01-13 18:51:27,118 INFO: Searching dataset 100.CustomDataset in local disk
2024-01-13 18:51:27,280 INFO: You can set load_aug=True in a trainer to augment your dataset (English only yet) and improve performance.
2024-01-13 18:51:27,281 INFO: Please use a new folder to perform new text augment if the former augment in integrated_datasets\atepc_datasets\100.CustomDataset errored unexpectedly


convert examples to features: 100%|██████████| 852/852 [00:02<00:00, 293.49it/s]

2024-01-13 18:51:32,169 INFO: Dataset Label Details: {'Positive': 210, 'Negative': 390, 'Neutral': 252, 'Sum': 852}



convert examples to features: 100%|██████████| 234/234 [00:00<00:00, 256.13it/s]

2024-01-13 18:51:33,558 INFO: Dataset Label Details: {'Positive': 58, 'Negative': 96, 'Neutral': 80, 'Sum': 234}





2024-01-13 18:51:34,438 INFO: Save cache dataset to bert_base_atepc.custom_dataset.dataset.ab6b65c2af168aead4f04b319b974661fb4c42b8df8156c4ae463fb28d1861b1.cache
2024-01-13 18:51:34,712 INFO: cuda memory allocated:788294656
2024-01-13 18:51:34,712 INFO: ABSADatasetsVersion:None	-->	Calling Count:0
2024-01-13 18:51:34,713 INFO: IOB_label_to_index:{'B-ASP': 1, 'I-ASP': 2, 'O': 3, '[CLS]': 4, '[SEP]': 5}	-->	Calling Count:2
2024-01-13 18:51:34,713 INFO: MV:<metric_visualizer.metric_visualizer.MetricVisualizer object at 0x00000282983C0370>	-->	Calling Count:4
2024-01-13 18:51:34,714 INFO: PyABSAVersion:2.4.0	-->	Calling Count:2
2024-01-13 18:51:34,715 INFO: SRD:3	-->	Calling Count:2874
2024-01-13 18:51:34,715 INFO: TorchVersion:2.1.2+cuda11.8	-->	Calling Count:2
2024-01-13 18:51:34,716 INFO: TransformersVersion:4.36.2	-->	Calling Count:2
2024-01-13 18:51:34,716 INFO: auto_device:True	-->	Calling Count:97
2024-01-13 18:51:34,717 INFO: batch_size:16	-->	Calling Count:10
2024-01-13 18:51:34,7

Epoch:  0| loss_apc:1.1288 | loss_ate:0.1962 |: 100%|██████████| 54/54 [01:08<00:00,  1.26s/it,  APC_ACC: 41.03(max:41.03) | APC_F1: 19.39(max:20.10) | ATE_F1: 0.48(max:0.48)]
Epoch:  1| loss_apc:0.9891 | loss_ate:0.1222 |: 100%|██████████| 54/54 [01:08<00:00,  1.26s/it,  APC_ACC: 38.89(max:41.03) | APC_F1: 21.75(max:21.75) | ATE_F1: 59.31(max:59.31)]
Epoch:  2| loss_apc:1.0998 | loss_ate:0.1179 |: 100%|██████████| 54/54 [00:56<00:00,  1.05s/it,  APC_ACC: 38.89(max:41.03) | APC_F1: 21.75(max:21.75) | ATE_F1: 59.93(max:62.27)]
Epoch:  3| loss_apc:1.1622 | loss_ate:0.2108 |: 100%|██████████| 54/54 [01:04<00:00,  1.19s/it,  APC_ACC: 47.01(max:50.43) | APC_F1: 34.25(max:38.31) | ATE_F1: 65.09(max:65.09)]
Epoch:  4| loss_apc:0.7327 | loss_ate:0.0707 |: 100%|██████████| 54/54 [00:59<00:00,  1.10s/it,  APC_ACC: 64.10(max:64.10) | APC_F1: 62.58(max:62.58) | ATE_F1: 60.11(max:65.09)]
Epoch:  5| loss_apc:0.9264 | loss_ate:0.0940 |: 100%|██████████| 54/54 [00:54<00:00,  1.01s/it,  APC_ACC: 61.11(

2024-01-13 19:01:02,025 INFO: 
--------------------------------------------------------------------- Raw Metric Records ---------------------------------------------------------------------
╒════════════════════════════════╤══════════════════════════════════════════════════════════╤══════════╤═══════════╤══════════╤═══════╤═══════╤═══════╤═══════╕
│ Metric                         │ Trial                                                    │ Values   │  Average  │  Median  │  Std  │  IQR  │  Min  │  Max  │
╞════════════════════════════════╪══════════════════════════════════════════════════════════╪══════════╪═══════════╪══════════╪═══════╪═══════╪═══════╪═══════╡
│ Max-APC-Test-Acc w/o Valid Set │ fast_lcf_atepc-custom_dataset-microsoft/deberta-v3-base  │ [53.52]  │   53.52   │  53.52   │   0   │   0   │ 53.52 │ 53.52 │
├────────────────────────────────┼──────────────────────────────────────────────────────────┼──────────┼───────────┼──────────┼───────┼───────┼───────┼───────┤
│ Max-APC-

In [71]:
aspect_extractor = trainer.load_trained_model()
assert isinstance(aspect_extractor, ATEPC.AspectExtractor)

[2024-01-13 00:09:29] (2.4.0) Load aspect extractor from state_dict_model_FACT_LCF_ATEPC/fast_lcf_atepc_custom_dataset_cdw_apcacc_61.54_apcf1_61.44_atef1_69.33/
[2024-01-13 00:09:29] (2.4.0) config: state_dict_model_FACT_LCF_ATEPC/fast_lcf_atepc_custom_dataset_cdw_apcacc_61.54_apcf1_61.44_atef1_69.33/fast_lcf_atepc.config
[2024-01-13 00:09:29] (2.4.0) state_dict: state_dict_model_FACT_LCF_ATEPC/fast_lcf_atepc_custom_dataset_cdw_apcacc_61.54_apcf1_61.44_atef1_69.33/fast_lcf_atepc.state_dict
[2024-01-13 00:09:29] (2.4.0) model: None
[2024-01-13 00:09:29] (2.4.0) tokenizer: state_dict_model_FACT_LCF_ATEPC/fast_lcf_atepc_custom_dataset_cdw_apcacc_61.54_apcf1_61.44_atef1_69.33/fast_lcf_atepc.tokenizer
[2024-01-13 00:09:29] (2.4.0) Set Model Device: cuda:0
[2024-01-13 00:09:29] (2.4.0) Device Name: NVIDIA GeForce RTX 3060 Ti


In [44]:
atepc_examples = [
    "MALAYSIANS remember when we all obsessed over Mat Kilau (the movie)? especially the girls when we thought some of the 6 casts were attractive",
    "His camera work is not like the style I did in the Mat Kilau film, and it is not like the Kingsman style of camera work. I don’t remember what movie it was, but it’s not like a typical mission impossible movie or common action movie camera work, it’s different.",
    "the story line is so good and the actors are so good looking",
]
# predict interface accepts a list of example or a single example
for ex in atepc_examples:
    result = aspect_extractor.predict(
        text=ex,
        print_result=True,
        ignore_error=True,  # ignore an invalid example, if it is False, invalid examples will raise Exceptions
        eval_batch_size=32,
    )

[2024-01-12 22:38:53] (2.4.0) The results of aspect term extraction have been saved in c:\Users\Haziq\Python Notebooks\FYP\Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2024-01-12 22:38:53] (2.4.0) Example 0: MALAYSIANS remember when we all obsessed over Mat Kilau ( the movie ) ? especially the girls when we thought some of the 6 [32m<casts:Positive Confidence:0.8485>[0m were attractive
[2024-01-12 22:38:54] (2.4.0) The results of aspect term extraction have been saved in c:\Users\Haziq\Python Notebooks\FYP\Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2024-01-12 22:38:54] (2.4.0) Example 0: His [31m<camera work:Negative Confidence:0.7216>[0m is not like the style I did in the Mat Kilau film , and it is not like the Kingsman style of camera work . I don ’ t remember what movie it was , but it ’ s not like a typical mission impossible movie or common action movie camera work , it ’ s different .
[2024-01-12 22:38:55] 

In [78]:
# Load the state dict
state_dict_path = 'state_dict_model_FACT_LCF_ATEPC/fast_lcf_atepc_custom_dataset_cdw_apcacc_61.54_apcf1_61.44_atef1_69.33/'

# Load the model using the checkpoint manager
aspect_classifier = ATEPC.AspectExtractor(state_dict_path, auto_device=True )

# inference_source = [examples]
atepc_result = aspect_classifier.predict(
                        text="MALAYSIANS remember when we all obsessed over Mat Kilau (the movie)? especially the girls when we thought some of the 6 casts were attractive",
                        print_result=True,
                        ignore_error=True,  # Predict the sentiment of extracted aspect terms
                    )

# Convert result to data frame for better visual in Streamlit
processed_result = []
print(atepc_result)
for item in atepc_result:
    print(f"item:{item}")
    for aspect, sentiment, confidence in zip(item['aspect'], item['sentiment'], item['confidence']):
        processed_result.append({'Aspect': aspect, 'Sentiment': sentiment, 'Confidence': confidence})

df = pd.DataFrame(processed_result)

# Display the result as a table
print(df)

[2024-01-13 00:16:12] (2.4.0) Load aspect extractor from state_dict_model_FACT_LCF_ATEPC/fast_lcf_atepc_custom_dataset_cdw_apcacc_61.54_apcf1_61.44_atef1_69.33/
[2024-01-13 00:16:12] (2.4.0) config: state_dict_model_FACT_LCF_ATEPC/fast_lcf_atepc_custom_dataset_cdw_apcacc_61.54_apcf1_61.44_atef1_69.33/fast_lcf_atepc.config
[2024-01-13 00:16:12] (2.4.0) state_dict: state_dict_model_FACT_LCF_ATEPC/fast_lcf_atepc_custom_dataset_cdw_apcacc_61.54_apcf1_61.44_atef1_69.33/fast_lcf_atepc.state_dict
[2024-01-13 00:16:12] (2.4.0) model: None
[2024-01-13 00:16:12] (2.4.0) tokenizer: state_dict_model_FACT_LCF_ATEPC/fast_lcf_atepc_custom_dataset_cdw_apcacc_61.54_apcf1_61.44_atef1_69.33/fast_lcf_atepc.tokenizer
[2024-01-13 00:16:12] (2.4.0) Set Model Device: cuda:0
[2024-01-13 00:16:12] (2.4.0) Device Name: NVIDIA GeForce RTX 3060 Ti
[2024-01-13 00:16:16] (2.4.0) The results of aspect term extraction have been saved in c:\Users\Haziq\Python Notebooks\FYP\Aspect Term Extraction and Polarity Classifica

TypeError: string indices must be integers

### Use the model for dataset and find out the aspect frequency and polarity

In [81]:
aspect_extractor.batch_predict(
    target_file='excel_files/tweets_raw.csv',
    print_result=True,
    save_result=True,
    ignore_error=True,
    eval_batch_size=32,
)

[2024-01-13 01:04:25] (2.4.0) loading: excel_files/tweets_raw.csv


preparing ate inference dataloader: 100%|██████████| 1352/1352 [00:03<00:00, 422.11it/s]
extracting aspect terms: 100%|██████████| 43/43 [00:11<00:00,  3.70it/s]
preparing apc inference dataloader: 100%|██████████| 513/513 [00:01<00:00, 265.70it/s]
classifying aspect sentiments: 100%|██████████| 17/17 [00:05<00:00,  3.30it/s]


[2024-01-13 01:04:48] (2.4.0) The results of aspect term extraction have been saved in c:\Users\Haziq\Python Notebooks\FYP\Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2024-01-13 01:04:48] (2.4.0) Example 0: " Invite the people to boycott the upcoming box office movies that will be released soon . Don ' t forget that the movie industry is also their biggest contributor , what ' s more , the movie Wonder Woman clearly shows the perpetrators of the Jewish race . Try to see the movie in the Mat Kilau country box office in our country alone , how much has it collected . "
[2024-01-13 01:04:48] (2.4.0) Example 1: " JUST LAST YEAR THE MOVIE MAT KILAU OI CAME OUT ! LIKE OR HATE THAT MOVIE IT ' S A MOVIE LITERALLY ABOUT OUR ANCESTORS FIGHTING BRITISH COLONIZERS , APA BEBAL NAK NEUTRAL2 WHEN A LITERAL GENOCIDE IS HAPPENING ! THOSE WHO SUPPORT THE COLONIST AGAIN CAN REMOVE IT ! PTUI ! "
[2024-01-13 01:04:48] (2.4.0) Example 2: Watched Mat Kilau for Friday night 

KeyboardInterrupt: 

In [None]:
import json
import pandas as pd

# Path to your JSON file
file_path = 'Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json'

# Read the file
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract aspects and their polarities
aspects_with_polarity = []
extracted_data = []

for entry in data:
    # Extract aspects and their polarities
    if entry.get("aspect")!=[]:

        aspect_terms = entry.get("aspect")
        aspect_polarities = entry.get("sentiment")

        for term, polarity in zip(aspect_terms, aspect_polarities):
            extracted_data.append({'Aspect': term.lower(), 'Polarity': polarity})

# Convert to DataFrame
df_aspects = pd.DataFrame(extracted_data)

# Save the DataFrame to a CSV file
output_csv_path = 'excel_files/aspects_with_polarity.csv'
df_aspects.to_csv(output_csv_path, index=False)


## Vader (Sentence level sentiment analysis)

In [5]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

examples= "His camera work is not like the style I did in the Mat Kilau film, and it is not like the Kingsman style of camera work. I don’t remember what movie it was, but it’s not like a typical mission impossible movie or common action movie camera work, it’s different."
# vader_scores = sid.polarity_scores(examples)
# vader_sentiment = 'Positive' if vader_scores['compound'] >= 0.05 else 'Negative' if vader_scores['compound'] <= -0.05 else 'Neutral'

# Displaying VADER results
vader_scores = sid.polarity_scores(examples)
vader_sentiment = 'Positive' if vader_scores['compound'] >= 0.05 else 'Negative' if vader_scores['compound'] <= -0.05 else 'Neutral'
# print(f"Overall Sentiment (VADER): {vader_sentiment}")
# print(f"VADER Scores: {vader_scores}")

Overall Sentiment (VADER): Negative
VADER Scores: {'neg': 0.116, 'neu': 0.884, 'pos': 0.0, 'compound': -0.5824}


In [51]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Load the CSV file
file_path = 'excel_files/tweets_raw.csv'
df = pd.read_csv(file_path, header=None)

# Initialize the Sentiment Intensity Analyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Function to apply VADER analysis
def analyze_sentiment(tweet):
    scores = sid.polarity_scores(tweet)
    return scores

# Analyzing the first column (assuming it contains the tweets)
df['VADER_Scores'] = df[0].apply(analyze_sentiment)

# Splitting the dictionary into separate columns
df[['VADER_neg', 'VADER_neu', 'VADER_pos', 'VADER_compound']] = df['VADER_Scores'].apply(pd.Series)
df['VADER_Sentiment'] = df['VADER_compound'].apply(lambda x: 'Positive' if x >= 0.05 else 'Negative' if x <= -0.05 else 'Neutral')

# Optionally, you can drop the 'VADER_Scores' column if it's no longer needed
df.drop(columns=['VADER_Scores'], inplace=True)

# Save the updated DataFrame to a new CSV file
output_file_path = 'excel_files/tweets_polarity.csv'
df.to_csv(output_file_path, index=False)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Haziq\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
