In [None]:
!pip install pyabsa

In [None]:
import re

# Pre-compiled regular expressions.
URL_PATTERN = re.compile(r'https?://[^ ]+')  # "http://" or "https://" plus everything up to the next space
MENTIONS_PATTERN = re.compile(r'@[^ ]+')  # "@" plus the following run of non-space characters
HASHTAGS_PATTERN = re.compile(r'#[^ ]+')  # "#" plus the following run of non-space characters
SPECIAL_CHARS_PATTERN = re.compile(r'[^A-Za-zÀ-ž ]')  # any character outside A-Z, a-z, À-ž and the space
MULTIPLE_SPACES_PATTERN = re.compile(' +')  # one or more consecutive spaces

# Column names of the user table.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python into a single (wrong) column name.
USER_DATA_COLUMNS = ['id', 'created_at', 'description', 'updated_bio', 'translated_bio', 'location', 'location_city',
                     'location_state', 'location_country', 'public_metrics_followers_count',
                     'public_metrics_following_count', 'public_metrics_tweet_count', 'public_metrics_listed_count',
                     'verified', 'extracted_urls', 'extracted_mentions', 'extracted_hashtags', 'tweets_in_dataset',
                     'retweets_in_dataset']

# Column names of the tweet table.
TWEET_DATA_COLUMNS = ['id', 'author_id', 'text', 'updated_text', 'created_at', 'lang', 'public_metrics_retweet_count',
                      'public_metrics_reply_count', 'public_metrics_like_count', 'public_metrics_quote_count',
                      'extracted_urls', 'extracted_mentions', 'extracted_hashtags']

# Column names of the tweet-to-user mapping table.
TWEET_USER_MAP_COLUMNS = ['id', 'author_id', 'created_at', 'is_retweet']


In [None]:
import pandas as pd
import numpy as np
# from pyabsa import available_checkpoints, TaskCodeOption
# checkpoint_map = available_checkpoints(task_code=TaskCodeOption.Aspect_Term_Extraction_and_Classification, show_ckpts=True)

In [None]:
from pyabsa import ATEPCCheckpointManager

# Build the aspect extractor from the 'multilingual' checkpoint.
aspect_extractor = ATEPCCheckpointManager.get_aspect_extractor(
    checkpoint='multilingual',
    auto_device=True,  # False means load model on CPU
)



In [None]:
# Inference accepts a list of sentences or a DatasetItem from PyABSA.
examples = ['Das Personal war sehr unhöflich, aber das Essen war sehr lecker']
inference_source = examples
atepc_result = aspect_extractor.extract_aspect(
    inference_source=inference_source,
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
)

[2023-02-22 17:19:21] (2.0.28) The results of aspect term extraction have been saved in /content/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2023-02-22 17:19:21] (2.0.28) Example 0: Das <Personal:Negative Confidence:0.9934371113777161> war sehr unhöflich , aber das <Essen:Positive Confidence:0.9978693723678589> war sehr lecker


  lcf_cdm_vec = torch.tensor(
  float(x) for x in F.softmax(i_apc_logits).cpu().numpy().tolist()


In [None]:
from google.colab import drive

# Mount Google Drive, then read the tweet table from it. The CSV path is
# relative to the current working directory.
drive.mount('/content/drive')
df = pd.read_csv(filepath_or_buffer="drive/MyDrive/Colab Notebooks/Modified_Tweet_Data.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Narrow the frame step by step: PostDoc users -> German tweets -> rows whose
# cleaned text is not the "__NA__" placeholder.
temp = df[df['user_category'] == 'PostDoc']
temp = temp[temp['lang'] == 'de']
# The last step must filter `temp`, not `df`; filtering `df` here would throw
# away the category and language filters applied above.
temp = list(temp[temp['updated_text'] != "__NA__"]['updated_text'])

In [None]:
# Run aspect extraction over the filtered texts collected in `temp`.
inference_source = temp
atepc_result = aspect_extractor.extract_aspect(
    inference_source=inference_source,
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
)

In [None]:
# Preview the extraction results: the loop stops as soon as the 1-based
# counter reaches 20, so at most 19 results are printed.
i = 0
for i, output in enumerate(atepc_result, start=1):
    if i == 20:
        break
    if len(output['aspect']) == 0:
        # Flag texts without any aspect; the empty lists are still printed below.
        print("Empty")
    print(output['aspect'], output['sentiment'], sep="\n")

In [None]:
# Flatten the per-text results into two flat lists of aspect terms and
# sentiment labels.
# The former guard `len(output['aspect']) >= 0` was always true, so it has
# been dropped; extending with an empty list is a no-op anyway.
aspectsList = []
sentiList = []
for output in atepc_result:
    aspectsList.extend(output['aspect'])
    sentiList.extend(output['sentiment'])

In [None]:
# Install PyABSA pinned to version 1.16.27.
# NOTE(review): the recorded log output earlier in this notebook was produced
# by PyABSA 2.0.28; switching versions mid-session presumably needs a runtime
# restart before the next import — confirm.
!pip install pyabsa==1.16.27

In [None]:
import pandas as pd
import numpy as np
from pyabsa import ATEPCCheckpointManager

# Build the aspect extractor from the 'multilingual' checkpoint.
aspect_extractor = ATEPCCheckpointManager.get_aspect_extractor(
    checkpoint='multilingual',
    auto_device=True,  # False means load model on CPU
)


In [None]:
from google.colab import drive

# Mount Google Drive before reading from it: `drive` was imported here but
# never used, so this cell only worked when an earlier cell had already
# mounted the drive in the same runtime. Mounting again is harmless.
drive.mount('/content/drive')
df = pd.read_csv("drive/MyDrive/Colab Notebooks/Modified_Tweet_Data.csv")

In [None]:
# Inference accepts a list of sentences or a DatasetItem from PyABSA.
examples = ['Das Personal war sehr unhöflich, aber das Essen war sehr lecker']
atepc_result = aspect_extractor.extract_aspect(
    inference_source=examples,
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
)

The results of aspect term extraction have been saved in /content/atepc_inference.result.json
Example 0: Das <Personal:Negative Confidence:0.9996588230133057> war sehr unhöflich , aber das <Essen:Positive Confidence:0.999847412109375> war sehr lecker


  lcf_cdm_vec = torch.tensor([f.lcf_cdm_vec for f in infer_features], dtype=torch.float32)
  probs = [float(x) for x in F.softmax(i_apc_logits).cpu().numpy().tolist()]


In [None]:
# English PostDoc tweets whose cleaned text is not the "__NA__" placeholder,
# selected with one combined boolean mask.
temp = list(
    df[(df['user_category'] == "PostDoc")
       & (df['lang'] == "en")
       & (df['updated_text'] != "__NA__")]['updated_text']
)
len(temp)

44797

In [None]:
def absa(df, category, lang, aspect_extractor,
         output_dir="drive/MyDrive/Colab Notebooks/dataset/"):
    """Aggregate extracted aspect terms and their sentiments for one data slice.

    Rows of ``df`` are kept when ``user_category`` equals ``category``,
    ``lang`` equals ``lang`` and ``updated_text`` is not the ``"__NA__"``
    placeholder. The remaining texts are passed to
    ``aspect_extractor.extract_aspect`` and the returned aspect/sentiment
    lists are counted per aspect term.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``user_category``, ``lang`` and
        ``updated_text``. It is not modified.
    category : str
        Value to match in the ``user_category`` column; also used in the
        output file name.
    lang : str
        Value to match in the ``lang`` column; also used in the output file
        name.
    aspect_extractor
        Object with an ``extract_aspect(inference_source=..., pred_sentiment=...,
        print_result=...)`` method. Each returned item is indexed with the keys
        ``'aspect'`` and ``'sentiment'``.
    output_dir : str, optional
        Directory that receives ``<category>_<lang>.csv``. Defaults to the
        location this notebook has always written to. The directory is not
        created here, so a missing one (e.g. an unmounted drive) still
        raises instead of silently writing somewhere else.

    Returns
    -------
    pandas.DataFrame or None
        One row per aspect term with an ``('Aspects', 'count')`` column and
        one ``(<sentiment label>, 'sum')`` column per label, sorted by count
        in descending order. ``None`` when no text matches the filters or no
        aspect is extracted; nothing is written in that case.
    """
    import os  # local import keeps this cell independent of the import cells

    selected = (
        (df['user_category'] == category)
        & (df['lang'] == lang)
        & (df['updated_text'] != "__NA__")
    )
    texts = list(df.loc[selected, 'updated_text'])

    # Nothing to analyse: skip the inference call on an empty input.
    if len(texts) == 0:
        print("No texts found for " + category + " / " + lang)
        return None

    atepc_result = aspect_extractor.extract_aspect(
        inference_source=texts,
        pred_sentiment=True,
        print_result=False,
    )

    # Flatten the per-text results. Texts without aspects contribute empty
    # lists, so no length check is needed.
    aspects = []
    sentiments = []
    for output in atepc_result:
        aspects.extend(output['aspect'])
        sentiments.extend(output['sentiment'])

    if len(aspects) == 0:
        print("No aspects extracted")
        return None

    # A separate name is used so the `df` argument is not reassigned.
    pairs = pd.DataFrame({
        'Aspects': aspects,
        'Sentiment': sentiments,
    })

    # One indicator column per sentiment label present in the results.
    indicators = pd.get_dummies(data=pairs['Sentiment'])

    agg_dict = {'Aspects': ['count']}
    for label in indicators.columns.values:
        agg_dict[label] = ['sum']

    summary = (
        pd.concat([pairs, indicators], axis=1)
        .groupby('Aspects')
        .agg(agg_dict)
        .sort_values(('Aspects', 'count'), ascending=False)
    )

    summary.to_csv(os.path.join(output_dir, category + "_" + lang + ".csv"))

    return summary

In [None]:
# PostDoc tweets in both languages; the cell displays the table returned by
# the last call. The other categories (Professor, PhD Student, Others,
# Unknown, Lecturer, PD) are left out of this cell.
absa(df=df, category="PostDoc", lang="de", aspect_extractor=aspect_extractor)
absa(df=df, category="PostDoc", lang="en", aspect_extractor=aspect_extractor)

100%|██████████| 10578/10578 [00:26<00:00, 402.88it/s, preparing apc inference dataloader...]
100%|██████████| 42/42 [02:17<00:00,  3.26s/it, extracting aspect terms...]
100%|██████████| 7280/7280 [00:23<00:00, 306.28it/s, preparing apc inference dataloader...]
  probs = [float(x) for x in F.softmax(i_apc_logits).cpu().numpy().tolist()]
100%|██████████| 57/57 [01:37<00:00,  1.71s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json


100%|██████████| 2103/2103 [00:03<00:00, 576.48it/s, preparing apc inference dataloader...]
100%|██████████| 9/9 [00:27<00:00,  3.06s/it, extracting aspect terms...]
100%|██████████| 1322/1322 [00:03<00:00, 356.44it/s, preparing apc inference dataloader...]
100%|██████████| 11/11 [00:17<00:00,  1.63s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json


Unnamed: 0_level_0,Aspects,Negative,Neutral,Positive
Unnamed: 0_level_1,count,sum,sum,sum
Aspects,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
system,69,58,6,5
job,32,13,16,3
staff,30,18,9,3
thread,27,2,3,22
position,27,6,19,2
...,...,...,...,...
hanin,1,1,0,0
gregor mendel,1,0,1,0
grant,1,1,0,0
grafik,1,1,0,0


In [None]:
# Every remaining user category in German and English; the cell displays the
# table returned by the last call.
absa(df=df, category="Professor", lang="de", aspect_extractor=aspect_extractor)
absa(df=df, category="Professor", lang="en", aspect_extractor=aspect_extractor)
absa(df=df, category="PhD Student", lang="de", aspect_extractor=aspect_extractor)
absa(df=df, category="PhD Student", lang="en", aspect_extractor=aspect_extractor)
absa(df=df, category="Others", lang="de", aspect_extractor=aspect_extractor)
absa(df=df, category="Others", lang="en", aspect_extractor=aspect_extractor)
absa(df=df, category="Unknown", lang="de", aspect_extractor=aspect_extractor)
absa(df=df, category="Unknown", lang="en", aspect_extractor=aspect_extractor)
absa(df=df, category="Lecturer", lang="de", aspect_extractor=aspect_extractor)
absa(df=df, category="Lecturer", lang="en", aspect_extractor=aspect_extractor)
absa(df=df, category="PD", lang="de", aspect_extractor=aspect_extractor)
absa(df=df, category="PD", lang="en", aspect_extractor=aspect_extractor)

100%|██████████| 7580/7580 [00:14<00:00, 516.42it/s, preparing apc inference dataloader...]
100%|██████████| 30/30 [01:38<00:00,  3.28s/it, extracting aspect terms...]
100%|██████████| 5214/5214 [00:18<00:00, 281.50it/s, preparing apc inference dataloader...]
  probs = [float(x) for x in F.softmax(i_apc_logits).cpu().numpy().tolist()]
100%|██████████| 41/41 [01:10<00:00,  1.72s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json


100%|██████████| 1284/1284 [00:02<00:00, 574.07it/s, preparing apc inference dataloader...]
100%|██████████| 6/6 [00:16<00:00,  2.72s/it, extracting aspect terms...]
100%|██████████| 766/766 [00:02<00:00, 322.94it/s, preparing apc inference dataloader...]
100%|██████████| 6/6 [00:10<00:00,  1.80s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json


100%|██████████| 2471/2471 [00:04<00:00, 539.98it/s, preparing apc inference dataloader...]
100%|██████████| 10/10 [00:32<00:00,  3.25s/it, extracting aspect terms...]
100%|██████████| 1673/1673 [00:06<00:00, 243.75it/s, preparing apc inference dataloader...]
100%|██████████| 14/14 [00:22<00:00,  1.57s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json


100%|██████████| 480/480 [00:01<00:00, 437.43it/s, preparing apc inference dataloader...]
100%|██████████| 2/2 [00:06<00:00,  3.25s/it, extracting aspect terms...]
100%|██████████| 300/300 [00:00<00:00, 322.72it/s, preparing apc inference dataloader...]
100%|██████████| 3/3 [00:03<00:00,  1.29s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json


100%|██████████| 8560/8560 [00:16<00:00, 515.81it/s, preparing apc inference dataloader...]
100%|██████████| 34/34 [01:51<00:00,  3.29s/it, extracting aspect terms...]
100%|██████████| 5645/5645 [00:20<00:00, 275.66it/s, preparing apc inference dataloader...]
100%|██████████| 45/45 [01:16<00:00,  1.70s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json


100%|██████████| 1044/1044 [00:01<00:00, 603.25it/s, preparing apc inference dataloader...]
100%|██████████| 5/5 [00:13<00:00,  2.74s/it, extracting aspect terms...]
100%|██████████| 615/615 [00:01<00:00, 343.62it/s, preparing apc inference dataloader...]
100%|██████████| 5/5 [00:08<00:00,  1.63s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json


100%|██████████| 7026/7026 [00:14<00:00, 489.67it/s, preparing apc inference dataloader...]
100%|██████████| 28/28 [01:31<00:00,  3.28s/it, extracting aspect terms...]
100%|██████████| 4875/4875 [00:17<00:00, 283.40it/s, preparing apc inference dataloader...]
100%|██████████| 39/39 [01:06<00:00,  1.70s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json


100%|██████████| 939/939 [00:01<00:00, 581.88it/s, preparing apc inference dataloader...]
100%|██████████| 4/4 [00:12<00:00,  3.18s/it, extracting aspect terms...]
100%|██████████| 588/588 [00:01<00:00, 349.22it/s, preparing apc inference dataloader...]
100%|██████████| 5/5 [00:07<00:00,  1.59s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json


100%|██████████| 358/358 [00:00<00:00, 382.73it/s, preparing apc inference dataloader...]
100%|██████████| 2/2 [00:04<00:00,  2.22s/it, extracting aspect terms...]
100%|██████████| 250/250 [00:00<00:00, 409.93it/s, preparing apc inference dataloader...]
100%|██████████| 2/2 [00:03<00:00,  1.63s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json
The results of aspect term extraction have been saved in /content/atepc_inference.result.json


100%|██████████| 1690/1690 [00:04<00:00, 356.00it/s, preparing apc inference dataloader...]
100%|██████████| 7/7 [00:22<00:00,  3.16s/it, extracting aspect terms...]
100%|██████████| 1148/1148 [00:03<00:00, 366.82it/s, preparing apc inference dataloader...]
100%|██████████| 9/9 [00:15<00:00,  1.71s/it, classifying aspect sentiments...]


The results of aspect term extraction have been saved in /content/atepc_inference.result.json
The results of aspect term extraction have been saved in /content/atepc_inference.result.json


Unnamed: 0_level_0,Aspects,Negative,Neutral,Positive
Unnamed: 0_level_1,count,sum,sum,sum
Aspects,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
work,3,3,0,0
use,2,1,0,1
interview,2,0,2,0
system,2,2,0,0
teacher,2,1,0,1
conditions,2,1,0,1
job,2,0,1,1
privileges,2,2,0,0
application,2,1,1,0
article,2,0,2,0
