# Word-Level POS Extraction from Tweets

In [None]:
!pip install HanTa

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from HanTa import HanoverTagger as ht
import pandas as pd
import numpy as np
import re
from google.colab import files
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Source tweets and the HanoverTagger models for German and English.
df = pd.read_csv("Final_Tweet_Data.csv")
germantagger = ht.HanoverTagger('morphmodel_ger.pgz')
englishtagger = ht.HanoverTagger('morphmodel_en.pgz')

# Cleaning patterns. Each one is later applied with `sub(" ", text)`, in list order.
URL_PATTERN = re.compile(r'https?://[^ ]+')            # http/https links
MENTIONS_PATTERN = re.compile(r'@[^ ]+')               # @user mentions
HASHTAGS_PATTERN = re.compile(r'#[^ ]+')               # #hashtags (the whole tag is removed)
SPECIAL_CHARS_PATTERN = re.compile(r'[^A-Za-zÀ-ž ]')   # anything but letters and spaces
MULTIPLE_SPACES_PATTERN = re.compile(' +')             # runs of spaces

pattern_list = [
    URL_PATTERN,
    MENTIONS_PATTERN,
    HASHTAGS_PATTERN,
    SPECIAL_CHARS_PATTERN,
    MULTIPLE_SPACES_PATTERN,
]

englishStopWords = nltk.corpus.stopwords.words("english")

# German stop words are read from a local comma-separated file instead of
# nltk.corpus.stopwords.words("german"); every line may hold several entries.
with open('german_stopwords.txt', encoding='utf-8') as stopword_file:
    germanStopWords = [
        entry
        for raw_line in stopword_file
        for entry in raw_line.strip().split(',')
    ]

In [None]:
def text_process_pos(input_text_col, lang_col, idx_col, pattern_list):
    """Clean every tweet, drop stop words and POS-tag the remaining words.

    Parameters
    ----------
    input_text_col : sequence
        Raw tweet texts. Entries that are not strings (NaN, None, ...) are skipped.
    lang_col : sequence
        Language code of each tweet; only "en" and "de" are processed.
    idx_col : sequence
        Tweet identifier of each tweet, copied into every output row.
    pattern_list : iterable of compiled regular expressions
        Applied in order; every match is replaced by a single space.

    Returns
    -------
    pandas.DataFrame or None
        One row per tagged word with the columns ``idx``, ``lang``, ``lemma``
        (lower-cased), ``original_word`` and ``pos``; ``None`` when no word was
        tagged at all.

    Notes
    -----
    The three columns are walked in parallel by position. Stop words and taggers
    come from the notebook globals ``englishStopWords`` / ``germanStopWords`` and
    ``englishtagger`` / ``germantagger``.
    """
    # HTML entities are spelled out in the tweet's language *before* the regex
    # cleaning, because the special-character pattern would otherwise strip
    # "&" and ";" and leave the bare entity name behind.
    entity_words = {
        "en": (("&amp;", " and "), ("&gt;", " more than "), ("&lt;", " less than ")),
        "de": (("&amp;", " und "), ("&gt;", " mehr als "), ("&lt;", " weniger als ")),
    }

    # lang -> (stop-word set, tagger); filled on first use so the stop-word lists
    # are converted to sets once instead of being scanned linearly for every word.
    resources = {}

    tweet_pos_tbl = []

    for input_text, lang, idx in zip(input_text_col, lang_col, idx_col):

        if lang not in entity_words:
            continue

        # Missing texts come back from pandas as float NaN (or None); an identity
        # check against np.nan only catches one particular NaN object.
        if not isinstance(input_text, str):
            continue

        for entity, spelled_out in entity_words[lang]:
            input_text = input_text.replace(entity, spelled_out)

        for pattern in pattern_list:
            input_text = pattern.sub(" ", input_text)

        # Convert to lower case
        input_text = input_text.lower().strip()

        if not input_text:
            continue

        if lang not in resources:
            if lang == "en":
                resources[lang] = (frozenset(englishStopWords), englishtagger)
            else:
                resources[lang] = (frozenset(germanStopWords), germantagger)
        stop_words, tagger = resources[lang]

        # Keep only non-stop-words of at least three characters.
        tokens = [word for word in input_text.split() if word not in stop_words and len(word) >= 3]
        if not tokens:
            # Nothing left to tag for this tweet.
            continue

        tweet_pos_tbl += [[idx, lang, lemma.lower(), word, pos] for (word, lemma, pos) in tagger.tag_sent(tokens)]

    if len(tweet_pos_tbl) > 0:
        return pd.DataFrame(tweet_pos_tbl, columns=["idx", "lang", "lemma", "original_word", "pos"])
    else:
        return None

In [None]:
tweet_pos_df = text_process_pos(df["text"].to_list(), df["lang"].to_list(), df["idx"].to_list(), pattern_list)

In [None]:
tweet_pos_df.to_csv("Tweet_POS.csv", index=False)
files.download("Tweet_POS.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
tweet_pos_df

Unnamed: 0,idx,lang,lemma,original_word,pos
0,0,de,zeigen,zeigt,VV(FIN)
1,0,de,diskussion,diskussion,NN
2,0,de,beobachten,beobachtet,VV(FIN)
3,0,de,berufung,berufung,NN
4,0,de,art,art,NE
...,...,...,...,...,...
448187,176695,en,collection,collection,NN
448188,176695,en,info,info,CD
448189,176695,en,resource,resources,NNS
448190,176695,en,call,calls,VBZ


# Aspect Extraction

In [None]:
import pandas as pd
from google.colab import files
df_tweet_pos = pd.read_csv("Tweet_POS.csv", keep_default_na=False)
df = pd.read_csv("Final_Tweet_Data.csv")

In [None]:
df_tweet_pos[(df_tweet_pos["pos"] == "NP") | (df_tweet_pos["pos"] == "NNS") | (df_tweet_pos["pos"] == "NPS") | (df_tweet_pos["pos"] == "NN_VAR")]["lang"].value_counts()

en    12031
Name: lang, dtype: int64

In [None]:
# Per-word 0/1 flags: does the POS tag belong to the noun tags / the adjective tags?
df_tweet_pos["has_noun"] = df_tweet_pos["pos"].isin(["NN", "NE", "NA", "NP", "NNS", "NPS", "NN_VAR"]).astype(int)
df_tweet_pos["has_adj"] = df_tweet_pos["pos"].isin(["JJ", "ADJ(A)", "ADJ(D)"]).astype(int)

In [None]:
# Sum the per-word flags per tweet (idx) to get the number of nouns and adjectives in each tweet.
agg_dict = {'has_noun': ['sum'], 'has_adj': ["sum"]}
df_noun_adj_flag = df_tweet_pos.groupby(by="idx").agg(agg_dict)
# List-valued aggregations produce two-level column names; drop the function level ("sum").
df_noun_adj_flag.columns = df_noun_adj_flag.columns.droplevel(1)
df_noun_adj_flag.reset_index(inplace=True)
# Both frames carry has_noun / has_adj, so the merge suffixes them:
# *_x = per-word flag, *_y = per-tweet total. Only the totals are kept.
new_df_tweet_pos = df_tweet_pos.merge(df_noun_adj_flag, on='idx')
new_df_tweet_pos = new_df_tweet_pos[['idx', 'lang', 'lemma', 'pos', 'has_noun_y', 'has_adj_y']]
# Attach the author's category from the tweet file.
new_df_tweet_pos = new_df_tweet_pos.merge(df[["idx", "user_category"]], on="idx")
# Keep words only from tweets that contain at least one adjective and one noun ...
new_df_tweet_pos = new_df_tweet_pos[(new_df_tweet_pos["has_adj_y"] > 0) & (new_df_tweet_pos["has_noun_y"] > 0)]
# ... and of those words keep only the nouns themselves (the aspect candidates).
new_df_tweet_pos = new_df_tweet_pos[(new_df_tweet_pos["pos"] == "NN") |
                                    (new_df_tweet_pos["pos"] == "NE") |
                                    (new_df_tweet_pos["pos"] == "NA") |
                                    (new_df_tweet_pos["pos"] == "NP") |
                                    (new_df_tweet_pos["pos"] == "NNS") |
                                    (new_df_tweet_pos["pos"] == "NPS") |
                                    (new_df_tweet_pos["pos"] == "NN_VAR")]
# Count how often each lemma occurs per (user category, language).
agg_dict = {'lemma': ['count']}
new_df_tweet_pos = new_df_tweet_pos.groupby(by=["user_category", "lang", "lemma"]).agg(agg_dict)
# The count column is renamed to "Count" before flattening so that it does not clash
# with the "lemma" group key that reset_index() turns back into a column.
new_df_tweet_pos.columns = new_df_tweet_pos.rename(columns={"lemma": "Count"}).columns.droplevel(1)
new_df_tweet_pos = new_df_tweet_pos.reset_index()
# Presentation names used by the exported aspect files.
new_df_tweet_pos = new_df_tweet_pos.rename(columns={"user_category": "Category",
                                                    "lang": "Language",
                                                    "lemma": "Aspect"})
new_df_tweet_pos["Language"] = new_df_tweet_pos["Language"].map({'en': 'English', 'de': 'German'})

In [None]:
new_df_tweet_pos[
                 (new_df_tweet_pos["Category"] == "Professor") &
                 (new_df_tweet_pos["Language"] == "German")].sort_values("Count", ascending=False)

Unnamed: 0,Category,Language,Aspect,Count
30556,Professor,German,jahr,693
34084,Professor,German,wissenschaft,663
33497,Professor,German,uni,392
29594,Professor,German,forschung,373
32124,Professor,German,professur,308
...,...,...,...,...
30482,Professor,German,innengeneration,1
30480,Professor,German,innenbildung,1
30477,Professor,German,inkrafttreten,1
30475,Professor,German,inkompatibilität,1


In [None]:
new_df_tweet_pos

Unnamed: 0,Category,Language,Aspect,Count
0,Lecturer,German,abhängigkeit,1
1,Lecturer,German,abhängigkeitshierarchie,1
2,Lecturer,German,ablehnung,1
3,Lecturer,German,absagenwahnsinn,1
4,Lecturer,German,abschluss,2
...,...,...,...,...
44742,Unknown,English,youtube,1
44743,Unknown,English,yr,1
44744,Unknown,English,zeitvertrag,1
44745,Unknown,English,zero,2


In [None]:
new_df_tweet_pos.to_csv("AspectsWithoutCategoryAll.csv", index=False)
files.download("AspectsWithoutCategoryAll.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
new_df_tweet_pos[new_df_tweet_pos["Count"] > 50]

Unnamed: 0,Category,Language,Aspect,Count
1083,Others,German,antwort,70
1099,Others,German,arbeit,274
1114,Others,German,arbeitsbedingung,261
1175,Others,German,arbeitsverhältnis,67
1176,Others,German,arbeitsvertrag,57
...,...,...,...,...
44233,Unknown,English,people,54
44381,Unknown,English,research,62
44428,Unknown,English,scholar,55
44731,Unknown,English,work,62


In [None]:
new_df_tweet_pos[new_df_tweet_pos["Count"] > 50].to_csv("AspectsWithoutCategory.csv", index=False)
files.download("AspectsWithoutCategory.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# ABSA Approach 1

In [None]:
!pip install tweetnlp

In [None]:
import pandas as pd
import numpy as np
import tweetnlp
import re
from google.colab import files

In [None]:
model = tweetnlp.load_model('sentiment', multilingual=True)

Downloading config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading sentencepiece.bpe.model:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

In [None]:
# Cleaning patterns for phrase reconstruction; identical to the ones used for POS extraction
# earlier in this notebook. Each pattern is applied with `sub(" ", text)`, in list order.
# NOTE(review): `[^ ]+` stops only at a space character, so a URL / mention / hashtag that is
# followed directly by a newline or tab also swallows the text up to the next space —
# confirm this is intended before changing it (all copies of these patterns must stay in sync).
URL_PATTERN = re.compile(r'https?://[^ ]+')
MENTIONS_PATTERN = re.compile(r'@[^ ]+')
HASHTAGS_PATTERN = re.compile(r'#[^ ]+')
# Everything except ASCII letters, the À-ž letter range and spaces.
SPECIAL_CHARS_PATTERN = re.compile(r'[^A-Za-zÀ-ž ]')
# Runs of spaces (collapsed to one space by the substitution).
MULTIPLE_SPACES_PATTERN = re.compile(' +')

pattern_list = [URL_PATTERN,
                MENTIONS_PATTERN,
                HASHTAGS_PATTERN,
                SPECIAL_CHARS_PATTERN,
                MULTIPLE_SPACES_PATTERN]

def convert_to_phrases(df, aspect_ids):
    """Split every requested tweet into aspect phrases and collect them in one frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Word-level frame with an ``idx`` column plus the columns read by
        ``divide_into_phrases``.
    aspect_ids : iterable
        Tweet ids to process, in the order the phrases should appear in the result.
        Ids that do not occur in ``df["idx"]`` are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``idx``, ``Category``, ``Language``, ``Phrase``; one row per phrase.
    """
    # Group once instead of scanning the whole frame with a boolean mask per id
    # (which is quadratic in practice). Row order inside each group is unchanged.
    tweets_by_idx = df.groupby("idx", sort=False)
    known_ids = tweets_by_idx.groups

    final_phrase_list = []

    for idx in aspect_ids:
        if idx not in known_ids:
            # No words for this tweet: nothing to split.
            continue
        final_phrase_list += divide_into_phrases(tweets_by_idx.get_group(idx))

    return pd.DataFrame(final_phrase_list, columns=["idx", "Category", "Language", "Phrase"])


def divide_into_phrases(input_df):
    """Split the words of one tweet into phrases, one per aspect term.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Non-empty word-level rows of a single tweet, in word order, with the columns
        ``tweet_aspect_count``, ``user_category``, ``lang``, ``idx``,
        ``original_word``, ``is_aspect`` and ``is_adj``. The tweet-level values are
        read from the first row.

    Returns
    -------
    list of list
        Rows ``[idx, user_category, lang, phrase_text]``. A phrase is closed when
        it holds an aspect and an adjective, or when a second aspect starts. Once
        only one aspect is left, the rest of the tweet becomes the final phrase.
        If ``tweet_aspect_count`` is larger than the number of aspects actually
        found, the phrases collected so far are returned (never ``None``).
    """
    aspect_count = input_df["tweet_aspect_count"].iloc[0]
    user_category = input_df["user_category"].iloc[0]
    lang = input_df["lang"].iloc[0]
    idx = input_df["idx"].iloc[0]

    word_list = input_df["original_word"].to_list()
    is_aspect = input_df["is_aspect"].to_list()
    is_adj = input_df["is_adj"].to_list()

    def make_row(words):
        # One output row for the given words.
        return [idx, user_category, lang, ' '.join(words)]

    phrase_text = []
    phrase_list = []

    remaining_aspects = aspect_count
    temp_aspect_count = 0
    temp_adj_count = 0

    # Zero aspects: nothing to extract. A single aspect: the whole tweet is its phrase.
    if remaining_aspects < 1:
        return phrase_list
    if remaining_aspects == 1:
        phrase_list.append(make_row(word_list))
        return phrase_list

    for i, word in enumerate(word_list):

        if is_aspect[i] == 1:
            if temp_aspect_count >= 1:
                # A second aspect starts: close the running phrase and open a new
                # one that begins with this aspect.
                phrase_list.append(make_row(phrase_text))
                remaining_aspects -= 1
                temp_aspect_count = 1
                temp_adj_count = 0
                phrase_text = [word]
            else:
                phrase_text.append(word)
                temp_aspect_count += 1
        else:
            phrase_text.append(word)

        if is_adj[i] == 1:
            temp_adj_count += 1

        # The running phrase now holds both an aspect and an adjective: close it.
        if temp_adj_count >= 1 and temp_aspect_count >= 1:
            phrase_list.append(make_row(phrase_text))
            phrase_text = []
            remaining_aspects -= 1
            temp_aspect_count = 0
            temp_adj_count = 0

        if remaining_aspects == 1:
            # Only one aspect left: everything from the current word onwards is its phrase.
            # NOTE(review): when the phrase just closed ended on word i, that word is
            # repeated at the start of this final phrase — confirm the overlap is intended.
            phrase_list.append(make_row(word_list[i:]))
            return phrase_list
        elif remaining_aspects < 1:
            return phrase_list

    # The words ran out before the declared number of aspects was reached; return what
    # was collected instead of an implicit None, which would break `list += result`.
    return phrase_list

def text_process_phrases(input_text_col, idx_col, phrase_col, aspect_col, lang_col, pattern_list):
    """Map the stop-word-free aspect phrases back onto the full cleaned tweet text.

    For every tweet the raw text is cleaned (HTML entities spelled out, regex
    patterns applied, lower-cased) and split into words. The cleaned words are then
    consumed left to right: each phrase takes all words up to and including the
    first occurrence of the phrase's last word, or up to the end of the tweet.

    Parameters
    ----------
    input_text_col, idx_col, phrase_col, aspect_col, lang_col : sequence
        Parallel columns (lists or Series), walked by position. ``phrase_col`` and
        ``aspect_col`` hold ``"___"``-joined phrases / aspect lemmas per tweet.
        Tweets whose text, phrases or aspects are not strings are skipped.
    pattern_list : iterable of compiled regular expressions
        Applied in order; every match is replaced by a single space.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``idx``, ``Complete_Phrases``, ``Aspect``; ``None`` when no phrase
        could be produced.
    """
    entity_words = {
        "en": (("&amp;", " and "), ("&gt;", " more than "), ("&lt;", " less than ")),
        "de": (("&amp;", " und "), ("&gt;", " mehr als "), ("&lt;", " weniger als ")),
    }

    final_phrases = []

    for input_text, idx, phrases, aspects, lang in zip(input_text_col, idx_col, phrase_col, aspect_col, lang_col):

        if not (isinstance(input_text, str) and isinstance(phrases, str) and isinstance(aspects, str)):
            continue

        # Entities must be replaced BEFORE the regex cleaning: the special-character
        # pattern strips "&" and ";", after which "&amp;" can no longer be found and
        # the bare word "amp" would stay in the text.
        for entity, spelled_out in entity_words.get(lang, ()):
            input_text = input_text.replace(entity, spelled_out)

        for pattern in pattern_list:
            input_text = pattern.sub(" ", input_text)

        # Convert to lower case and tokenise
        tokens = input_text.lower().strip().split()

        if not tokens:
            continue

        phrase_list = phrases.split("___")
        aspect_list = aspects.split("___")

        # Position of the next unconsumed word of the cleaned tweet.
        j = 0

        for phrase_no, phrase in enumerate(phrase_list):
            phrase_words = phrase.split()
            if not phrase_words:
                # An empty phrase has no last word to look for.
                continue
            last_word = phrase_words[-1]

            new_phrase = []

            while j < len(tokens):
                new_phrase.append(tokens[j])
                j += 1

                # Close the phrase at its last word, or when the tweet is exhausted.
                if tokens[j - 1] == last_word or j == len(tokens):
                    final_phrases.append([idx, ' '.join(new_phrase), aspect_list[phrase_no]])
                    break

    if len(final_phrases) > 0:
        return pd.DataFrame(final_phrases, columns=["idx", "Complete_Phrases", "Aspect"])
    else:
        return None

def get_sentiment(input_text, model):
    """Classify ``input_text`` and return its label together with the class probabilities.

    ``model`` must offer ``sentiment(text, return_probability=True)`` returning a
    mapping with a ``'label'`` string and a ``'probability'`` mapping keyed by
    ``'negative'``, ``'neutral'`` and ``'positive'``.

    Returns ``[Label, p_negative, p_neutral, p_positive]`` with the label capitalised.
    """
    prediction = model.sentiment(input_text, return_probability=True)

    label = prediction['label'].capitalize()
    class_probabilities = prediction['probability']

    return [label] + [class_probabilities[name] for name in ('negative', 'neutral', 'positive')]

In [None]:
df_tweet_pos = pd.read_csv("Tweet_POS.csv", keep_default_na=False)
df_aspects = pd.read_csv("AspectsWithoutCategory.csv")
df = pd.read_csv("Final_Tweet_Data.csv")

In [None]:
# Keep only those tweets that have aspects
# Per-word 0/1 flags: is the POS tag one of the noun tags / adjective tags?
df_tweet_pos["has_noun"] = df_tweet_pos["pos"].isin(["NN", "NE", "NA", "NP", "NNS", "NPS", "NN_VAR"]).astype(int)
df_tweet_pos["has_adj"] = df_tweet_pos["pos"].isin(["JJ", "ADJ(A)", "ADJ(D)"]).astype(int)

# Group together to get count of nouns and adjectives in each tweet
agg_dict = {'has_noun': ['sum'], 'has_adj': ["sum"]}
df_noun_adj_flag = df_tweet_pos.groupby(by="idx").agg(agg_dict)
# List-valued aggregations give two-level column names; drop the function level.
df_noun_adj_flag.columns = df_noun_adj_flag.columns.droplevel(1)
df_noun_adj_flag.reset_index(inplace=True)

# Add counts to the tweet_pos dataframe
# (the clashing names are suffixed: *_x = per-word flag, *_y = per-tweet total)
new_df_tweet_pos = df_tweet_pos.merge(df_noun_adj_flag, on='idx')
new_df_tweet_pos["lang"] = new_df_tweet_pos["lang"].map({'en': 'English', 'de': 'German'})
new_df_tweet_pos = new_df_tweet_pos[['idx', 'lang', 'lemma', 'original_word', 'pos', 'has_noun_y', 'has_adj_y']]

# Include user categories from tweets file
new_df_tweet_pos = new_df_tweet_pos.merge(df[["idx", "user_category"]], on="idx")

# Keep only those tweets that have atleast one noun and adj
new_df_tweet_pos = new_df_tweet_pos[(new_df_tweet_pos["has_adj_y"] > 0) & (new_df_tweet_pos["has_noun_y"] > 0)]

# Keep only those tweets that have aspects
aspect_ids = new_df_tweet_pos.merge(df_aspects, left_on=["lang", "lemma", "user_category"], right_on=["Language", "Aspect", "Category"])['idx'].drop_duplicates()
aspect_ids = pd.DataFrame(aspect_ids, columns=["idx"])
df_tweets_with_aspects = aspect_ids.merge(new_df_tweet_pos)

# Mark Aspect terms
# Left merge: words without a matching aspect row get missing values in the df_aspects columns.
df_tweets_with_aspects = df_tweets_with_aspects.merge(df_aspects, left_on=["user_category", "lang", "lemma"], right_on=["Category", "Language", "Aspect"], how="left")
# notna() recognises every kind of missing value; an identity test against np.nan only
# matches that one object and would flag any other missing marker as an aspect.
df_tweets_with_aspects["is_aspect"] = df_tweets_with_aspects["Category"].notna().astype(int)
# .copy() so that the column added below is written to an independent frame, not to a slice.
df_tweets_with_aspects = df_tweets_with_aspects[['idx', 'lang', 'lemma', 'original_word', 'pos', 'user_category', 'is_aspect']].copy()

# Mark Adjectives
df_tweets_with_aspects["is_adj"] = df_tweets_with_aspects["pos"].isin(["JJ", "ADJ(A)", "ADJ(D)"]).astype(int)

# Mark number of aspects in each tweet
agg_dict = {'is_aspect': ['sum']}
df_aspect_tweet_count = df_tweets_with_aspects.groupby(by="idx").agg(agg_dict)
df_aspect_tweet_count.columns = df_aspect_tweet_count.columns.droplevel(1)
df_aspect_tweet_count.reset_index(inplace=True)
df_aspect_tweet_count.rename(columns={"is_aspect": "tweet_aspect_count"}, inplace=True)
df_tweets_with_aspects = df_tweets_with_aspects.merge(df_aspect_tweet_count, on="idx")

In [None]:
print(df_tweets_with_aspects.loc[1377]["lemma"])

herzlich


In [None]:
df_tweets_with_aspects.to_csv("Aspect_Tweet_POS.csv", index=False)
files.download("Aspect_Tweet_POS.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
df_tweets_with_aspects

Unnamed: 0,idx,lang,lemma,original_word,pos,user_category,is_aspect,is_adj,tweet_aspect_count
0,0,German,zeigen,zeigt,VV(FIN),PostDoc,0,0,3
1,0,German,diskussion,diskussion,NN,PostDoc,1,0,3
2,0,German,beobachten,beobachtet,VV(FIN),PostDoc,0,0,3
3,0,German,berufung,berufung,NN,PostDoc,0,0,3
4,0,German,art,art,NE,PostDoc,0,0,3
...,...,...,...,...,...,...,...,...,...
321119,16933,English,non,non,JJ,Unknown,0,1,1
321120,16933,English,passport,passport,NN,Unknown,1,0,1
321121,16933,English,holder,holder,NN,Unknown,0,0,1
321122,16933,English,grateful,grateful,JJ,Unknown,0,1,1


In [None]:
aspect_ids = df_tweets_with_aspects["idx"].unique()

In [None]:
# Divide the tweets into phrases
df_phrases = convert_to_phrases(df_tweets_with_aspects, aspect_ids)
df_phrases["Aspect"] = df_tweets_with_aspects[df_tweets_with_aspects["is_aspect"] == 1]["lemma"].to_list()

In [None]:
df_phrases[df_phrases["idx"] == 17447]

Unnamed: 0,idx,Category,Language,Phrase,Aspect
2466,17447,PostDoc,German,kenne zuhauf ärztlichen kollegen,kollege
2467,17447,PostDoc,German,urlaubstage opfern anträge papers absurd,antrag
2468,17447,PostDoc,German,baut wiss ärzte arbzg arbschg verstoßen urlaub...,urlaub
2469,17447,PostDoc,German,forschung,forschung


In [None]:
df_phrases

Unnamed: 0,idx,Category,Language,Phrase,Aspect
0,0,PostDoc,German,zeigt diskussion beobachtet berufung art selek...,diskussion
1,0,PostDoc,German,amnesie äußere bedingungen,bedingung
2,0,PostDoc,German,bedingungen zufall einsetzt leistung zugeschri...,leistung
3,18167,PostDoc,German,grandiose leistung,leistung
4,18167,PostDoc,German,leistung monatelange interne diskussionen verb...,diskussion
...,...,...,...,...,...
59568,120166,Unknown,English,recommendations really wants equitable events ...,passport
59569,120303,Unknown,English,tweeting thread case conversation means withou...,passport
59570,120368,Unknown,English,jeez ridiculous say since underlying assumptio...,passport
59571,120423,Unknown,English,employers well intended may often fail grasp i...,passport


In [None]:
df_phrases.to_csv("Extracted_Phrases_Aspects.csv", index=False)
files.download("Extracted_Phrases_Aspects.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
df_phrases_updated = df_phrases[['idx','Phrase', 'Aspect']].groupby(['idx'])[['Phrase', 'Aspect']].transform(lambda x: '___'.join(x))

In [None]:
df_phrases_updated = pd.DataFrame({"idx": df_phrases["idx"], "Phrases": df_phrases_updated["Phrase"], "Aspect": df_phrases_updated["Aspect"]})
df_phrases_updated = df_phrases_updated.drop_duplicates()
df_phrases_updated

Unnamed: 0,idx,Phrases,Aspect
0,0,zeigt diskussion beobachtet berufung art selek...,diskussion___bedingung___leistung
3,18167,grandiose leistung___leistung monatelange inte...,leistung___diskussion
5,18189,gründe___diskussion___unis angst wisszeitvg___...,grund___diskussion___uni___prof___dauerstelle_...
11,18603,richtig gehts diskussion___pure entfristung___...,diskussion___entfristung___professur___professur
15,20333,parallel diskussion verfolgt wissen strukturel...,diskussion
...,...,...,...
59568,120166,recommendations really wants equitable events ...,passport
59569,120303,tweeting thread case conversation means withou...,passport
59570,120368,jeez ridiculous say since underlying assumptio...,passport
59571,120423,employers well intended may often fail grasp i...,passport


In [None]:
df = df.merge(df_phrases_updated, on="idx")

In [None]:
df

Unnamed: 0,idx,id,author_id,text,updated_text,created_at,lang,public_metrics_retweet_count,public_metrics_reply_count,public_metrics_like_count,...,extracted_mentions,extracted_hashtags,location_city,location_state,location_country,location,user_category,subcategories,Phrases,Aspect
0,0,1587227874699902979,1133449609983025154,👆Das zeigt sich u.a. in der #IchbinHanna #Wiss...,das zeigt sich u a in der diskussion oft beoba...,2022-10-31 23:39:49+00:00,de,0,0,10,...,__NA__,"['IchbinHanna', 'WissZeitVG']",Unknown,Mecklenburg-Vorpommern,Germany,Greifswald,PostDoc,,zeigt diskussion beobachtet berufung art selek...,diskussion___bedingung___leistung
1,41,1587225260902031360,1133449609983025154,„Die Meritokratie erzeugt tatsächlich leistung...,die meritokratie erzeugt tatsächlich leistungs...,2022-10-31 23:29:26+00:00,de,1,1,7,...,__NA__,__NA__,Unknown,Mecklenburg-Vorpommern,Germany,Greifswald,PostDoc,,meritokratie erzeugt leistungs bereite fähige ...,leistung
2,5645,1482409620194930689,1133449609983025154,@AmreiBahr @Uni_Stuttgart Good News. 👍 \nAber ...,good news aber mal generell wie sinnbefreit si...,2022-01-15 17:49:29+00:00,de,5,0,23,...,"['1260870565541396480', '54570476']","['IchBinHanna', 'WisssystemFehler']",Unknown,Mecklenburg-Vorpommern,Germany,Greifswald,PostDoc,,good news generell sinnbefreit junprof drm nac...,uni
3,7639,1447844234342588416,1133449609983025154,#WissSystemFehler #IchBinHanna Das ist genau d...,das ist genau das was seit den er j schief geg...,2021-10-12 08:38:59+00:00,de,4,1,9,...,__NA__,"['WissSystemFehler', 'IchBinHanna', 'Innovatio...",Unknown,Mecklenburg-Vorpommern,Germany,Greifswald,PostDoc,,schief gegangen forschung daueraufgaben drm pr...,forschung
4,16941,1403417443297599490,1133449609983025154,12/ Anderes Problem im 🇩🇪Unisystem: immer mehr...,anderes problem im unisystem immer mehr stelle...,2021-06-11 18:22:47+00:00,de,3,1,5,...,__NA__,"['WissZeitVG', 'Länder', 'Bund', 'Daueraufgaben']",Unknown,Mecklenburg-Vorpommern,Germany,Greifswald,PostDoc,,problem unisystem stellen drm finanziert dauer...,problem___geld___system
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26412,176609,1402953996520861696,3410264410,"Ich bin Tyson, Wirtschaftswissenschafter, seit...",ich bin tyson wirtschaftswissenschafter seit j...,2021-06-10 11:41:13+00:00,de,0,1,2,...,__NA__,['IchbinHanna'],Unknown,Unknown,Unknown,Metaebene,Unknown,,tyson wirtschaftswissenschafter jahren vollzei...,jahr___job___forschung
26413,176616,1402951793286463488,1306284693214629888,Ich habe nach der Lehre zur Einzelhandelskauff...,ich habe nach der lehre zur einzelhandelskauff...,2021-06-10 11:32:27+00:00,de,0,1,2,...,__NA__,['IchbinHanna'],Unknown,Unknown,Unknown,🇩🇪 #Ampel Dummland,Unknown,,lehre einzelhandelskauffrau wirtschaftsfachwir...,befristung___erfahrung
26414,176630,1402911285914841094,256582223,@anna_neumaier @DrKEichhorn @BMBF_Bund Also ic...,also ich bin jetzt total überzeugt vom toll da...,2021-06-10 08:51:30+00:00,de,0,0,5,...,"['1883790510', '730098730943324162', '29697277...",['WissZeitVG'],Bochum,North Rhine-Westphalia,Germany,"Bochum, Germany",PostDoc,Lecturer,überzeugt politik___politik genialen mechanism...,politik___vertrag
26415,176675,1559110620972285953,151775715,"Hätte nicht gedacht, dass ich das nochmal sage...",hätte nicht gedacht dass ich das nochmal sagen...,2022-08-15 09:31:53+00:00,de,0,2,13,...,__NA__,"['IchBinNichtMehrHanna', 'IchBinHanna']",Hamburg,Unknown,Germany,Hamburg,PostDoc,Scientific Assistant,gedacht nochmal system nix ändert hoffentlich ...,system


In [None]:
print(df[df["idx"] == 0]["text"].iloc[0])
print(df[df["idx"] == 0]["Phrases"].iloc[0])

👆Das zeigt sich u.a. in der #IchbinHanna #WissZeitVG Diskussion: oft beobachtet man, daß nach der Berufung eine Art selektive Amnesie um äußere Bedingungen, wieviel 🍀u Zufall auch dabei war, einsetzt u alles nur der eigenen Leistung zugeschrieben wird. 9/
zeigt diskussion beobachtet berufung art selektive___amnesie äußere bedingungen___bedingungen zufall einsetzt leistung zugeschrieben


In [None]:
df_complete_phrases = text_process_phrases(df["text"], df["idx"], df["Phrases"], df["Aspect"], df["lang"], pattern_list)

In [None]:
df_complete_phrases

Unnamed: 0,idx,Complete_Phrases,Aspect
0,0,das zeigt sich u a in der diskussion oft beoba...,diskussion
1,0,amnesie um äußere bedingungen,bedingung
2,0,wieviel u zufall auch dabei war einsetzt u all...,leistung
3,41,die meritokratie erzeugt tatsächlich leistungs...,leistung
4,5645,good news aber mal generell wie sinnbefreit si...,uni
...,...,...,...
59564,176630,es mit so genialen mechanismen erlaubt so viel...,vertrag
59565,176675,hätte nicht gedacht dass ich das nochmal sagen...,system
59566,176676,frisch zurück aus dem urlaub gibt s some,urlaub
59567,176676,personal news,personal


In [None]:
for phrase in df_complete_phrases[df_complete_phrases["idx"] == 0]["Complete_Phrases"]:
    print(phrase)

das zeigt sich u a in der diskussion oft beobachtet man daß nach der berufung eine art selektive
amnesie um äußere bedingungen
wieviel u zufall auch dabei war einsetzt u alles nur der eigenen leistung zugeschrieben


In [None]:
# Assign sentiments to aspects
get_sentiment("wieviel u zufall auch dabei war einsetzt u alles nur der eigenen leistung zugeschrieben", model)

['Neutral', 0.29894495010375977, 0.6387374401092529, 0.062317587435245514]

In [None]:
df_complete_phrases[["Sentiment", "Negative_Probability", "Neutral_Probability", "Positive_Probability"]] = df_complete_phrases.apply(lambda x: get_sentiment(x["Complete_Phrases"], model), axis=1, result_type='expand')

In [None]:
df_complete_phrases.to_csv("Phrase_Sentiment_Probabilities.csv", index=False)
files.download("Phrase_Sentiment_Probabilities.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
df_complete_phrases_final = df_complete_phrases.copy()

In [None]:
# One 0/1 indicator column per sentiment label, plus a constant 1 that is summed
# later to count the phrases of each aspect.
df_complete_phrases_final["Negative"] = (df_complete_phrases_final["Sentiment"] == "Negative").astype(int)
df_complete_phrases_final["Neutral"] = (df_complete_phrases_final["Sentiment"] == "Neutral").astype(int)
df_complete_phrases_final["Positive"] = (df_complete_phrases_final["Sentiment"] == "Positive").astype(int)
df_complete_phrases_final["is_aspect"] = 1

In [None]:
df_complete_phrases_final = df_complete_phrases_final.merge(df[["idx", "lang", "user_category"]], on="idx")
df_complete_phrases_final["lang"] = df_complete_phrases_final["lang"].map({'en': 'English', 'de': 'German'})
df_complete_phrases_final

Unnamed: 0,idx,Complete_Phrases,Aspect,Sentiment,Negative_Probability,Neutral_Probability,Positive_Probability,Negative,Neutral,Positive,is_aspect,lang,user_category
0,0,das zeigt sich u a in der diskussion oft beoba...,diskussion,Neutral,0.259346,0.687608,0.053046,0,1,0,1,German,PostDoc
1,0,amnesie um äußere bedingungen,bedingung,Negative,0.760853,0.208611,0.030536,1,0,0,1,German,PostDoc
2,0,wieviel u zufall auch dabei war einsetzt u all...,leistung,Neutral,0.298945,0.638737,0.062318,0,1,0,1,German,PostDoc
3,41,die meritokratie erzeugt tatsächlich leistungs...,leistung,Neutral,0.275726,0.500127,0.224147,0,1,0,1,German,PostDoc
4,5645,good news aber mal generell wie sinnbefreit si...,uni,Negative,0.786022,0.161021,0.052957,1,0,0,1,German,PostDoc
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59564,176630,es mit so genialen mechanismen erlaubt so viel...,vertrag,Positive,0.280227,0.270228,0.449545,0,0,1,1,German,PostDoc
59565,176675,hätte nicht gedacht dass ich das nochmal sagen...,system,Positive,0.321776,0.290343,0.387881,0,0,1,1,German,PostDoc
59566,176676,frisch zurück aus dem urlaub gibt s some,urlaub,Neutral,0.212963,0.445281,0.341757,0,1,0,1,German,PostDoc
59567,176676,personal news,personal,Neutral,0.310926,0.548239,0.140834,0,1,0,1,German,PostDoc


In [None]:
# Per (language, user category, aspect): number of phrases, number of phrases per sentiment
# label, and the mean of each class probability.
agg_dict = {'is_aspect': ['sum'], 'Negative': ["sum"], 'Neutral': ["sum"], 'Positive': ["sum"], 'Negative_Probability': ["mean"], 'Neutral_Probability': ["mean"], 'Positive_Probability': ["mean"]}
df_complete_phrases_final = df_complete_phrases_final.groupby(by=["lang", "user_category", "Aspect"], as_index=False).agg(agg_dict)
# List-valued aggregations give two-level column names; keep only the first level.
df_complete_phrases_final.columns = df_complete_phrases_final.columns.droplevel(1)
# Presentation names; the summed constant-1 column becomes the phrase count.
df_complete_phrases_final.rename(columns={"user_category": "Category", "lang": "Language", "is_aspect": "Count"}, inplace=True)
df_complete_phrases_final

Unnamed: 0,Language,Category,Aspect,Count,Negative,Neutral,Positive,Negative_Probability,Neutral_Probability,Positive_Probability
0,English,Others,academia,89,36,43,10,0.431364,0.370752,0.197885
1,English,Others,career,60,11,40,9,0.302216,0.479229,0.218555
2,English,Others,contract,109,39,60,10,0.386766,0.459949,0.153285
3,English,Others,germany,88,27,42,19,0.356745,0.407269,0.235986
4,English,Others,job,69,24,34,11,0.393432,0.398972,0.207596
...,...,...,...,...,...,...,...,...,...,...
476,German,Unknown,wissenschaftler,201,96,92,13,0.464702,0.415603,0.119695
477,German,Unknown,wissenschaftssystem,53,28,21,4,0.517908,0.370315,0.111777
478,German,Unknown,woche,95,26,56,13,0.363728,0.442244,0.194029
479,German,Unknown,zeit,190,79,96,15,0.420591,0.413967,0.165442


In [None]:
df_complete_phrases_final.to_csv("Approch1_FinalOutputWithoutCategory.csv", index=False)
files.download("Approch1_FinalOutputWithoutCategory.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# ABSA Approach 2 – Using the Sentiment of the Complete Tweet

In [None]:
!pip install tweetnlp

In [None]:
import pandas as pd
import numpy as np
import re
import tweetnlp
from google.colab import files

In [None]:
df_tweet_pos = pd.read_csv("Tweet_POS.csv", keep_default_na=False)
df_aspects = pd.read_csv("AspectsWithoutCategory.csv")
df = pd.read_csv("Final_Tweet_Data.csv")

In [None]:
model = tweetnlp.load_model('sentiment', multilingual=True)

In [None]:
# Cleaning patterns for whole-tweet sentiment prediction. Each pattern is applied with
# `sub(" ", text)`, in list order.
# NOTE(review): `[^ ]+` stops only at a space character, so a URL / mention / hashtag that is
# followed directly by a newline or tab also swallows the text up to the next space —
# confirm this is intended.
URL_PATTERN = re.compile(r'https?://[^ ]+')
MENTIONS_PATTERN = re.compile(r'@[^ ]+')
HASHTAGS_PATTERN = re.compile(r'#[^ ]+')
# Unlike the POS-extraction variant, this pattern keeps the punctuation marks . , ? !
SENTI_SPECIAL_CHARS_PATTERN = re.compile(r'[^A-Za-zÀ-ž.,?! ]')
# Runs of spaces (collapsed to one space by the substitution).
MULTIPLE_SPACES_PATTERN = re.compile(' +')

senti_pattern_list = [URL_PATTERN,
                      MENTIONS_PATTERN,
                      HASHTAGS_PATTERN,
                      SENTI_SPECIAL_CHARS_PATTERN,
                      MULTIPLE_SPACES_PATTERN]

def predict_sentiment(row, pattern_list, sentiment_model=None):
    """Return the capitalised sentiment label for one tweet row, or "__NA__".

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'text' (the tweet) and 'lang' (its language code).
    pattern_list : iterable of compiled regex patterns
        Applied in order; every match is replaced by a single space.
    sentiment_model : object, optional
        Anything exposing ``sentiment(text)`` that returns a dict with a
        'label' entry. When omitted, the notebook-level ``model`` is used.

    Returns
    -------
    str
        The model's label with its first letter upper-cased, or "__NA__" when
        the text is missing / not a string, the language is neither "de" nor
        "en", or nothing is left after cleaning.
    """
    error_op = "__NA__"

    # HTML entities are spelled out in the tweet's own language before the
    # special-character pattern gets a chance to strip them.
    entity_words = {
        "en": (("&amp;", " and "), ("&gt;", " more than "), ("&lt;", " less than ")),
        "de": (("&amp;", " und "), ("&gt;", " mehr als "), ("&lt;", " weniger als ")),
    }

    input_text = row['text']
    lang = row['lang']

    # A missing tweet can show up as any float NaN, None or pd.NA. An identity
    # test against np.nan only matches the singleton and lets the others crash
    # on .replace(), so check the type instead.
    if not isinstance(input_text, str):
        return error_op

    if lang not in ("de", "en"):
        return error_op

    for entity, replacement in entity_words[lang]:
        input_text = input_text.replace(entity, replacement)

    for pattern in pattern_list:
        input_text = pattern.sub(" ", input_text)

    # Convert to lower case
    input_text = input_text.lower().strip()

    if len(input_text) == 0:
        return error_op

    # Fall back to the global classifier only when none was passed in, so the
    # function can also be exercised without loading the real model.
    classifier = model if sentiment_model is None else sentiment_model
    return classifier.sentiment(input_text)['label'].capitalize()

In [None]:
# Runs the sentiment model once per tweet row.
# Skip this cell if FullTweetSentiments.csv was already calculated and saved.
df["Sentiment"] = df.apply(predict_sentiment, axis=1, args=(senti_pattern_list,))

In [None]:
# Persist the tweets together with their predicted sentiment and offer the file as a Colab download.
sentiment_csv_name = "FullTweetSentiments.csv"
df.to_csv(sentiment_csv_name, index=False)
files.download(sentiment_csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load the pre-calculated sentiment file (same file name as written by the save cell above).
# Only its "idx" and "Sentiment" columns are used when joining onto the aspect rows below.
df_with_sentiment = pd.read_csv("FullTweetSentiments.csv")

In [None]:
# df_with_sentiment = df.copy()

In [None]:
# Flag every token row whose POS tag is in the noun list / in the adjective list (1 = yes, 0 = no)
df_tweet_pos["has_noun"] = df_tweet_pos["pos"].apply(lambda x: 1 if x in ["NN", "NE", "NA", "NP", "NNS", "NPS", "NN_VAR"] else 0)
df_tweet_pos["has_adj"] = df_tweet_pos["pos"].apply(lambda x: 1 if x in ["JJ", "ADJ(A)", "ADJ(D)"] else 0)

# Group together to get count of nouns and adjectives in each tweet
agg_dict = {'has_noun': ['sum'], 'has_adj': ["sum"]}
df_noun_adj_flag = df_tweet_pos.groupby(by="idx").agg(agg_dict)
# agg() with list values yields two-level column names; drop the "sum" level again
df_noun_adj_flag.columns = df_noun_adj_flag.columns.droplevel(1)
df_noun_adj_flag.reset_index(inplace=True)

# Add counts to the tweet_pos dataframe.
# Both frames carry has_noun / has_adj, so the merge suffixes them:
# *_x is the per-token flag, *_y is the per-tweet total (only *_y is kept).
new_df_tweet_pos = df_tweet_pos.merge(df_noun_adj_flag, on='idx')
new_df_tweet_pos["lang"] = new_df_tweet_pos["lang"].map({'en': 'English', 'de': 'German'})
new_df_tweet_pos = new_df_tweet_pos[['idx', 'lang', 'lemma', 'pos', 'has_noun_y', 'has_adj_y']]

# Include user categories from tweets file (default inner join: rows whose idx is absent from df are dropped)
new_df_tweet_pos = new_df_tweet_pos.merge(df[["idx", "user_category"]], on="idx")

# Keep only those tweets that have at least one noun and one adjective
new_df_tweet_pos = new_df_tweet_pos[(new_df_tweet_pos["has_adj_y"] > 0) & (new_df_tweet_pos["has_noun_y"] > 0)]

# Keep only those tweets that have aspects:
# collect the idx of every tweet with at least one lemma matching an aspect for its language and category ...
aspect_ids = new_df_tweet_pos.merge(df_aspects, left_on=["lang", "lemma", "user_category"], right_on=["Language", "Aspect", "Category"])['idx'].drop_duplicates()
aspect_ids = pd.DataFrame(aspect_ids, columns=["idx"])
# ... then pull back all token rows of those tweets (no key given, so the merge uses the shared "idx" column)
df_tweets_with_aspects = aspect_ids.merge(new_df_tweet_pos)

# Mark Aspect terms: left-join the aspect vocabulary so that tokens which are not
# an aspect for their language and category get missing values in the joined columns.
df_tweets_with_aspects = df_tweets_with_aspects.merge(df_aspects, left_on=["user_category", "lang", "lemma"], right_on=["Category", "Language", "Aspect"], how="left")
# notna() recognises every missing-value marker (NaN, None, pd.NA), whereas an
# identity test against np.nan would only match the np.nan singleton.
df_tweets_with_aspects["is_aspect"] = df_tweets_with_aspects["Category"].notna().astype("int64")
df_tweets_with_aspects = df_tweets_with_aspects[['idx', 'lang', 'lemma', 'pos', 'user_category', 'is_aspect']]

# Keep only aspect terms
df_tweets_with_aspects = df_tweets_with_aspects[df_tweets_with_aspects["is_aspect"] == 1]
# Attach the whole-tweet sentiment; "idx" is the only shared column, so name it explicitly
# instead of relying on the implicit common-column join.
df_tweets_with_aspects = df_tweets_with_aspects.merge(df_with_sentiment[["idx", "Sentiment"]], on="idx")

In [None]:
# One 0/1 indicator column per sentiment label, so the labels can be summed per aspect afterwards.
for sentiment_label in ("Negative", "Neutral", "Positive"):
    df_tweets_with_aspects[sentiment_label] = (
        df_tweets_with_aspects["Sentiment"].eq(sentiment_label).astype("int64")
    )

In [None]:
# Preview: one row per aspect token, carrying the sentiment of the whole tweet and its 0/1 indicators.
df_tweets_with_aspects

Unnamed: 0,idx,lang,lemma,pos,user_category,is_aspect,Sentiment,Negative,Neutral,Positive
0,0,German,diskussion,NN,PostDoc,1,Neutral,0,1,0
1,0,German,bedingung,NN,PostDoc,1,Neutral,0,1,0
2,0,German,leistung,NN,PostDoc,1,Neutral,0,1,0
3,18167,German,leistung,NN,PostDoc,1,Neutral,0,1,0
4,18167,German,diskussion,NN,PostDoc,1,Neutral,0,1,0
...,...,...,...,...,...,...,...,...,...,...
59568,120166,English,passport,NNS,Unknown,1,Negative,1,0,0
59569,120303,English,passport,NNS,Unknown,1,Neutral,0,1,0
59570,120368,English,passport,NN,Unknown,1,Negative,1,0,0
59571,120423,English,passport,NNS,Unknown,1,Negative,1,0,0


In [None]:
# Sum the occurrence flag and the three sentiment indicators per (language, category, aspect lemma).
agg_dict = {column: ["sum"] for column in ("is_aspect", "Negative", "Neutral", "Positive")}
df_tweets_with_aspects = (
    df_tweets_with_aspects
    .groupby(by=["lang", "user_category", "lemma"], as_index=False)
    .agg(agg_dict)
    # list-valued aggregations give two-level column names; keep only the first level
    .droplevel(1, axis=1)
    .rename(columns={"user_category": "Category", "lang": "Language", "lemma": "Aspect", "is_aspect": "Count"})
)

In [None]:
# Preview: occurrence and sentiment counts per (Language, Category, Aspect).
df_tweets_with_aspects

Unnamed: 0,Language,Category,Aspect,Count,Negative,Neutral,Positive
0,English,Others,academia,89,42,29,18
1,English,Others,career,60,20,27,13
2,English,Others,contract,109,55,42,12
3,English,Others,germany,88,32,28,28
4,English,Others,job,69,32,18,19
...,...,...,...,...,...,...,...
476,German,Unknown,wissenschaftler,201,142,44,15
477,German,Unknown,wissenschaftssystem,53,36,13,4
478,German,Unknown,woche,95,47,34,14
479,German,Unknown,zeit,190,120,49,21


In [None]:
# Write the Approach 2 result table to disk, then offer the same file as a Colab download.
approach2_csv_name = "Approch2_FinalOutputWithoutCategory.csv"
df_tweets_with_aspects.to_csv(approach2_csv_name, index=False)
files.download(approach2_csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Co-occurrences

In [None]:
import pandas as pd
from google.colab import files

In [None]:
# Token-level table with is_aspect / is_adj flags and a per-tweet aspect count.
# keep_default_na=False keeps literal strings such as "NA" from being read as missing values.
df_tweet_aspects = pd.read_csv("Aspect_Tweet_POS.csv", keep_default_na=False)

In [None]:
# Split the token table into the rows flagged as aspects and the rows flagged as adjectives.
df_tweet_aspects_aspects = df_tweet_aspects.loc[df_tweet_aspects["is_aspect"].eq(1)]
df_tweet_aspects_adj = df_tweet_aspects.loc[df_tweet_aspects["is_adj"].eq(1)]
df_tweet_aspects_aspects

Unnamed: 0,idx,lang,lemma,original_word,pos,user_category,is_aspect,is_adj,tweet_aspect_count
1,0,German,diskussion,diskussion,NN,PostDoc,1,0,3
8,0,German,bedingung,bedingungen,NN,PostDoc,1,0,3
11,0,German,leistung,leistung,NN,PostDoc,1,0,3
14,18167,German,leistung,leistung,NN,PostDoc,1,0,2
17,18167,German,diskussion,diskussionen,NN,PostDoc,1,0,2
...,...,...,...,...,...,...,...,...,...
321062,120166,English,passport,passports,NNS,Unknown,1,0,1
321072,120303,English,passport,passports,NNS,Unknown,1,0,1
321083,120368,English,passport,passport,NN,Unknown,1,0,1
321111,120423,English,passport,passports,NNS,Unknown,1,0,1


In [None]:
# Preview: the token rows flagged as adjectives.
df_tweet_aspects_adj

Unnamed: 0,idx,lang,lemma,original_word,pos,user_category,is_aspect,is_adj,tweet_aspect_count
5,0,German,selektiv,selektive,ADJ(A),PostDoc,0,1,3
7,0,German,äußer,äußere,ADJ(A),PostDoc,0,1,3
13,18167,German,grandios,grandiose,ADJ(A),PostDoc,0,1,2
15,18167,German,monatelang,monatelange,ADJ(A),PostDoc,0,1,2
16,18167,German,intern,interne,ADJ(A),PostDoc,0,1,2
...,...,...,...,...,...,...,...,...,...
321103,120423,English,short,short,JJ,Unknown,0,1,1
321109,120423,English,wonderful,wonderful,JJ,Unknown,0,1,1
321115,120423,English,short,short,JJ,Unknown,0,1,1
321119,16933,English,non,non,JJ,Unknown,0,1,1


In [None]:
# Preview: for aspect rows with tweet_aspect_count > 1, join each tweet's aspect lemmas with "___".
# transform() repeats the joined string on every row of a tweet; drop_duplicates() then removes the
# repeats -- and, since only the string is kept here, also collapses different tweets with the same string.
df_tweet_aspects_aspects[df_tweet_aspects_aspects["tweet_aspect_count"] > 1][['idx', 'lemma']].groupby(['idx'])['lemma'].transform(lambda x: '___'.join(x)).drop_duplicates()

1                         diskussion___bedingung___leistung
14                                    leistung___diskussion
21        grund___diskussion___uni___prof___dauerstelle_...
35         diskussion___entfristung___professur___professur
60                                     diskussion___artikel
                                ...                        
320186                          science___science___science
320374                                    scholar___scholar
320401                                   scholar___passport
320490                         scholar___passport___scholar
320657                                   passport___scholar
Name: lemma, Length: 12796, dtype: object

In [None]:
# Preview: aspect rows with tweet_aspect_count > 1 (presumably tweets containing more than one aspect token).
df_tweet_aspects_aspects[df_tweet_aspects_aspects["tweet_aspect_count"] > 1]

Unnamed: 0,idx,lang,lemma,original_word,pos,user_category,is_aspect,is_adj,tweet_aspect_count
1,0,German,diskussion,diskussion,NN,PostDoc,1,0,3
8,0,German,bedingung,bedingungen,NN,PostDoc,1,0,3
11,0,German,leistung,leistung,NN,PostDoc,1,0,3
14,18167,German,leistung,leistung,NN,PostDoc,1,0,2
17,18167,German,diskussion,diskussionen,NN,PostDoc,1,0,2
...,...,...,...,...,...,...,...,...,...
320665,120169,English,scholar,scholar,NN,Unknown,1,0,2
320697,120175,English,scholar,scholars,NNS,Unknown,1,0,2
320699,120175,English,passport,passports,NNS,Unknown,1,0,2
320712,120176,English,scholar,scholars,NNS,Unknown,1,0,2


In [None]:
# Adjective rows of tweets whose tweet_aspect_count exceeds 1, filtered once and reused.
adj_rows_multi_aspect = df_tweet_aspects_adj[df_tweet_aspects_adj["tweet_aspect_count"] > 1]

# One row per tweet: its idx plus all of its adjective lemmas joined with "___".
# transform() repeats the joined string on each row of the tweet; drop_duplicates() removes the repeats.
df_adj_combos = (
    adj_rows_multi_aspect[["idx"]]
    .assign(
        Adj_Combined=adj_rows_multi_aspect.groupby("idx")["lemma"].transform(
            lambda lemmas: "___".join(lemmas)
        )
    )
    .drop_duplicates()
)
df_adj_combos

Unnamed: 0,idx,Adj_Combined
5,0,selektiv___äußer
13,18167,grandios___monatelang___intern
25,18189,wisszeitvg___malperform
33,18603,richtig___pur___frühzeitig___grundsätzlich
59,10122,komplett___empfehlenswert___sensationell
...,...,...
320581,114365,privileged
320611,120086,white___arabic___cultural___german___arabic
320652,120169,absent
320700,120175,future


In [None]:
# Aspect rows of tweets whose tweet_aspect_count exceeds 1, filtered once and reused.
aspect_rows_multi_aspect = df_tweet_aspects_aspects[df_tweet_aspects_aspects["tweet_aspect_count"] > 1]

# One row per tweet: idx, language, user category and all aspect lemmas joined with "___".
# transform() repeats the joined string on each row of the tweet; drop_duplicates() removes the repeats.
df_aspect_combos = (
    aspect_rows_multi_aspect[["idx", "lang", "user_category"]]
    .assign(
        Aspect_Combined=aspect_rows_multi_aspect.groupby("idx")["lemma"].transform(
            lambda lemmas: "___".join(lemmas)
        )
    )
    .drop_duplicates()
)
df_aspect_combos

Unnamed: 0,idx,lang,user_category,Aspect_Combined
1,0,German,PostDoc,diskussion___bedingung___leistung
14,18167,German,PostDoc,leistung___diskussion
21,18189,German,PostDoc,grund___diskussion___uni___prof___dauerstelle_...
35,18603,German,PostDoc,diskussion___entfristung___professur___professur
60,10122,German,PostDoc,diskussion___artikel
...,...,...,...,...
320571,114365,English,Unknown,scholar___passport
320612,120086,English,Unknown,scholar___passport
320657,120169,English,Unknown,passport___scholar
320697,120175,English,Unknown,scholar___passport


In [None]:
# Pair each tweet's joined aspects with its joined adjectives.
# Default inner join: tweets missing from either frame are dropped.
df_aspect_adj_combos = pd.merge(df_aspect_combos, df_adj_combos, on="idx")
df_aspect_adj_combos

Unnamed: 0,idx,lang,user_category,Aspect_Combined,Adj_Combined
0,0,German,PostDoc,diskussion___bedingung___leistung,selektiv___äußer
1,18167,German,PostDoc,leistung___diskussion,grandios___monatelang___intern
2,18189,German,PostDoc,grund___diskussion___uni___prof___dauerstelle_...,wisszeitvg___malperform
3,18603,German,PostDoc,diskussion___entfristung___professur___professur,richtig___pur___frühzeitig___grundsätzlich
4,10122,German,PostDoc,diskussion___artikel,komplett___empfehlenswert___sensationell
...,...,...,...,...,...
16768,114365,English,Unknown,scholar___passport,privileged
16769,120086,English,Unknown,scholar___passport,white___arabic___cultural___german___arabic
16770,120169,English,Unknown,passport___scholar,absent
16771,120175,English,Unknown,scholar___passport,future


In [None]:
def get_aspect_combos(lang_col, cat_col, aspect_combo_col):
    """Count how often each unordered pair of aspects occurs together in a tweet.

    Parameters
    ----------
    lang_col, cat_col : sequence of str
        Language and user category of each tweet.
    aspect_combo_col : sequence of str
        The tweet's aspects joined with "___" (for example "job___career").
        The three sequences are walked in parallel, one entry per tweet.

    Returns
    -------
    pandas.DataFrame
        Columns "Category", "Language", "Aspects", "Count". "Aspects" holds the
        alphabetically ordered pair joined with "___". Rows are sorted by
        descending Count; ties are ordered by Category, Language, Aspects.
        Empty input (or no tweet with two distinct aspects) gives an empty
        frame with the same columns.
    """
    output_columns = ["Category", "Language", "Aspects", "Count"]
    key_columns = output_columns[:3]

    pair_records = []
    for lang, category, combined in zip(lang_col, cat_col, aspect_combo_col):
        # Deduplicate and sort so a pair is counted once per tweet and is
        # always written in the same order.
        aspects = sorted(set(combined.split("___")))
        for position, first in enumerate(aspects):
            for second in aspects[position + 1:]:
                pair_records.append((category, lang, first + "___" + second))

    if not pair_records:
        return pd.DataFrame(columns=output_columns)

    # Keep language / category in their own columns instead of packing them
    # into one delimited string and splitting it again. The count column is
    # named explicitly, because the name produced by
    # value_counts().reset_index() differs between pandas versions
    # (0 before 2.0, "count" from 2.0 on).
    pairs = pd.DataFrame(pair_records, columns=key_columns)
    counts = pairs.groupby(key_columns, sort=True).size().reset_index(name="Count")
    # Stable sort keeps the key order from the groupby among equal counts.
    counts = counts.sort_values("Count", ascending=False, kind="mergesort").reset_index(drop=True)

    return counts[output_columns]

def get_aspect_adj_combos(lang_col, cat_col, aspect_combo_col, adj_combo_col):
    """Count how often each (aspect, adjective) pair occurs together in a tweet.

    Parameters
    ----------
    lang_col, cat_col : sequence of str
        Language and user category of each tweet.
    aspect_combo_col, adj_combo_col : sequence of str
        The tweet's aspects / adjectives, each joined with "___".
        The four sequences are walked in parallel, one entry per tweet.

    Returns
    -------
    pandas.DataFrame
        Columns "Category", "Language", "Aspect_Adj", "Count". "Aspect_Adj" is
        "<aspect>___<adjective>". Rows are sorted by descending Count; ties are
        ordered by Category, Language, Aspect_Adj. Empty input gives an empty
        frame with the same columns.
    """
    output_columns = ["Category", "Language", "Aspect_Adj", "Count"]
    key_columns = output_columns[:3]

    pair_records = []
    for lang, category, aspects_joined, adjs_joined in zip(lang_col, cat_col, aspect_combo_col, adj_combo_col):
        # Deduplicate so every pair is counted at most once per tweet.
        aspects = sorted(set(aspects_joined.split("___")))
        adjectives = sorted(set(adjs_joined.split("___")))

        for aspect in aspects:
            for adj in adjectives:
                # A lemma that is both aspect and adjective is not paired with itself.
                if aspect != adj:
                    pair_records.append((category, lang, aspect + "___" + adj))

    if not pair_records:
        return pd.DataFrame(columns=output_columns)

    # Keep language / category in their own columns instead of packing them
    # into one delimited string and splitting it again. The count column is
    # named explicitly, because the name produced by
    # value_counts().reset_index() differs between pandas versions
    # (0 before 2.0, "count" from 2.0 on).
    pairs = pd.DataFrame(pair_records, columns=key_columns)
    counts = pairs.groupby(key_columns, sort=True).size().reset_index(name="Count")
    # Stable sort keeps the key order from the groupby among equal counts.
    counts = counts.sort_values("Count", ascending=False, kind="mergesort").reset_index(drop=True)

    return counts[output_columns]

In [None]:
# Count aspect-aspect co-occurrences over the tweets that have both aspects and adjectives.
df_co_occurring_aspects = get_aspect_combos(
    lang_col=df_aspect_adj_combos["lang"].to_list(),
    cat_col=df_aspect_adj_combos["user_category"].to_list(),
    aspect_combo_col=df_aspect_adj_combos["Aspect_Combined"].to_list(),
)

In [None]:
# Preview: aspect pairs with their co-occurrence counts.
df_co_occurring_aspects

Unnamed: 0,Category,Language,Aspects,Count
0,PostDoc,English,condition___work,88
1,Others,German,arbeitsbedingung___wissenschaft,79
2,PostDoc,German,jahr___wissenschaft,76
3,PostDoc,German,jahr___uni,75
4,PostDoc,English,academia___work,68
...,...,...,...,...
13300,PostDoc,German,bmbf___track,1
13301,PostDoc,German,bmbf___uhr,1
13302,Professor,German,chance___druck,1
13303,Professor,German,chance___drittmittel,1


In [None]:
# Count aspect-adjective co-occurrences over the same tweets.
df_co_occurring_aspect_adj = get_aspect_adj_combos(
    lang_col=df_aspect_adj_combos["lang"].to_list(),
    cat_col=df_aspect_adj_combos["user_category"].to_list(),
    aspect_combo_col=df_aspect_adj_combos["Aspect_Combined"].to_list(),
    adj_combo_col=df_aspect_adj_combos["Adj_Combined"].to_list(),
)

In [None]:
# Preview only the aspect-adjective pairs counted more than 50 times (ad-hoc display threshold).
df_co_occurring_aspect_adj[df_co_occurring_aspect_adj["Count"] > 50]

Unnamed: 0,Category,Language,Aspect_Adj,Count
0,PostDoc,English,academia___german,87
1,Others,German,vertrag___befristet,74
2,Others,German,wissenschaft___prekär,72
3,PostDoc,English,academic___german,70
4,PostDoc,German,universität___hierarchisch,66
5,PostDoc,German,universität___österreichisch,65
6,PostDoc,German,universität___demokratiepolitisch,64
7,PostDoc,German,sauerei___hierarchisch,64
8,PostDoc,German,sauerei___österreichisch,64
9,PostDoc,German,sauerei___demokratiepolitisch,64


In [None]:
# Write the aspect-aspect co-occurrence table to disk, then offer the same file as a Colab download.
co_occurring_aspects_csv_name = "Final_Co-occuring_Aspects.csv"
df_co_occurring_aspects.to_csv(co_occurring_aspects_csv_name, index=False)
files.download(co_occurring_aspects_csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Write the aspect-adjective co-occurrence table to disk, then offer the same file as a Colab download.
co_occurring_aspect_adj_csv_name = "Final_Co-occuring_Aspect_Adj.csv"
df_co_occurring_aspect_adj.to_csv(co_occurring_aspect_adj_csv_name, index=False)
files.download(co_occurring_aspect_adj_csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Test Section

In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag
import numpy as np
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Scratch lookup in SentiWordNet; the resulting list is neither stored nor displayed.
list(swn.senti_synsets('slow'))
sentence='It was a bad day'

# Tokenise the sample sentence and POS-tag the tokens with NLTK, then print both.
token = nltk.word_tokenize(sentence)
after_tagging = nltk.pos_tag(token)
print (token)
print (after_tagging)
def penn_to_wn(tag):
    """
    Map a Penn Treebank tag to the matching WordNet part-of-speech constant.

    Only the first letter of the tag is inspected:
    J -> adjective, N -> noun, R -> adverb, V -> verb.
    Any other tag (including an empty one) yields None.
    """
    first_letter_to_wn = {
        'J': wn.ADJ,
        'N': wn.NOUN,
        'R': wn.ADV,
        'V': wn.VERB,
    }
    return first_letter_to_wn.get(tag[:1])
from nltk.stem import WordNetLemmatizer

# Running total of (positive - negative) SentiWordNet scores and the number of scored tokens.
sentiment = 0.0
tokens_count = 0
lemmatizer = WordNetLemmatizer()

for word, tag in after_tagging:
    # Only nouns, adjectives and adverbs are scored; verbs and untagged words are skipped.
    wn_tag = penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        continue

    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        continue

    synsets = wn.synsets(lemma, pos=wn_tag)
    if not synsets:
        continue

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())
    print(swn_synset)

    sentiment += swn_synset.pos_score() - swn_synset.neg_score()
    tokens_count += 1

print(sentiment)

['It', 'was', 'a', 'bad', 'day']
[('It', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('bad', 'JJ'), ('day', 'NN')]
<bad.a.01: PosScore=0.0 NegScore=0.625>
<day.n.01: PosScore=0.0 NegScore=0.0>
-0.625


In [None]:
# NOTE(review): this call passes two positional arguments, but the get_en_sentiment defined
# further down in this notebook takes a single `row`, and that definition comes after this cell.
# On a fresh kernel this cell cannot run as written -- confirm which version was intended.
get_en_sentiment("beautiful", "N")

[nan, nan]

In [None]:
# Probe: whole sentence mixing a negative first clause with a positive second clause.
# Relies on the global `model` loaded in the Approach 2 section.
input_text = "Das Essen war nicht wirklich gut, aber das Ambiente war schön"
model.sentiment(input_text, return_probability=True)

{'label': 'positive',
 'probability': {'negative': 0.09191258996725082,
  'neutral': 0.17488881945610046,
  'positive': 0.7331986427307129}}

In [None]:
# Probe: the first (negative) clause on its own.
input_text = "Das Essen war nicht wirklich gut"
model.sentiment(input_text, return_probability=True)

{'label': 'negative',
 'probability': {'negative': 0.9092323780059814,
  'neutral': 0.06772072613239288,
  'positive': 0.02304690331220627}}

In [None]:
# Probe: the second (positive) clause on its own.
input_text = "aber das Ambiente war schön"
model.sentiment(input_text, return_probability=True)

{'label': 'positive',
 'probability': {'negative': 0.0425802506506443,
  'neutral': 0.11027234047651291,
  'positive': 0.8471474051475525}}

In [None]:
# Lemmatizer read by get_en_sentiment below through this global name.
# WordNetLemmatizer is imported in an earlier cell of this section, so that cell must run first.
lemmatizer = WordNetLemmatizer()

# Maps a fine-grained POS tag to a coarser code (several tags share one code).
# NOTE(review): "VA(FIN)" and "ART" both map to "AV", the same code as "ADV" -- confirm this is intended.
lookup_pos = {
    "VV(FIN)": "V",
    "NN": "N",
    "NE": "NE",
    "ADJ(A)": "AJ",
    "ADV": "AV",
    "VV(PP)": "V",
    "ADJ(D)": "AJ",
    "XY": "XY",
    "FM": "FM",
    "VV(INF)": "V",
    "VV(IMP)": "V",
    "APPR": "APPR",
    "VM(FIN)": "V",
    "NNA": "N",
    "PIAT": "PIAT",
    "PTKVZ": "PTKVZ",
    "VV(IZU)": "V",
    "PWAV": "PWS",
    "CARD": "CARD",
    "ITJ": "ITJ",
    "NNI": "N",
    "PIS": "PIS",
    "PROAV": "AV",
    "APPRART": "APPR",
    "APZR": "APPR",
    "PDS": "PWS",
    "PPOSAT": "PPOSAT",
    "KON": "KON",
    "VA(FIN)": "AV",
    "PRF": "PIS",
    "PDAT": "PIAT",
    "PWAT": "PIAT",
    "APPO": "APPR",
    "ART": "AV",
    "VA(INF)": "V",
}

def hanta_to_wn(tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    J -> adjective, N -> noun, R -> adverb, V -> verb; anything else gives
    None. Tags that begin with another letter, such as "ADJ(A)", therefore
    also give None.
    """
    prefix_to_wn = (
        ('J', wn.ADJ),
        ('N', wn.NOUN),
        ('R', wn.ADV),
        ('V', wn.VERB),
    )
    for prefix, wn_pos in prefix_to_wn:
        if tag.startswith(prefix):
            return wn_pos
    return None

def get_en_sentiment(row):
    """Return [negative_score, positive_score] for one lemma row.

    Rows whose "lang" is not "English" are passed through with the
    "negative_score" / "positive_score" values they already carry. English
    rows are scored with SentiWordNet using the first WordNet sense of the
    lemma; only nouns, adjectives and adverbs are scored. Whenever no score
    can be looked up the result is [0.0, 0.0].

    Reads the notebook-level `lemmatizer`, `wn` and `swn`, and prints the
    SentiWordNet synset it used.
    """
    word = row["lemma"]
    language = row["lang"]
    pos_tag = row["pos"]

    no_score = [0.0, 0.0]

    # Non-English rows keep their existing scores untouched.
    if language != "English":
        return [row["negative_score"], row["positive_score"]]

    wordnet_pos = hanta_to_wn(pos_tag)
    if wordnet_pos not in (wn.NOUN, wn.ADJ, wn.ADV):
        return no_score

    lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
    candidates = wn.synsets(lemma, pos=wordnet_pos) if lemma else []
    if not candidates:
        return no_score

    # The first synset is taken as the sense to score.
    first_sense = swn.senti_synset(candidates[0].name())
    print(first_sense)

    return [first_sense.neg_score(), first_sense.pos_score()]