# Word Level POS Extraction from Tweet

In [None]:
!pip install HanTa

In [None]:
from HanTa import HanoverTagger as ht
import pandas as pd
import re
from google.colab import files
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Raw tweet table plus one HanTa morphological tagger per supported language.
df = pd.read_csv("Final_Tweet_Data.csv")
germantagger = ht.HanoverTagger('morphmodel_ger.pgz')
englishtagger = ht.HanoverTagger('morphmodel_en.pgz')

# Cleaning regexes; every match is later replaced by a single space.
URL_PATTERN = re.compile(r'https?://[^ ]+')
MENTIONS_PATTERN = re.compile(r'@[^ ]+')
HASHTAGS_PATTERN = re.compile(r'#[^ ]+')
# Anything that is not a Latin letter (incl. the accented range À-ž) or a space.
SPECIAL_CHARS_PATTERN = re.compile(r'[^A-Za-zÀ-ž ]')
MULTIPLE_SPACES_PATTERN = re.compile(' +')

# Order matters: URLs, mentions and hashtags are removed before the
# special-character pass strips the ':', '@' and '#' they are recognised by.
pattern_list = [
    URL_PATTERN,
    MENTIONS_PATTERN,
    HASHTAGS_PATTERN,
    SPECIAL_CHARS_PATTERN,
    MULTIPLE_SPACES_PATTERN,
]

englishStopWords = nltk.corpus.stopwords.words("english")

# The German stop-word file holds comma-separated words on each line.
germanStopWords = []
with open('german_stopwords.txt', encoding='utf-8') as inputfile:
    germanStopWords.extend(
        word for line in inputfile for word in line.strip().split(',')
    )

In [None]:
def text_process_pos(input_text_col, lang_col, idx_col, pattern_list):
    """Clean every English/German tweet and POS-tag its remaining words.

    Each text is passed through ``pattern_list`` (every match replaced by a
    space), lower-cased and stripped. Stop words and words shorter than three
    characters are dropped, and the rest is tagged with the module-level
    ``englishtagger`` / ``germantagger``. Stop words come from the
    module-level ``englishStopWords`` / ``germanStopWords``.

    Args:
        input_text_col: Sequence of raw tweet texts. Entries that are not
            strings (e.g. NaN for a missing text) are skipped.
        lang_col: Sequence of language codes aligned with ``input_text_col``;
            only "en" and "de" are processed.
        idx_col: Sequence of tweet identifiers aligned with ``input_text_col``.
        pattern_list: Compiled regexes, applied in order.

    Returns:
        A DataFrame with columns ``idx``, ``lang``, ``lemma`` (lower-cased)
        and ``pos``, one row per tagged word, or ``None`` when no word was
        tagged.
    """
    tweet_pos_tbl = []

    # (stop-word set, tagger) per language, built on first use so that the
    # stop-word list is converted to a set only once.
    resources = {}

    # zip iterates positionally, so the columns need not share a 0..n-1 index.
    for input_text, lang, idx in zip(input_text_col, lang_col, idx_col):

        if lang not in ("en", "de"):
            continue

        # A type check instead of `is np.nan`: it needs no numpy import and
        # does not depend on the missing value being the np.nan singleton.
        if not isinstance(input_text, str):
            continue

        for pattern in pattern_list:
            input_text = pattern.sub(" ", input_text)

        # Convert to lower case
        input_text = input_text.lower().strip()

        if not input_text:
            continue

        if lang not in resources:
            if lang == "en":
                resources[lang] = (set(englishStopWords), englishtagger)
            else:
                resources[lang] = (set(germanStopWords), germantagger)
        stop_words, tagger = resources[lang]

        tokens = [word for word in input_text.split()
                  if word not in stop_words and len(word) >= 3]
        if not tokens:
            # Nothing left to tag for this tweet.
            continue

        tweet_pos_tbl.extend(
            [idx, lang, lemma.lower(), pos]
            for (_word, lemma, pos) in tagger.tag_sent(tokens)
        )

    if tweet_pos_tbl:
        return pd.DataFrame(tweet_pos_tbl, columns=["idx", "lang", "lemma", "pos"])
    return None

In [None]:
# Build the word-level POS table for every English/German tweet in df.
tweet_pos_df = text_process_pos(df["text"], df["lang"], df["idx"], pattern_list)

In [None]:
# Persist the POS table (the later sections re-read this file) and download it from Colab.
# NOTE(review): text_process_pos returns None when nothing was tagged; to_csv would fail in that case.
tweet_pos_df.to_csv("Tweet_POS.csv", index=False)
files.download("Tweet_POS.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the POS table: one row per kept token with its tweet idx, language, lemma and tag.
tweet_pos_df

Unnamed: 0,idx,lang,lemma,pos
0,0,de,zeigen,VV(FIN)
1,0,de,diskussion,NN
2,0,de,beobachten,VV(FIN)
3,0,de,berufung,NN
4,0,de,art,NE
...,...,...,...,...
453790,176695,en,collection,NN
453791,176695,en,info,CD
453792,176695,en,resource,NNS
453793,176695,en,call,VBZ


# Aspect Extraction

In [None]:
import pandas as pd
from google.colab import files
# Token-level POS table written by the POS extraction section.
df_tweet_pos = pd.read_csv("Tweet_POS.csv")
# Original tweets; used below for the idx -> user_category lookup.
df = pd.read_csv("Final_Tweet_Data.csv")

In [None]:
# Per-language count of tokens tagged NP / NNS / NPS / NN_VAR.
df_tweet_pos[df_tweet_pos["pos"].isin(["NP", "NNS", "NPS", "NN_VAR"])]["lang"].value_counts()

en    12035
Name: lang, dtype: int64

In [None]:
# 1 when the token's tag is one of the listed noun (resp. adjective) tags, else 0.
df_tweet_pos["has_noun"] = df_tweet_pos["pos"].isin(["NN", "NE", "NA", "NP", "NNS", "NPS", "NN_VAR"]).astype("int64")
df_tweet_pos["has_adj"] = df_tweet_pos["pos"].isin(["JJ", "ADJ(A)", "ADJ(D)"]).astype("int64")

In [None]:
# Number of noun and adjective tokens per tweet, one row per idx.
df_noun_adj_flag = df_tweet_pos.groupby(by="idx", as_index=False)[["has_noun", "has_adj"]].sum()

# Attach the per-tweet totals (they get the "_y" suffix because the per-token
# flags of the same name are already present) and the tweet author's category.
new_df_tweet_pos = (
    df_tweet_pos.merge(df_noun_adj_flag, on='idx')
    [['idx', 'lang', 'lemma', 'pos', 'has_noun_y', 'has_adj_y']]
    .merge(df[["idx", "user_category"]], on="idx")
)

# Keep the noun tokens of tweets that contain at least one noun and one adjective.
new_df_tweet_pos = new_df_tweet_pos.loc[
    (new_df_tweet_pos["has_adj_y"] > 0)
    & (new_df_tweet_pos["has_noun_y"] > 0)
    & new_df_tweet_pos["pos"].isin(["NN", "NE", "NA", "NP", "NNS", "NPS", "NN_VAR"])
]

# Occurrences of every noun lemma per (user category, language).
agg_dict = {'lemma': ['count']}
new_df_tweet_pos = new_df_tweet_pos.groupby(by=["user_category", "lang", "lemma"]).agg(agg_dict)
new_df_tweet_pos.columns = ["Count"]
new_df_tweet_pos = new_df_tweet_pos.reset_index().rename(
    columns={"user_category": "Category", "lang": "Language", "lemma": "Aspect"}
)
new_df_tweet_pos["Language"] = new_df_tweet_pos["Language"].map({'en': 'English', 'de': 'German'})

In [None]:
# German aspects of the "Professor" category, most frequent first.
new_df_tweet_pos.loc[
    lambda d: (d["Category"] == "Professor") & (d["Language"] == "German")
].sort_values("Count", ascending=False)

Unnamed: 0,Category,Language,Aspect,Count
30499,Professor,German,jahr,690
34014,Professor,German,wissenschaft,662
33431,Professor,German,uni,391
29542,Professor,German,forschung,373
32063,Professor,German,professur,309
...,...,...,...,...
30425,Professor,German,innengeneration,1
30423,Professor,German,innenbildung,1
30420,Professor,German,inkrafttreten,1
30418,Professor,German,inkompatibilität,1


In [None]:
# Preview the full aspect table (Category, Language, Aspect, Count).
new_df_tweet_pos

Unnamed: 0,Category,Language,Aspect,Count
0,Lecturer,German,abhängigkeit,1
1,Lecturer,German,abhängigkeitshierarchie,1
2,Lecturer,German,ablehnung,1
3,Lecturer,German,absagenwahnsinn,1
4,Lecturer,German,abschluss,2
...,...,...,...,...
44665,Unknown,English,youtube,1
44666,Unknown,English,yr,1
44667,Unknown,English,zeitvertrag,1
44668,Unknown,English,zero,2


In [None]:
# Save every aspect candidate, regardless of frequency, and download it from Colab.
new_df_tweet_pos.to_csv("AspectsWithoutCategoryAll.csv", index=False)
files.download("AspectsWithoutCategoryAll.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Aspects that occur more than 50 times within their (Category, Language) group.
new_df_tweet_pos[new_df_tweet_pos["Count"] > 50]

Unnamed: 0,Category,Language,Aspect,Count
1079,Others,German,antwort,70
1095,Others,German,arbeit,273
1110,Others,German,arbeitsbedingung,261
1171,Others,German,arbeitsverhältnis,67
1172,Others,German,arbeitsvertrag,57
...,...,...,...,...
44156,Unknown,English,people,55
44304,Unknown,English,research,62
44352,Unknown,English,scholar,55
44654,Unknown,English,work,63


In [None]:
# Save only the frequent aspects (Count > 50); both ABSA approaches load this file as df_aspects.
new_df_tweet_pos[new_df_tweet_pos["Count"] > 50].to_csv("AspectsWithoutCategory.csv", index=False)
files.download("AspectsWithoutCategory.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# ABSA Approach 1

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Token-level POS table from the POS extraction section.
df_tweet_pos = pd.read_csv("Tweet_POS.csv")
# Frequent aspects (Category, Language, Aspect, Count) from the aspect extraction section.
df_aspects = pd.read_csv("AspectsWithoutCategory.csv")
# Original tweets; used for the idx -> user_category lookup.
df = pd.read_csv("Final_Tweet_Data.csv")
# Sentiment lexicon; the cells below use its "lemma", "PoS" and "sentiment" columns.
df_sentimerge = pd.read_csv('sentimerge.csv')

In [None]:
# WordNet lemmatizer used by get_en_sentiment for English tokens.
lemmatizer = WordNetLemmatizer()

# Maps a tag found in the "pos" column to the tag used for the join against df_sentimerge["PoS"].
# NOTE(review): "VA(FIN)" and "ART" map to "AV" while "VA(INF)" maps to "V" — confirm this is intended.
lookup_pos = { "VV(FIN)" : "V", "NN" : "N", "NE" : "NE", "ADJ(A)" : "AJ", "ADV" : "AV", "VV(PP)" : "V", "ADJ(D)" : "AJ", "XY" : "XY", "FM" : "FM", "VV(INF)" : "V", "VV(IMP)" : "V", "APPR" : "APPR", "VM(FIN)" : "V", "NNA" : "N", "PIAT" : "PIAT", "PTKVZ" : "PTKVZ", "VV(IZU)" : "V", "PWAV" : "PWS", "CARD" : "CARD", "ITJ" : "ITJ", "NNI" : "N", "PIS" : "PIS", "PROAV" : "AV", "APPRART": "APPR", "APZR": "APPR", "PDS" : "PWS", "PPOSAT" : "PPOSAT", "KON" : "KON", "VA(FIN)" : "AV", "PRF" : "PIS", "PDAT" : "PIAT", "PWAT" : "PIAT", "APPO" : "APPR", "ART" : "AV", "VA(INF)" : "V" }

def hanta_to_wn(tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    Tags starting with 'J' give ``wn.ADJ``, 'N' gives ``wn.NOUN``, 'R' gives
    ``wn.ADV`` and 'V' gives ``wn.VERB``; any other tag gives ``None``.
    """
    if tag.startswith('J'):
        return wn.ADJ
    if tag.startswith('N'):
        return wn.NOUN
    if tag.startswith('R'):
        return wn.ADV
    if tag.startswith('V'):
        return wn.VERB
    return None

def get_en_sentiment(row):
    """Return ``[negative_score, positive_score]`` for one token row.

    Rows whose ``lang`` is not "English" keep the scores already stored in
    ``row["negative_score"]`` and ``row["positive_score"]``. English rows are
    scored from SentiWordNet using the first WordNet synset of the lemmatized
    word. Only nouns, adjectives and adverbs are scored; every other case
    (other parts of speech, missing tag, no lemma, no synset) yields
    ``[0.0, 0.0]``.

    NOTE(review): the pass-through scores are stored with their sign
    (negative_score is <= 0 where it is computed in this notebook), while
    SentiWordNet's neg_score appears to be non-negative — confirm the sign
    convention before comparing languages.

    Args:
        row: Mapping-like row with "lemma", "lang" and "pos"; non-English
            rows must also carry "negative_score" and "positive_score".

    Returns:
        A two-element list ``[negative_score, positive_score]``.
    """
    input_word = row["lemma"]
    lang = row["lang"]
    pos = row["pos"]

    error_op = [0.0, 0.0]

    if lang not in ["English"]:
        return [row["negative_score"], row["positive_score"]]

    # A missing tag (NaN/None) cannot be mapped to a WordNet part of speech.
    if not isinstance(pos, str):
        return error_op

    wn_tag = hanta_to_wn(pos)

    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return error_op

    lemma = lemmatizer.lemmatize(input_word, pos=wn_tag)
    if not lemma:
        return error_op

    synsets = wn.synsets(lemma, pos=wn_tag)
    if not synsets:
        return error_op

    # Only the first synset returned by WordNet is used.
    swn_synset = swn.senti_synset(synsets[0].name())

    return [swn_synset.neg_score(), swn_synset.pos_score()]

In [None]:
# Keep only those tweets that have aspects
# Flag noun and adjective tokens
df_tweet_pos["has_noun"] = df_tweet_pos["pos"].apply(lambda x: 1 if x in ["NN", "NE", "NA", "NP", "NNS", "NPS", "NN_VAR"] else 0)
df_tweet_pos["has_adj"] = df_tweet_pos["pos"].apply(lambda x: 1 if x in ["JJ", "ADJ(A)", "ADJ(D)"] else 0)

# Group together to get count of nouns and adjectives in each tweet
agg_dict = {'has_noun': ['sum'], 'has_adj': ["sum"]}
df_noun_adj_flag = df_tweet_pos.groupby(by="idx").agg(agg_dict)
df_noun_adj_flag.columns = df_noun_adj_flag.columns.droplevel(1)
df_noun_adj_flag.reset_index(inplace=True)

# Add counts to the tweet_pos dataframe; the per-tweet totals get the "_y"
# suffix because the per-token flags of the same name are already present.
new_df_tweet_pos = df_tweet_pos.merge(df_noun_adj_flag, on='idx')
new_df_tweet_pos["lang"] = new_df_tweet_pos["lang"].map({'en': 'English', 'de': 'German'})
new_df_tweet_pos = new_df_tweet_pos[['idx', 'lang', 'lemma', 'pos', 'has_noun_y', 'has_adj_y']]

# Include user categories from tweets file
new_df_tweet_pos = new_df_tweet_pos.merge(df[["idx", "user_category"]], on="idx")

# Keep only those tweets that have at least one noun and one adjective
new_df_tweet_pos = new_df_tweet_pos[(new_df_tweet_pos["has_adj_y"] > 0) & (new_df_tweet_pos["has_noun_y"] > 0)]

# Keep only those tweets that contain at least one aspect term (all of their tokens are kept)
aspect_ids = new_df_tweet_pos.merge(df_aspects, left_on=["lang", "lemma", "user_category"], right_on=["Language", "Aspect", "Category"])['idx'].drop_duplicates()
aspect_ids = pd.DataFrame(aspect_ids, columns=["idx"])
df_tweets_with_aspects = aspect_ids.merge(new_df_tweet_pos)

# Mark Aspect terms: the left merge leaves "Category" missing for tokens that
# matched no aspect. notna() detects that without relying on the missing
# value being the np.nan singleton, as the previous `x is np.nan` test did.
df_tweets_with_aspects = df_tweets_with_aspects.merge(df_aspects, left_on=["user_category", "lang", "lemma"], right_on=["Category", "Language", "Aspect"], how="left")
df_tweets_with_aspects["is_aspect"] = df_tweets_with_aspects["Category"].notna().astype("int64")
df_tweets_with_aspects = df_tweets_with_aspects[['idx', 'lang', 'lemma', 'pos', 'user_category', 'is_aspect']]

# Mark number of aspects in each tweet
agg_dict = {'is_aspect': ['sum']}
df_aspect_tweet_count = df_tweets_with_aspects.groupby(by="idx").agg(agg_dict)
df_aspect_tweet_count.columns = df_aspect_tweet_count.columns.droplevel(1)
df_aspect_tweet_count.reset_index(inplace=True)
df_aspect_tweet_count.rename(columns={"is_aspect": "tweet_aspect_count"}, inplace=True)
df_tweets_with_aspects = df_tweets_with_aspects.merge(df_aspect_tweet_count, on="idx")

In [None]:
# Preview: every token of the tweets that contain an aspect, with its aspect flag and the tweet's aspect count.
df_tweets_with_aspects

Unnamed: 0,idx,lang,lemma,pos,user_category,is_aspect,tweet_aspect_count
0,0,German,zeigen,VV(FIN),PostDoc,0,3
1,0,German,diskussion,NN,PostDoc,1,3
2,0,German,beobachten,VV(FIN),PostDoc,0,3
3,0,German,berufung,NN,PostDoc,0,3
4,0,German,art,NE,PostDoc,0,3
...,...,...,...,...,...,...,...
327190,16933,English,non,JJ,Unknown,0,1
327191,16933,English,passport,NN,Unknown,1,1
327192,16933,English,holder,NN,Unknown,0,1
327193,16933,English,grateful,JJ,Unknown,0,1


In [None]:
# Work on an independent copy holding only the tokens of tweets 0 and 20368.
temp = df_tweets_with_aspects[df_tweets_with_aspects["idx"].isin([0, 20368])].copy()

In [None]:
# Preview the sample before any sentiment score is attached.
temp

Unnamed: 0,idx,lang,lemma,pos,user_category,is_aspect,tweet_aspect_count
0,0,German,zeigen,VV(FIN),PostDoc,0,3
1,0,German,diskussion,NN,PostDoc,1,3
2,0,German,beobachten,VV(FIN),PostDoc,0,3
3,0,German,berufung,NN,PostDoc,0,3
4,0,German,art,NE,PostDoc,0,3
5,0,German,selektiv,ADJ(A),PostDoc,0,3
6,0,German,amnesie,ADV,PostDoc,0,3
7,0,German,äußer,ADJ(A),PostDoc,0,3
8,0,German,bedingung,NN,PostDoc,1,3
9,0,German,zufall,NN,PostDoc,0,3


In [None]:
# Translate the tag to the lexicon's PoS value and look every (lemma, PoS)
# pair up in the lexicon; tokens without a match keep a missing sentiment.
temp = temp.assign(sentimerge_pos=temp["pos"].map(lookup_pos)).merge(
    df_sentimerge,
    left_on=["lemma", "sentimerge_pos"],
    right_on=["lemma", "PoS"],
    how="left",
)

# Split the signed sentiment into a negative and a positive part; values on
# the other side of zero, and missing values, become 0.
temp["negative_score"] = temp["sentiment"].where(temp["sentiment"] < 0, 0)
temp["positive_score"] = temp["sentiment"].where(temp["sentiment"] > 0, 0)

temp = temp[[
    'idx', 'lang', 'lemma', 'pos', 'user_category', 'is_aspect',
    'tweet_aspect_count', 'sentimerge_pos', 'negative_score', 'positive_score',
]]

In [None]:
# Preview the sample with the lexicon-based negative/positive scores.
temp

Unnamed: 0,idx,lang,lemma,pos,user_category,is_aspect,tweet_aspect_count,sentimerge_pos,negative_score,positive_score
0,0,German,zeigen,VV(FIN),PostDoc,0,3,V,-0.002771,0.0
1,0,German,diskussion,NN,PostDoc,1,3,N,-0.911402,0.0
2,0,German,beobachten,VV(FIN),PostDoc,0,3,V,0.0,0.040449
3,0,German,berufung,NN,PostDoc,0,3,N,0.0,0.038397
4,0,German,art,NE,PostDoc,0,3,NE,0.0,0.0
5,0,German,selektiv,ADJ(A),PostDoc,0,3,AJ,0.0,0.829953
6,0,German,amnesie,ADV,PostDoc,0,3,AV,0.0,0.0
7,0,German,äußer,ADJ(A),PostDoc,0,3,AJ,-0.000749,0.0
8,0,German,bedingung,NN,PostDoc,1,3,N,-0.399958,0.0
9,0,German,zufall,NN,PostDoc,0,3,N,0.0,0.015593


In [None]:
# Re-score row by row: get_en_sentiment returns the existing scores for non-English rows and
# SentiWordNet scores for English rows; result_type='expand' spreads the returned pair over both columns.
temp[["negative_score", "positive_score"]] = temp.apply(lambda row: get_en_sentiment(row), axis=1, result_type='expand')

<absurdity.n.01: PosScore=0.0 NegScore=0.0>
<policy.n.01: PosScore=0.0 NegScore=0.0>
<year.n.01: PosScore=0.0 NegScore=0.0>
<duration.n.01: PosScore=0.0 NegScore=0.0>
<particularly.r.01: PosScore=0.0 NegScore=0.0>
<guideline.n.01: PosScore=0.0 NegScore=0.0>
<university.n.01: PosScore=0.0 NegScore=0.0>
<today.n.01: PosScore=0.125 NegScore=0.0>
<publication.n.01: PosScore=0.0 NegScore=0.0>
<thesis.n.01: PosScore=0.0 NegScore=0.0>
<possible.a.01: PosScore=0.5 NegScore=0.0>


In [None]:
# Preview the sample after the English rows were re-scored.
temp

Unnamed: 0,idx,lang,lemma,pos,user_category,is_aspect,tweet_aspect_count,sentimerge_pos,negative_score,positive_score
0,0,German,zeigen,VV(FIN),PostDoc,0,3,V,-0.002771,0.0
1,0,German,diskussion,NN,PostDoc,1,3,N,-0.911402,0.0
2,0,German,beobachten,VV(FIN),PostDoc,0,3,V,0.0,0.040449
3,0,German,berufung,NN,PostDoc,0,3,N,0.0,0.038397
4,0,German,art,NE,PostDoc,0,3,NE,0.0,0.0
5,0,German,selektiv,ADJ(A),PostDoc,0,3,AJ,0.0,0.829953
6,0,German,amnesie,ADV,PostDoc,0,3,AV,0.0,0.0
7,0,German,äußer,ADJ(A),PostDoc,0,3,AJ,-0.000749,0.0
8,0,German,bedingung,NN,PostDoc,1,3,N,-0.399958,0.0
9,0,German,zufall,NN,PostDoc,0,3,N,0.0,0.015593


# ABSA Approach 2 - with Complete tweet sentiment

In [None]:
!pip install tweetnlp

In [None]:
import pandas as pd
import numpy as np
import tweetnlp
from google.colab import files

In [None]:
# Token-level POS table from the POS extraction section.
df_tweet_pos = pd.read_csv("Tweet_POS.csv")
# Frequent aspects (Category, Language, Aspect, Count) from the aspect extraction section.
df_aspects = pd.read_csv("AspectsWithoutCategory.csv")
# Original tweets; their "text" and "lang" columns feed the tweet-level sentiment model.
df = pd.read_csv("Final_Tweet_Data.csv")

In [None]:
# Multilingual tweet sentiment model used by predict_sentiment.
model = tweetnlp.load_model('sentiment', multilingual=True)

# NOTE(review): `re` is imported only in the first section's import cell, not in this section's.
URL_PATTERN = re.compile(r'https?://[^ ]+')
MENTIONS_PATTERN = re.compile(r'@[^ ]+')
HASHTAGS_PATTERN = re.compile(r'#[^ ]+')
# Unlike the POS-extraction pattern, this one keeps the punctuation marks . , ? !
SENTI_SPECIAL_CHARS_PATTERN = re.compile(r'[^A-Za-zÀ-ž.,?! ]')
MULTIPLE_SPACES_PATTERN = re.compile(' +')

# Applied in this order by predict_sentiment; every match is replaced by a space.
senti_pattern_list = [URL_PATTERN,
                      MENTIONS_PATTERN,
                      HASHTAGS_PATTERN,
                      SENTI_SPECIAL_CHARS_PATTERN,
                      MULTIPLE_SPACES_PATTERN]

def predict_sentiment(row, pattern_list):
    """Return the tweet-level sentiment label for one row of the tweet table.

    The text is cleaned with ``pattern_list`` (every match replaced by a
    space), lower-cased and stripped, then classified by the module-level
    ``model``.

    Args:
        row: Mapping-like row providing "text" and "lang".
        pattern_list: Compiled regexes, applied in order.

    Returns:
        The model's label with its first letter capitalised, or "__NA__" when
        the text is missing or not a string, the language is neither "de" nor
        "en", or nothing is left after cleaning.
    """
    error_op = "__NA__"

    input_text = row['text']
    lang = row['lang']

    # A type check instead of `is np.nan`: a NaN that is not the np.nan
    # singleton (or any other non-string) would otherwise crash pattern.sub.
    if not isinstance(input_text, str):
        return error_op

    if lang not in ("de", "en"):
        return error_op

    for pattern in pattern_list:
        input_text = pattern.sub(" ", input_text)

    # Convert to lower case
    input_text = input_text.lower().strip()

    if not input_text:
        return error_op

    return model.sentiment(input_text)['label'].capitalize()

In [None]:
# Skip this part if file already calculated and saved
# Runs the sentiment model once per tweet. Skip this cell when FullTweetSentiments.csv
# has already been computed and saved; a later cell loads that file instead.
df["Sentiment"] = df.apply(lambda row: predict_sentiment(row, senti_pattern_list), axis=1)

In [None]:
# Cache the tweets together with their predicted sentiment and download the file from Colab.
df.to_csv("FullTweetSentiments.csv", index=False)
files.download('FullTweetSentiments.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load pre-calculated sentiment file
# Load the cached tweet-level sentiments; only the "idx" and "Sentiment" columns are used below.
df_with_sentiment = pd.read_csv("FullTweetSentiments.csv")

In [None]:
# Keep only those tweets that have aspects
# Flag noun and adjective tokens
df_tweet_pos["has_noun"] = df_tweet_pos["pos"].apply(lambda x: 1 if x in ["NN", "NE", "NA", "NP", "NNS", "NPS", "NN_VAR"] else 0)
df_tweet_pos["has_adj"] = df_tweet_pos["pos"].apply(lambda x: 1 if x in ["JJ", "ADJ(A)", "ADJ(D)"] else 0)

# Group together to get count of nouns and adjectives in each tweet
agg_dict = {'has_noun': ['sum'], 'has_adj': ["sum"]}
df_noun_adj_flag = df_tweet_pos.groupby(by="idx").agg(agg_dict)
df_noun_adj_flag.columns = df_noun_adj_flag.columns.droplevel(1)
df_noun_adj_flag.reset_index(inplace=True)

# Add counts to the tweet_pos dataframe; the per-tweet totals get the "_y"
# suffix because the per-token flags of the same name are already present.
new_df_tweet_pos = df_tweet_pos.merge(df_noun_adj_flag, on='idx')
new_df_tweet_pos["lang"] = new_df_tweet_pos["lang"].map({'en': 'English', 'de': 'German'})
new_df_tweet_pos = new_df_tweet_pos[['idx', 'lang', 'lemma', 'pos', 'has_noun_y', 'has_adj_y']]

# Include user categories from tweets file
new_df_tweet_pos = new_df_tweet_pos.merge(df[["idx", "user_category"]], on="idx")

# Keep only those tweets that have at least one noun and one adjective
new_df_tweet_pos = new_df_tweet_pos[(new_df_tweet_pos["has_adj_y"] > 0) & (new_df_tweet_pos["has_noun_y"] > 0)]

# Keep only those tweets that contain at least one aspect term
aspect_ids = new_df_tweet_pos.merge(df_aspects, left_on=["lang", "lemma", "user_category"], right_on=["Language", "Aspect", "Category"])['idx'].drop_duplicates()
aspect_ids = pd.DataFrame(aspect_ids, columns=["idx"])
df_tweets_with_aspects = aspect_ids.merge(new_df_tweet_pos)

# Mark Aspect terms: the left merge leaves "Category" missing for tokens that
# matched no aspect. notna() detects that without relying on the missing
# value being the np.nan singleton, as the previous `x is np.nan` test did.
df_tweets_with_aspects = df_tweets_with_aspects.merge(df_aspects, left_on=["user_category", "lang", "lemma"], right_on=["Category", "Language", "Aspect"], how="left")
df_tweets_with_aspects["is_aspect"] = df_tweets_with_aspects["Category"].notna().astype("int64")
df_tweets_with_aspects = df_tweets_with_aspects[['idx', 'lang', 'lemma', 'pos', 'user_category', 'is_aspect']]

# Keep only aspect terms and give each one the sentiment of the tweet it occurs in.
# The join key is spelled out so that a column added to either frame later cannot change it.
df_tweets_with_aspects = df_tweets_with_aspects[df_tweets_with_aspects["is_aspect"] == 1]
df_tweets_with_aspects = df_tweets_with_aspects.merge(df_with_sentiment[["idx", "Sentiment"]], on="idx")

In [None]:
# One indicator column per sentiment label (1 when the tweet carries that label, else 0).
df_tweets_with_aspects["Negative"] = df_tweets_with_aspects["Sentiment"].eq("Negative").astype("int64")
df_tweets_with_aspects["Neutral"] = df_tweets_with_aspects["Sentiment"].eq("Neutral").astype("int64")
df_tweets_with_aspects["Positive"] = df_tweets_with_aspects["Sentiment"].eq("Positive").astype("int64")

In [None]:
# Preview: one row per aspect occurrence with the tweet's sentiment and its indicator columns.
df_tweets_with_aspects

Unnamed: 0,idx,lang,lemma,pos,user_category,is_aspect,Sentiment,Negative,Neutral,Positive
0,0,German,diskussion,NN,PostDoc,1,Neutral,0,1,0
1,0,German,bedingung,NN,PostDoc,1,Neutral,0,1,0
2,0,German,leistung,NN,PostDoc,1,Neutral,0,1,0
3,18167,German,leistung,NN,PostDoc,1,Neutral,0,1,0
4,18167,German,diskussion,NN,PostDoc,1,Neutral,0,1,0
...,...,...,...,...,...,...,...,...,...,...
60333,91530,English,passport,NNS,Unknown,1,Positive,0,0,1
60334,120166,English,passport,NNS,Unknown,1,Negative,1,0,0
60335,120368,English,passport,NN,Unknown,1,Negative,1,0,0
60336,120423,English,passport,NNS,Unknown,1,Negative,1,0,0


In [None]:
# Per (language, category, aspect): number of occurrences and how many of
# them fall in negative / neutral / positive tweets.
agg_dict = {'is_aspect': ['sum'], 'Negative': ["sum"], 'Neutral': ["sum"], 'Positive': ["sum"]}
df_tweets_with_aspects = (
    df_tweets_with_aspects
    .groupby(by=["lang", "user_category", "lemma"], as_index=False)
    .agg(agg_dict)
    .droplevel(1, axis=1)  # drop the aggregation-name level of the column index
    .rename(columns={"user_category": "Category", "lang": "Language", "lemma": "Aspect", "is_aspect": "Count"})
)

In [None]:
# Preview the aggregated aspect sentiment table.
df_tweets_with_aspects

Unnamed: 0,Language,Category,Aspect,Count,Negative,Neutral,Positive
0,English,Others,academia,89,43,28,18
1,English,Others,amp,96,36,35,25
2,English,Others,career,60,21,26,13
3,English,Others,contract,109,55,42,12
4,English,Others,germany,88,32,28,28
...,...,...,...,...,...,...,...
481,German,Unknown,wissenschaftler,201,142,44,15
482,German,Unknown,wissenschaftssystem,53,36,13,4
483,German,Unknown,woche,95,47,34,14
484,German,Unknown,zeit,190,119,50,21


In [None]:
# Save the aggregated Approach 2 result and download it from Colab.
df_tweets_with_aspects.to_csv("Approch2_FinalOutputWithoutCategory.csv", index=False)
files.download('Approch2_FinalOutputWithoutCategory.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Test Section

In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag
import numpy as np
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# The result of this lookup is discarded: it is neither assigned nor the cell's last expression.
list(swn.senti_synsets('slow'))
sentence='It was a bad day'

# Tokenize the example sentence and tag it with NLTK's POS tagger.
token = nltk.word_tokenize(sentence)
after_tagging = nltk.pos_tag(token)
print (token)
print (after_tagging)
def penn_to_wn(tag):
    """
    Map a Penn Treebank tag to a WordNet part-of-speech constant.

    The first letter decides: 'J' gives wn.ADJ, 'N' gives wn.NOUN, 'R' gives
    wn.ADV and 'V' gives wn.VERB; any other tag gives None.
    """
    if tag.startswith('J'):
        return wn.ADJ
    if tag.startswith('N'):
        return wn.NOUN
    if tag.startswith('R'):
        return wn.ADV
    if tag.startswith('V'):
        return wn.VERB
    return None
# Accumulate (positive - negative) SentiWordNet scores over the nouns,
# adjectives and adverbs of the tagged sentence.
sentiment = 0.0
tokens_count = 0
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
for word, tag in after_tagging:
    wn_tag = penn_to_wn(tag)
    if wn_tag in (wn.NOUN, wn.ADJ, wn.ADV):
        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if lemma:
            synsets = wn.synsets(lemma, pos=wn_tag)
            if synsets:
                # Take the first sense, the most common
                synset = synsets[0]
                swn_synset = swn.senti_synset(synset.name())
                print(swn_synset)

                sentiment += swn_synset.pos_score() - swn_synset.neg_score()
                tokens_count += 1
print(sentiment)

['It', 'was', 'a', 'bad', 'day']
[('It', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('bad', 'JJ'), ('day', 'NN')]
<bad.a.01: PosScore=0.0 NegScore=0.625>
<day.n.01: PosScore=0.0 NegScore=0.0>
-0.625


In [None]:
# get_en_sentiment takes one row-like mapping (it reads row["lemma"], row["lang"] and row["pos"]);
# passing the word and tag as two positional arguments raises a TypeError.
get_en_sentiment({"lemma": "beautiful", "lang": "English", "pos": "N"})

[nan, nan]