In [1]:
import re
import pandas as pd
from transformers import AutoTokenizer,AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import json
import copy
import os

In [2]:
def load_text(path=r'data\all.jsonl'):
    """Collect the ``answer_zh`` field of every record in a JSON Lines file.

    Args:
        path: Location of the JSONL file. Every non-blank line must hold one
            JSON object with an ``answer_zh`` key.

    Returns:
        list: The ``answer_zh`` values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``answer_zh`` key.
    """
    text_list = []
    with open(path, 'r', encoding='utf-8') as infile:
        for raw_line in infile:
            record_text = raw_line.strip()
            # Blank lines (for instance a trailing empty line at the end of
            # the file) hold no record; json.loads('') would raise on them.
            if not record_text:
                continue
            text_list.append(json.loads(record_text)['answer_zh'])
    print("data loaded")
    return text_list

In [3]:
def extract_text_emoji(text_list):
    """Split texts into (preceding text, emoji run) pairs and save them as JSONL.

    Each text is cut into blank-line-separated paragraphs. Inside a paragraph
    every run of one or more emoji characters is paired with the text that
    precedes it; because the pattern is compiled without DOTALL, that text
    never spans a line break. Text that is not followed by an emoji is not
    reported.

    Args:
        text_list: Iterable of strings to scan (its length is used for the
            progress bar).

    Returns:
        list[dict]: One dict per match with the keys ``emoji`` (the emoji
        run), ``txt`` (the text before it, possibly empty) and ``full_txt``
        (both concatenated).

    Side effects:
        Writes all matches, one JSON object per line, to
        ``extract_text_emoji_result.jsonl`` inside the ``data`` directory,
        creating the directory when it does not exist.
    """
    # Code-point ranges treated as emoji; kept identical to the original set.
    # NOTE(review): variation selectors and ZWJ are not part of the class, so
    # composed emoji sequences are split at those characters - confirm intended.
    emoji_class = (
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002700-\U000027BF"
        "\U0001F900-\U0001F9FF"
        "\U00002600-\U000026FF"
        "]"
    )
    # Group 1: lazily matched leading text; group 2: one or more emoji.
    pattern = re.compile(r'(.*?)(' + emoji_class + r'+)')

    results = []
    for text in tqdm(text_list, total=len(text_list), desc='txt/emoji parsing...'):
        for para in text.split('\n\n'):
            for txt, emo in pattern.findall(para):
                results.append({
                    "emoji": emo,
                    "txt": txt,
                    "full_txt": txt + emo
                })

    # Build the path with os.path.join so the file lands inside data/ on every
    # platform; a literal backslash is just a filename character on POSIX.
    os.makedirs('data', exist_ok=True)
    out_path = os.path.join('data', 'extract_text_emoji_result.jsonl')
    with open(out_path, 'w', encoding='utf-8') as outfile:
        for result in results:
            outfile.write(json.dumps(result, ensure_ascii=False) + '\n')
    print('txt/emoji parsing result saved as data/extract_text_emoji_result.jsonl')
    return results

In [4]:
def txt_sentiment_tagging(model_name,results):
    """Label the ``txt`` of every record with a sequence-classification model.

    Args:
        model_name: Identifier handed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForSequenceClassification.from_pretrained``.
        results: List of dicts that each contain a ``txt`` key. The list is
            deep-copied, so the caller's records are not modified.

    Returns:
        list[dict]: Copies of the input records with an added ``txt_senti``
        key holding the ``id2label`` name of the highest-scoring class.

    Side effects:
        Writes the tagged records, one JSON object per line, to
        ``txt_sentiment_tagging_result.jsonl`` inside the ``data`` directory,
        creating the directory when it does not exist.
    """
    dic_results = copy.deepcopy(results)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    id2label = model.config.id2label

    for dic_result in tqdm(dic_results, total=len(dic_results), desc='txt sentiment tagging...'):
        inputs = tokenizer(dic_result['txt'], return_tensors='pt', padding=True, truncation=True)
        # The input tensors must live on the same device as the model.
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.inference_mode():
            logits = model(**inputs).logits
        # One text per forward pass, so argmax yields a single class index.
        dic_result['txt_senti'] = id2label[logits.argmax(dim=-1).item()]

    # Build the path with os.path.join so the file lands inside data/ on every
    # platform; a literal backslash is just a filename character on POSIX.
    os.makedirs('data', exist_ok=True)
    out_path = os.path.join('data', 'txt_sentiment_tagging_result.jsonl')
    with open(out_path, 'w', encoding='utf-8') as outfile:
        for dic_result in dic_results:
            outfile.write(json.dumps(dic_result, ensure_ascii=False) + '\n')
    print('txt sentiment tagging result saved as data/txt_sentiment_tagging_result.jsonl')
    return dic_results

In [26]:
def is_conflict(emoji_senti,txt_senti):
    """Tell whether an emoji sentiment and a text sentiment point in opposite directions.

    Args:
        emoji_senti: Sentiment label of the emoji.
        txt_senti: Sentiment label of the text.

    Returns:
        bool: True only when one label is 'positive' and the other is
        'negative'; any other combination gives False.
    """
    opposing_pairs = (
        ('positive', 'negative'),
        ('negative', 'positive'),
    )
    return (emoji_senti, txt_senti) in opposing_pairs

In [27]:
def emoji_sentiment_tagging(dic_results,lexicon_path=r'data\train-00000-of-00001-5a3ca3bbb3cb2c22.parquet'):
    """Attach lexicon name/sentiment to each emoji and flag sentiment conflicts.

    Args:
        dic_results: List of dicts with at least the keys ``emoji`` and
            ``txt_senti``.
        lexicon_path: Parquet file with the columns ``Emoji``, ``label`` and
            ``Unicode_Name``.

    Returns:
        pandas.DataFrame: ``dic_results`` as a frame with three added columns:
        ``emoji_name`` and ``emoji_senti`` (NaN for emoji absent from the
        lexicon) and ``conflict`` (result of ``is_conflict`` per row).
    """
    def _blank(value):
        # None/NaN or an empty string: the lexicon cell carries no usable value.
        return pd.isna(value) or value == ''

    lexicon_df = pd.read_parquet(lexicon_path)
    emoji_senti_dict = {}
    emoji_name_dict = {}
    lexicon_rows = zip(lexicon_df['Emoji'], lexicon_df['label'], lexicon_df['Unicode_Name'])
    for emoji, label, name in lexicon_rows:
        # Test for missing values explicitly. A plain truthiness test treats
        # label 0 as "missing" (dropping that whole class) while letting NaN
        # through, because NaN is truthy.
        if _blank(emoji) or pd.isna(label) or _blank(name):
            continue
        # NOTE(review): 2 = positive, 1 = neutral, anything else = negative is
        # the mapping used by the original code; the lexicon schema is not
        # visible here - confirm that 0 is the negative class.
        if label == 2:
            emoji_senti_dict[emoji] = 'positive'
        elif label == 1:
            emoji_senti_dict[emoji] = 'neutral'
        else:
            emoji_senti_dict[emoji] = 'negative'
        emoji_name_dict[emoji] = name

    mydf = pd.DataFrame(dic_results)
    mydf["emoji_name"] = mydf["emoji"].map(emoji_name_dict)
    mydf["emoji_senti"] = mydf["emoji"].map(emoji_senti_dict)
    print('emoji sentiment tagged')
    mydf["conflict"] = mydf.apply(lambda row: is_conflict(row['emoji_senti'], row['txt_senti']), axis=1)
    print('sentiment conflict analyzed')
    return mydf

In [None]:
def emoji_counts(df_tagged):
    """Add per-emoji occurrence counts, clean the table and save it as CSV.

    Args:
        df_tagged: DataFrame with the columns ``emoji``, ``emoji_name``,
            ``emoji_senti``, ``txt_senti``, ``conflict``, ``txt`` and
            ``full_txt``. It is copied, not modified.

    Returns:
        pandas.DataFrame: Rows sorted by ``emoji_count`` (descending), limited
        to the report columns, without rows that contain missing values or
        whose ``txt`` is empty after stripping whitespace. ``emoji_count`` is
        computed before that filtering, so it counts every occurrence of the
        emoji in ``df_tagged``.

    Side effects:
        Writes the returned frame to ``result_final.csv`` inside the
        ``result`` directory, creating the directory when it does not exist.
    """
    mydf = df_tagged.copy()
    mydf["emoji_count"] = mydf["emoji"].map(mydf["emoji"].value_counts())
    mydf_sorted = mydf.sort_values(by="emoji_count", ascending=False)
    print('emoji count finished')

    report_columns = ['emoji', 'emoji_name', 'emoji_count', 'emoji_senti',
                      'txt_senti', 'conflict', 'txt', 'full_txt']
    mydf_cleaned = mydf_sorted[report_columns].dropna()
    mydf_cleaned2 = mydf_cleaned[mydf_cleaned['txt'].str.strip() != '']

    # Join the path portably: with a hard-coded backslash the file would be
    # written next to (not inside) the directory created here on POSIX systems.
    os.makedirs('result', exist_ok=True)
    out_path = os.path.join('result', 'result_final.csv')
    mydf_cleaned2.to_csv(out_path, index=False, encoding='utf-8-sig')
    print('result_final saved: result/result_final.csv')
    return mydf_cleaned2

In [None]:
def generate_report(mydf_final,path=r'result\report_final.csv'):
    """Aggregate the final table per emoji and write the summary as CSV.

    For every emoji the report holds its name, lexicon sentiment, occurrence
    count (first ``emoji_count`` value of the group), the mean of ``conflict``
    and ``use_percentage``, the emoji's share of all reported occurrences.
    Rows are sorted by count, then name, both descending.

    Args:
        mydf_final: DataFrame with the columns ``emoji``, ``emoji_name``,
            ``emoji_senti``, ``emoji_count`` and ``conflict``.
        path: Destination CSV file; missing parent directories are created.

    Returns:
        None
    """
    summary_df = mydf_final.groupby('emoji').agg(
        name=('emoji_name', 'first'),
        senti=('emoji_senti', 'first'),
        count=('emoji_count', 'first'),
        conflict_rate=('conflict', 'mean'),
    ).reset_index()

    # Use the sum of the reported counts as denominator. The row count of
    # mydf_final can be smaller than that sum (emoji_count may include rows
    # filtered out earlier), which would make the shares add up to more than 1.
    total_count = summary_df['count'].sum()
    summary_df['use_percentage'] = summary_df['count'] / total_count
    summary_df = summary_df.sort_values(by=['count', 'name'], ascending=False)

    # A bare filename has no parent directory to create.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    summary_df.to_csv(path, index=False, encoding='utf-8-sig')
    # Report the path actually written instead of a fixed location.
    print(f'report final saved: {path}')

In [8]:
# Load the answer_zh text of every record in the corpus file.
text_list=load_text(path=r'data\all.jsonl')

data loaded


In [9]:
# Split each text into (preceding text, emoji run) pairs; the pairs are also written to a JSONL file.
results=extract_text_emoji(text_list)

txt/emoji parsing...: 100%|██████████| 2449/2449 [00:00<00:00, 4635.42it/s]


txt/emoji parsing result saved as data/extract_text_emoji_result.jsonl


In [10]:
# Attach a model-predicted sentiment label (txt_senti) to every extracted text span.
dic_results=txt_sentiment_tagging(model_name="clapAI/roberta-large-multilingual-sentiment",results=results)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
txt sentiment tagging...: 100%|██████████| 16550/16550 [02:31<00:00, 109.21it/s]


txt sentiment tagging result saved as data/txt_sentiment_tagging_result.jsonl


In [28]:
# Look up each emoji's name and sentiment in the lexicon and flag positive/negative mismatches with the text sentiment.
df_tagged=emoji_sentiment_tagging(dic_results=dic_results,lexicon_path=r'data\train-00000-of-00001-5a3ca3bbb3cb2c22.parquet')

emoji sentiment tagged
sentiment conflict analyzed


In [29]:
# Add per-emoji occurrence counts, drop incomplete or blank-text rows, and save the table as CSV.
mydf_final=emoji_counts(df_tagged)

emoji count finished


In [None]:
# Aggregate the final table per emoji and write the summary report to CSV.
generate_report(mydf_final,path=r'result\report_final.csv')