Importing libraries

In [1]:
# Data management libraries
import pandas as pd
import numpy as np
import re

# Tweet extraction and processing
import tweepy
import emoji

# Natural language Processing
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

Importing variables

In [2]:
# Paths
PATH_BEARER_TOKEN = 'C:\\Users\\gl\\Desktop\\Big Data\\TFM\\Docs\\bearer_token.txt'
PATH_TTRPG_EXTRACTION_SPECIFICATIONS = 'C:\\Users\\gl\\Desktop\\Big Data\\TFM\\Docs\\ttrpg_extraction_specifications.xlsx'
PATH_RESULTING_JSON = 'C:\\Users\\gl\\Desktop\\Big Data\\TFM\\Docs\\tweets_with_sentiment.json'

# Tables
TTRPG_EXTRACTION_SPECIFICATIONS = pd.read_excel(PATH_TTRPG_EXTRACTION_SPECIFICATIONS)

# Tweet extraction specifications
CONTEXT_INCLUDE = ' (context:66.872578743771963392 OR context:67.1148209264256663552)' # tt gaming, ttrpg games
QUERY_SPECIFICATIONS = ' -is:retweet lang:en' + CONTEXT_INCLUDE

Extraction functions definition

In [3]:
def tweet_list_to_DataFrame(tweets_found, TTRPG_info):
    if (tweets_found.meta['result_count'] == 0):
        return None
    public_metric_names = ['retweet_count', 'like_count', 'reply_count', 'quote_count']
    tweets = pd.DataFrame()

    for tweet in tweets_found.data:
        tweet_info = {'id': str(tweet.id), 'text': tweet.text,
              'rpg_system': TTRPG_info['rpg_system'],
              'setting': TTRPG_info['setting']}
        
        for metric in public_metric_names:
            tweet_info[metric] = tweet.public_metrics[metric]

        tweets = tweets.append(tweet_info, ignore_index=True)
    
    return tweets

Tweet processing functions definition

In [4]:
def clean_tweet_text(tweet_text):
    tweet_text = re.sub(r"(?:\@|https?\://)\S+", 'mention_or_link',  tweet_text)
    tweet_text = re.sub("\n", ".",  tweet_text)

    banned_words = ['alien', 'basic fantasy role', 'black agents',
                   'alice is missing', 'new generation', 'scum and villainy|scum &amp villainy',
                   'super heroes', 'unknown armies', 'in the dark|dark heresy|dark ages',
                   'games|game', 'delta green']

    for wrd in banned_words:
            tweet_text = re.sub(wrd, 'system_word', tweet_text, flags=re.I)
    
    return tweet_text

Natural language functions definition

In [5]:
def get_sentiment_analysis(tweets, natural_language_processor):
    text_polarity = list()
    text_subjectivity = list()
    text_adjectives = list()

    for tweet_text in list(tweets.loc[:, 'text_cleaned']):
        text_sentiment = natural_language_processor(tweet_text)

        text_polarity.append(text_sentiment._.blob.polarity)
        text_subjectivity.append(text_sentiment._.blob.subjectivity)
        
        adjectives_list = get_sentence_adjectives(text_sentiment)
        if adjectives_list:
            adjectives_list = filter_adjectives(adjectives_list)
        text_adjectives.append(adjectives_list)

    tweets['polarity'] = text_polarity
    tweets['subjectivity'] = text_subjectivity
    tweets['adjectives'] = text_adjectives   
    
    return tweets

def get_sentence_adjectives(text_sentiment):
    result = list()
    
    for token in text_sentiment:
        if(token.pos_ == 'ADJ'):
            result.append(token.lemma_)
    
    result = [x.lower() for x in result]
    return list(np.unique(result))

In [6]:
def filter_adjectives(adjectives_list):
    not_words = ['#', '*', '+', '-', 'd&amp;d.', '¿', '„']
    not_adjectives = ['#ad', "i've", "a’r", "c'", 'co3', 'e', 'n1', 'uma', 'umut', 'unas', 'und', 'usó']
    media_words = ['dnd', 'd100', 'ttrpg', 'ttrpgs', 'vtt']

    filtered_adjectives = list()

    for adjective in adjectives_list:
        if(adjective in not_words):
            continue
        if(adjective[0].isnumeric()):
            continue
        if(adjective in not_adjectives):
            continue
        if(adjective in media_words):
            continue
        if(is_emoji(adjective)):
            continue
        filtered_adjectives.append(adjective)
    
    return filtered_adjectives

def is_emoji(s):
    n = emoji.emoji_count(s)
    return n != 0

### Main code

Tweet extraction

In [7]:
bearer_token = open(PATH_BEARER_TOKEN).readline()
tweepy_client = tweepy.Client(bearer_token=bearer_token)

In [8]:
tweets = pd.DataFrame()

for TTRPG_info in TTRPG_EXTRACTION_SPECIFICATIONS.iloc:
    this_query = TTRPG_info['query'] + QUERY_SPECIFICATIONS
    tweets_found = tweepy_client.search_recent_tweets(query=this_query,
                                                      max_results=100,
                                                      tweet_fields='public_metrics')
    
    tweets = tweets.append(tweet_list_to_DataFrame(tweets_found, TTRPG_info))

tweets.reset_index(drop=True, inplace=True)
tweets.head()

Unnamed: 0,id,like_count,quote_count,reply_count,retweet_count,rpg_system,setting,text
0,1607675836697825280,0.0,0.0,0.0,98.0,Call of Cthulhu,Lovecraft horror,RT @N_Kailow: CoC「口渇ルルパ」🍴🍷\n\n欲しい 足りない 貴方が欲しい...
1,1607675835582124033,0.0,0.0,0.0,1.0,Call of Cthulhu,Lovecraft horror,RT @Michelle_Rabi53: CoCシナリオ \n“レイニービタースイートルーム...
2,1607675835028496387,0.0,0.0,0.0,867.0,Call of Cthulhu,Lovecraft horror,RT @sori_TRPG: 🌸CoC6版 ２PLシナリオ「刀狂双乱」🌸\n「あなたたちは刀...
3,1607675805039198210,0.0,0.0,0.0,0.0,Call of Cthulhu,Lovecraft horror,CoC｢ようこそ！冥迷市役所都市伝説課へ！｣\n来年1月に回ります！最高に素敵な合わせ立ち絵...
4,1607675793949487104,0.0,0.0,0.0,0.0,Call of Cthulhu,Lovecraft horror,Coc『 山羊の歌は謡えない 』\n第3.5話 開幕の時はきたり\n\nお久しぶりのやぎうた...


Tweet processing

In [9]:
# Filter Retweets   ERASE IF USED WITH FULL QUERY
print('Number of tweets: ' + str(tweets.shape[0]))
tweets = tweets.loc[~tweets.text.str.startswith('RT ')]
print('Number of tweets: ' + str(tweets.shape[0]))

Number of tweets: 4293
Number of tweets: 2908


In [10]:
text_cleaned = [clean_tweet_text(tweet_text) for tweet_text in tweets['text']]
tweets['text_cleaned'] = text_cleaned
tweets.head()

Unnamed: 0,id,like_count,quote_count,reply_count,retweet_count,rpg_system,setting,text,text_cleaned
3,1607675805039198210,0.0,0.0,0.0,0.0,Call of Cthulhu,Lovecraft horror,CoC｢ようこそ！冥迷市役所都市伝説課へ！｣\n来年1月に回ります！最高に素敵な合わせ立ち絵...,CoC｢ようこそ！冥迷市役所都市伝説課へ！｣.来年1月に回ります！最高に素敵な合わせ立ち絵は...
4,1607675793949487104,0.0,0.0,0.0,0.0,Call of Cthulhu,Lovecraft horror,Coc『 山羊の歌は謡えない 』\n第3.5話 開幕の時はきたり\n\nお久しぶりのやぎうた...,Coc『 山羊の歌は謡えない 』.第3.5話 開幕の時はきたり..お久しぶりのやぎうた！！！...
5,1607675793903345664,0.0,0.0,0.0,0.0,Call of Cthulhu,Lovecraft horror,@coc_kazuo そして落とし合いに巻き込まれていくと,mention_or_link そして落とし合いに巻き込まれていくと
8,1607675694213115904,1.0,0.0,0.0,1.0,Call of Cthulhu,Lovecraft horror,CoCシナリオ \n“レイニービタースイートルーム”\n\nKPくぼはるさん(大智 心音)\...,CoCシナリオ .“レイニービタースイートルーム”..KPくぼはるさん(大智 心音)..PL...
9,1607675690551496704,0.0,0.0,1.0,0.0,Call of Cthulhu,Lovecraft horror,웃긴건 한 사람만 남고 (이분은 coc 외 티알 자체를 접으심)\n둘은 깨짐 사실 ...,웃긴건 한 사람만 남고 (이분은 coc 외 티알 자체를 접으심).둘은 깨짐 사실 하...


Natural language processing

In [11]:
natural_language_processor = spacy.load('en_core_web_sm')
natural_language_processor.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x1f508fb5490>

In [12]:
tweets = get_sentiment_analysis(tweets, natural_language_processor)

Export

In [13]:
tweets.to_json(PATH_RESULTING_JSON, orient='index', double_precision=2)