# Filtering topics

## Initialization

In [1]:
import pandas as pd
import numpy as np
import pickle
import tqdm
import plotly.express as px
import os

In [None]:
# Working directory of the notebook: the relative 'Data/...' paths used in the
# cells below are resolved against it after the chdir.
PATH_DIR = './'
os.chdir(PATH_DIR)

In [2]:
# Folder and base name (without extension) of the pickled topic distribution.
PATH = 'Data/bert_out/'
FILE_NAME = 'topic_distr_original'

# NOTE(review): pickle.load can run arbitrary code while unpickling, so this
# file must come from a trusted source.
with open(PATH+FILE_NAME+'.pickle', 'rb') as handle:
    topic_distr = pickle.load(handle)

# Display the loaded object (rendered below as a table with columns 0..101).
topic_distr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,0.024797,0.011705,0.010693,0.008375,0.005628,0.018542,0.020291,0.004845,0.008754,0.008264,...,0.012315,0.006511,0.008253,0.009008,0.011415,0.013037,0.004864,0.015753,0.013542,0.008988
1,0.040935,0.000000,0.040133,0.000000,0.033549,0.029655,0.025441,0.000000,0.025226,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.032615,0.000000,0.024874,0.000000,0.000000,0.000000
2,0.012864,0.021521,0.009310,0.010332,0.009723,0.014192,0.012019,0.010545,0.010255,0.010771,...,0.007341,0.011818,0.010794,0.009102,0.008880,0.006091,0.008485,0.009068,0.007105,0.009315
3,0.011270,0.016598,0.010866,0.000000,0.007408,0.021291,0.000000,0.000000,0.013026,0.021692,...,0.014128,0.010863,0.012475,0.012540,0.000000,0.014832,0.000000,0.016253,0.000000,0.014646
4,0.013652,0.024821,0.008584,0.007621,0.005711,0.013430,0.008990,0.008046,0.007147,0.017659,...,0.009970,0.012323,0.009855,0.006026,0.009519,0.013055,0.000000,0.007105,0.006700,0.011197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92823,0.015265,0.014507,0.006162,0.008193,0.006817,0.015499,0.011959,0.007650,0.011001,0.010540,...,0.008193,0.007293,0.005387,0.010188,0.010505,0.010320,0.010541,0.009675,0.009273,0.008920
92824,0.016893,0.017005,0.009695,0.007517,0.000000,0.024585,0.023215,0.007745,0.000000,0.010414,...,0.008897,0.011413,0.000000,0.012517,0.010195,0.012201,0.024589,0.007866,0.008193,0.000000
92825,0.011575,0.016199,0.007878,0.000000,0.004853,0.017223,0.006641,0.004726,0.006566,0.011706,...,0.008648,0.013275,0.009290,0.008208,0.007408,0.040022,0.005015,0.017245,0.010282,0.010724
92826,0.012377,0.013047,0.011152,0.008696,0.000000,0.016131,0.008041,0.012153,0.000000,0.008125,...,0.007397,0.007312,0.014083,0.000000,0.008264,0.006892,0.010266,0.012969,0.012778,0.016228


## Visualization

In [3]:
def visualize_thresholds(topic_distr, lower_thr=0, upper_thr=1, step=0.001):
  """Plot how many topics each document keeps as the threshold varies.

  For every threshold in ``np.arange(lower_thr, upper_thr, step)`` the values
  of each row of ``topic_distr`` that are ``>= threshold`` are counted, the
  rows are bucketed by that count ('0', '1', '2', '3', '4' or '5+'), and the
  share of rows per bucket (in percent) is drawn as a stacked area chart
  against the threshold.

  Parameters
  ----------
  topic_distr : pandas.DataFrame
      One row per document, one column per topic.
  lower_thr : float, default 0
      First threshold of the scan.
  upper_thr : float, default 1
      End of the scan (excluded, as in ``np.arange``).
  step : float, default 0.001
      Distance between two consecutive thresholds.

  Returns
  -------
  The figure produced by ``px.area``.

  Raises
  ------
  ValueError
      If the requested range contains no threshold at all.
  """
  thresholds = np.arange(lower_thr, upper_thr, step)
  if len(thresholds) == 0:
    raise ValueError(
        'No thresholds to evaluate: lower_thr must be smaller than upper_thr')

  records = []

  # iterating through different threshold levels
  for thr in tqdm.tqdm(thresholds):
    # A boolean sum counts the values at or above the threshold without
    # materialising a masked copy of the whole frame.
    num_topics = (topic_distr >= thr).sum(axis=1)

    group_sizes = num_topics\
        .map(lambda x: str(x) if x < 5 else '5+')\
        .value_counts()

    for group, num_docs in group_sizes.items():
      records.append((thr, group, num_docs))

  stats_long = pd.DataFrame(
      records, columns=['threshold', 'num_topics_group', 'num_docs'])

  # One row per threshold, one column per bucket; buckets that do not occur
  # at a threshold count as 0 documents.
  num_topics_stats_df = stats_long.pivot(index = 'threshold',
                                         values = 'num_docs',
                                         columns = 'num_topics_group').fillna(0)

  # Convert counts to percentages of all documents; the row totals are
  # computed once instead of once per column.
  row_totals = num_topics_stats_df.sum(axis = 1)
  num_topics_stats_df = (100. * num_topics_stats_df).div(row_totals, axis = 0)

  # visualisation
  colormap = px.colors.sequential.YlGnBu
  return px.area(num_topics_stats_df,
        title = 'Distribution of number of topics',
        labels = {'num_topics_group': 'number of topics',
                  'value': 'percentage of tweets'},
        color_discrete_map = {
            '0': colormap[0],
            '1': colormap[3],
            '2': colormap[4],
            '3': colormap[5],
            '4': colormap[6],
            '5+': colormap[7]
        })

In [4]:
# Scan the default threshold range (lower_thr=0, upper_thr=1).
visualize_thresholds(topic_distr)

100%|██████████| 1000/1000 [01:23<00:00, 11.94it/s]


In [5]:
# Zoom in on the thresholds below 0.03.
visualize_thresholds(topic_distr, lower_thr=0, upper_thr=0.03)
# Range noted from this plot: 0.018 - 0.02
# (the cut-off applied further down, threshold = 0.019, lies inside it).

100%|██████████| 30/30 [00:02<00:00, 12.77it/s]


## Topics

In [6]:
# Folder and base name (without extension) of the per-topic spreadsheet.
# NOTE: PATH is rebound here; it previously pointed at 'Data/bert_out/'.
PATH = 'Data/'
PATH_FILE_XLSX = 'topic_stats'

df_topics = pd.read_excel(PATH+PATH_FILE_XLSX+'.xlsx', engine='openpyxl')

In [7]:
# Preview the first rows of the topic table (Topic id, Count, Name_1..Name_3, ...).
df_topics.head()

Unnamed: 0.1,Unnamed: 0,Topic,Count,Name,Name_1,Name_2,Name_3,Representation,Representative_Docs
0,0,-1,58626,-1_like_rt_one_people,,,,"['like', 'rt', 'one', 'people', 'get', 'follow...","['mins rt rt like', 'k hours rt follow', 'k ho..."
1,1,0,5651,0_tag_follow_giveaway_retweet,Contest Participation,,Social Media,"['tag', 'follow', 'giveaway', 'retweet', 'tag ...",['giveaway x og spots enter follow like rt tag...
2,2,1,2987,1_birthday_happy birthday_happy_morning,General Expressions,,General Expressions,"['birthday', 'happy birthday', 'happy', 'morni...","['happy birthday', 'happy birthday', 'happy bi..."
3,3,2,2442,2_song_album_music_spotify,Music,,Entertainment,"['song', 'album', 'music', 'spotify', 'songs',...",['seokjin shazam record holder seokjin represe...
4,4,3,2225,3_trump_biden_president_fbi,US Politics,,Politics and Conflicts,"['trump', 'biden', 'president', 'fbi', 'donald...",['say president trump hes president anymore su...


In [8]:
# Folder and base name (without extension) of the tweet spreadsheet.
# NOTE: PATH and FILE_NAME are rebound here and reused by later cells.
PATH = 'Data/Original dataset 11-2022/'
FILE_NAME = 'filtered_tweets'
df_tweets = pd.read_excel(PATH+FILE_NAME+'.xlsx', engine='openpyxl')

In [9]:
# Preview the first rows of the tweet table.
df_tweets.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,user_id,text,created_at,in_reply_to_status_id,in_reply_to_user_id,quote_count,reply_count,retweet_count,...,entities,type,filtered_text,tokenized_text,stemmed_text,hashtags,emojis,smileys,numbers,urls
0,0,1587232917985512960,1340915697384209920,RT @sergioisbetter: ‼️‼️💚SALE ENDING SOON💚‼️‼️...,Mon Oct 31 23:59:51 +0000 2022,,,0,0,0,...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",RT,,,,[],[],[],[],[]
1,1,1586896938363802112,1113923052084113024,‼️‼️💚SALE ENDING SOON💚‼️‼️\nWatch our latest h...,Mon Oct 31 01:44:47 +0000 2022,,,2,10,787,...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",tweet,sale ending soon watch latest hottest collab n...,$EMOJI$SALE ENDING SOON$EMOJI$Watch our latest...,sale end soon watch latest hottest collab new...,[],"['💚', '💚']",[],[],"['https://t.co/kGpSdd3HUR', 'https://t.co/MPkU..."
2,2,1587232917947929088,768539405472542720,Now playing: WJIC Network ID by The WJIC Netwo...,Mon Oct 31 23:59:51 +0000 2022,,,0,0,0,...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",tweet,playing wjic network id wjic network listen live,Now playing: WJIC Network ID by The WJIC Netwo...,play wjic network id wjic network listen live,[],[],[],[],['https://t.co/Lzezf1IpOn']
3,3,1587232917972999936,1582752440780313088,RT @Kyzef: Good morning mfers ☀️ Hope everybod...,Mon Oct 31 23:59:51 +0000 2022,,,0,0,0,...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",RT,,,,[],[],[],[],[]
4,4,1587109985929945088,1262722155357199872,Good morning mfers ☀️ Hope everybody has a gre...,Mon Oct 31 15:51:22 +0000 2022,,,0,23,8,...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",tweet,good morning mfers hope everybody great halloween,Good morning mfers $EMOJI$ Hope everybody has ...,good morn mfer hope everybodi great halloween,[],['☀'],[],[],[]


In [10]:
def create_topics_dictionary(df_topics):
  """Map every topic id to its list of names, most general name first.

  The list of a topic is built from the columns of its row in this order:
  ``Name_3`` (always), ``Name_2`` (only when present) and ``Name_1`` (only
  when present and not already in the list).

  The first row of ``df_topics`` is skipped; in the topic table loaded by
  this notebook it holds topic -1, which has no names.

  Parameters
  ----------
  df_topics : pandas.DataFrame
      Needs the columns 'Topic', 'Name_1', 'Name_2' and 'Name_3'.

  Returns
  -------
  dict
      ``{topic id: [names]}``. If a topic id occurs more than once the last
      row wins.
  """
  topics_dict = {}

  # Slice by position so the function does not depend on the frame carrying
  # the default 0..n-1 index labels.
  named_rows = df_topics.iloc[1:]
  columns = zip(named_rows['Topic'], named_rows['Name_3'],
                named_rows['Name_2'], named_rows['Name_1'])

  for topic, name_3, name_2, name_1 in columns:
    names = [name_3]

    if pd.notnull(name_2):
      names.append(name_2)

    # A missing Name_1 must not end up in the list as a NaN entry.
    if pd.notnull(name_1) and name_1 not in names:
      names.append(name_1)

    topics_dict[topic] = names

  return topics_dict

def get_topic_scores(df, topic_list, idx):
  """Collect the scores of the given topics for one row of ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose column labels are the topic ids.
  topic_list : iterable
      Column labels (topic ids) to read.
  idx : label
      Index label of the row to read from.

  Returns
  -------
  dict
      ``{'Topics': [...], 'Scores': [...]}`` with both lists in the order of
      ``topic_list``.
  """
  topics = list(topic_list)
  if not topics:
    return {'Topics': [], 'Scores': []}

  # Fetch the row once instead of rebuilding it for every topic.
  row = df.loc[idx]
  scores = [row[topic] for topic in topics]

  return {'Topics': topics, 'Scores': scores}

def get_topic_names(df, topic_names, topic_list, idx, specific_topics=False):
  """Sum the scores of one row of ``df`` per topic name.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose column labels are the topic ids.
  topic_names : dict
      ``{topic id: [names]}``; the first name of every list is treated as the
      general one, all following names as the specific ones.
  topic_list : iterable
      Column labels (topic ids) to read.
  idx : label
      Index label of the row to read from.
  specific_topics : bool, default False
      ``False`` uses only the first name of each topic; ``True`` uses every
      name after the first (a topic with a single name then contributes
      nothing).

  Returns
  -------
  dict
      ``{'Topics': [...], 'Scores': [...]}`` as plain lists, in order of first
      appearance of each name. Topics sharing a name have their scores added.
  """
  topics = list(topic_list)
  scores_by_name = {}

  if topics:
    # Fetch the row once instead of once per topic name.
    row = df.loc[idx]

    for topic in topics:
      names = topic_names[topic]
      selected = names[1:] if specific_topics else [names[0]]
      score = row[topic]

      for name in selected:
        if name in scores_by_name:
          scores_by_name[name] += score
        else:
          scores_by_name[name] = score

  # Return real lists (like get_topic_scores does) rather than live dict
  # views, which do not serialise cleanly.
  return {'Topics': list(scores_by_name.keys()),
          'Scores': list(scores_by_name.values())}

def get_relevant_topics(df, threshold, topic_names):
  """Build the three topic summaries for every row of ``df``.

  For each row the columns whose value is ``>= threshold`` are selected and
  passed to ``get_topic_scores`` and to ``get_topic_names`` (once for the
  general names, once with ``specific_topics=True``).

  Parameters
  ----------
  df : pandas.DataFrame
      Topic distribution, one row per document and one column per topic id.
  threshold : float
      Minimum value for a topic to count as relevant for a row.
  topic_names : dict
      ``{topic id: [names]}`` lookup handed on to ``get_topic_names``.

  Returns
  -------
  pandas.DataFrame
      Same index as ``df`` with the columns 'topics_list',
      'topics_list_general' and 'topics_list_specific', each cell holding a
      ``{'Topics': ..., 'Scores': ...}`` dictionary.
  """
  scores = []
  general = []
  specific = []

  # Single pass: the relevant topics of a row are determined once and shared
  # by all three summaries.
  for idx, row in df.iterrows():
    relevant = row[row >= threshold].index.tolist()
    scores.append(get_topic_scores(df, relevant, idx))
    general.append(get_topic_names(df, topic_names, relevant, idx))
    specific.append(get_topic_names(df, topic_names, relevant, idx, specific_topics=True))

  return pd.DataFrame({'topics_list': scores,
                       'topics_list_general': general,
                       'topics_list_specific': specific},
                      index=df.index)

def combine_datasets(df_tweets, df_topic_distr):
  """Attach the topic columns to a copy of the tweet table.

  The rows of ``df_topic_distr`` are matched by position to the rows of
  ``df_tweets`` whose 'filtered_text' is not missing; all other rows get no
  value in the topic columns. ``df_tweets`` itself is not modified.

  Parameters
  ----------
  df_tweets : pandas.DataFrame
      Needs a 'filtered_text' column.
  df_topic_distr : pandas.DataFrame
      Needs the columns 'topics_list', 'topics_list_general' and
      'topics_list_specific', with one row per tweet that has filtered text.

  Returns
  -------
  pandas.DataFrame
      Deep copy of ``df_tweets`` with the three topic columns added.

  Raises
  ------
  ValueError
      If the number of tweets with filtered text differs from the number of
      rows in ``df_topic_distr``.
  """
  topic_columns = ['topics_list', 'topics_list_general', 'topics_list_specific']

  df_final = df_tweets.copy(deep=True)
  has_text = pd.notna(df_final['filtered_text'])

  # The assignment below is purely positional, so a size mismatch means the
  # two tables do not belong together; fail early with a clear message.
  num_with_text = int(has_text.sum())
  if num_with_text != len(df_topic_distr):
    raise ValueError(
        'Cannot combine datasets: %d tweets have filtered text but %d topic rows were given'
        % (num_with_text, len(df_topic_distr)))

  for column in topic_columns:
    df_final.loc[has_text, column] = df_topic_distr[column].values

  return df_final

In [11]:
# Build the topic id -> names lookup from the topic table and display it.
topics_dict = create_topics_dictionary(df_topics)
topics_dict

{0: ['Social Media', 'Contest Participation'],
 1: ['General Expressions'],
 2: ['Entertainment', 'Music'],
 3: ['Politics and Conflicts', 'US Politics'],
 4: ['Politics and Conflicts', 'Ukraine Conflict'],
 5: ['General Expressions'],
 6: ['Finance', 'Cryptocurrency'],
 7: ['Religion', 'Hinduism/Islam'],
 8: ['Sports', 'American Football/Basketball'],
 9: ['Food'],
 10: ['Sports', 'Football'],
 11: ['Social Interactions and Relationships', 'Family'],
 12: ['Education'],
 13: ['Art and Creativity', 'Fashion'],
 14: ['Religion', 'Christianity'],
 15: ['Art and Creativity', 'Art'],
 16: ['Transportation'],
 17: ['Politics and Conflicts', 'Tigray Conflict'],
 18: ['Other', 'Names'],
 19: ['Entertainment', 'Literature'],
 20: ['Entertainment', 'Music', 'KPop'],
 21: ['Health'],
 22: ['General Expressions', 'Positive Expressions'],
 23: ['Politics and Conflicts', 'UK Politics'],
 24: ['Emotions', 'Love and Happiness'],
 25: ['Other', 'Adult Content'],
 26: ['Social Media', 'Social Media Eng

In [None]:
# Minimum score for a topic to count as relevant for a tweet.
threshold = 0.019

# Give every topic-distribution row the index label of the tweet it belongs to
# (the tweets that have filtered text, in order). set_index returns a new frame,
# so the result has to be kept; it also fails right here if the row counts differ.
tweets_with_text_index = df_tweets.loc[pd.notna(df_tweets['filtered_text'])].index
topic_distr_indexed = topic_distr.set_index(tweets_with_text_index)

df_topic_distr = get_relevant_topics(topic_distr_indexed, threshold=threshold, topic_names=topics_dict)
df_final = combine_datasets(df_tweets, df_topic_distr)

In [15]:
# Add original tweet's topics to every retweet
for idx,row in df_final.iterrows():
    if (row['type'] == 'RT'):
        df_final.loc[idx, ['topics_list', 'topics_list_general', 'topics_list_specific']] = df_final.loc[idx+1, ['topics_list', 'topics_list_general', 'topics_list_specific']]

In [16]:
# Display the combined table (pandas truncates the rendering of large frames).
df_final

Unnamed: 0.1,Unnamed: 0,tweet_id,user_id,text,created_at,in_reply_to_status_id,in_reply_to_user_id,quote_count,reply_count,retweet_count,...,tokenized_text,stemmed_text,hashtags,emojis,smileys,numbers,urls,topics_list,topics_list_general,topics_list_specific
0,0,1587232917985512960,1340915697384209920,RT @sergioisbetter: ‼️‼️💚SALE ENDING SOON💚‼️‼️...,Mon Oct 31 23:59:51 +0000 2022,,,0,0,0,...,,,[],[],[],[],[],"{'Topics': [0, 6, 22, 28, 33], 'Scores': [0.02...","{'Topics': ('Social Media', 'Finance', 'Genera...","{'Topics': ('Contest Participation', 'Cryptocu..."
1,1,1586896938363802112,1113923052084113024,‼️‼️💚SALE ENDING SOON💚‼️‼️\nWatch our latest h...,Mon Oct 31 01:44:47 +0000 2022,,,2,10,787,...,$EMOJI$SALE ENDING SOON$EMOJI$Watch our latest...,sale end soon watch latest hottest collab new...,[],"['💚', '💚']",[],[],"['https://t.co/kGpSdd3HUR', 'https://t.co/MPkU...","{'Topics': [0, 6, 22, 28, 33], 'Scores': [0.02...","{'Topics': ('Social Media', 'Finance', 'Genera...","{'Topics': ('Contest Participation', 'Cryptocu..."
2,2,1587232917947929088,768539405472542720,Now playing: WJIC Network ID by The WJIC Netwo...,Mon Oct 31 23:59:51 +0000 2022,,,0,0,0,...,Now playing: WJIC Network ID by The WJIC Netwo...,play wjic network id wjic network listen live,[],[],[],[],['https://t.co/Lzezf1IpOn'],"{'Topics': [0, 2, 4, 5, 6, 8, 10, 15, 16, 18, ...","{'Topics': ('Social Media', 'Entertainment', '...","{'Topics': ('Contest Participation', 'Music', ..."
3,3,1587232917972999936,1582752440780313088,RT @Kyzef: Good morning mfers ☀️ Hope everybod...,Mon Oct 31 23:59:51 +0000 2022,,,0,0,0,...,,,[],[],[],[],[],"{'Topics': [1, 71, 72], 'Scores': [0.021520933...","{'Topics': ('General Expressions'), 'Scores': ...","{'Topics': ('Hope and Expectation'), 'Scores':..."
4,4,1587109985929945088,1262722155357199872,Good morning mfers ☀️ Hope everybody has a gre...,Mon Oct 31 15:51:22 +0000 2022,,,0,23,8,...,Good morning mfers $EMOJI$ Hope everybody has ...,good morn mfer hope everybodi great halloween,[],['☀'],[],[],[],"{'Topics': [1, 71, 72], 'Scores': [0.021520933...","{'Topics': ('General Expressions'), 'Scores': ...","{'Topics': ('Hope and Expectation'), 'Scores':..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153619,153619,1587006429537313024,1193921600799665920,she's so cute 😭 https://t.co/dwTqW8NVY2,Mon Oct 31 08:59:52 +0000 2022,,,7,1,93,...,she's so cute $EMOJI$ $URL$,cute,[],['😭'],[],[],['https://t.co/dwTqW8NVY2'],"{'Topics': [22, 30, 67, 97], 'Scores': [0.0204...","{'Topics': ('General Expressions', 'Physical A...","{'Topics': ('Positive Expressions', 'Gratitude..."
153620,153620,1587263620265742080,1484210607872167936,RT @charts_k: [Exclusive] K-media reports @BTS...,Tue Nov 01 02:01:51 +0000 2022,,,0,0,0,...,,,[],[],[],[],[],"{'Topics': [49, 75, 76, 80, 91], 'Scores': [0....","{'Topics': ('Sports', 'Other', 'General Expres...","{'Topics': ('Fighting and Combat Sports', 'Sho..."
153621,153621,1587263620278234880,1418611699863621888,@QuoteSunTzu My shield is proud for my blade l...,Tue Nov 01 02:01:51 +0000 2022,1.587248e+18,1.434815e+18,0,0,0,...,$MENTION$ My shield is proud for my blade lies...,shield proud blade lie hidden,[],[],[],[],[],"{'Topics': [49, 75, 76, 80, 91], 'Scores': [0....","{'Topics': ('Sports', 'Other', 'General Expres...","{'Topics': ('Fighting and Combat Sports', 'Sho..."
153622,153622,1587263620265778944,962369786301833216,"RT @coyodee__: So is it mombod Monday, milf or...",Tue Nov 01 02:01:51 +0000 2022,,,0,0,0,...,,,[],[],[],[],[],"{'Topics': [], 'Scores': []}","{'Topics': (), 'Scores': ()}","{'Topics': (), 'Scores': ()}"


In [17]:
# Folder and base name (without extension) of the exported spreadsheet.
PATH = 'Data/Original dataset 11-2022/'
FILE_NAME = 'tweets_with_topics'

# to_excel also writes the index as an extra column by default.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the loaded tables
# look like leftovers of such exports - consider index=False if the index is not needed.
df_final.to_excel(PATH+FILE_NAME+'.xlsx')