In [1]:
import pandas as pd
import numpy as np
import re
import emoji

In [5]:
def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

URL = re.compile('((([A-Za-z]{3,9}:(?:\/\/)?)(?:[\-;:&=\+\$,\w]+@)?[A-Za-z0-9\.\-]+|(?:www\.|[\-;:&=\+\$,\w]+@)[A-Za-z0-9\.\-]+)((?:\/[\+~%\/\.\w\-_]*)?\??(?:[\-\+=&;%@\.\w_]*)#?(?:[\.\!\/\\\w]*))?)')

def preprocess_word(word):
    # Remove punctuation
    word = word.strip('\'"?!,.():;')
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    word = re.sub(r'(.)\1+', r'\1\1', word)
    # Remove - & '
    word = re.sub(r'(-|\')', '', word)
    return word

def pre_process(tweet, keep_hashtag = False, keep_special_symbols = False, lower_case = False):

# Replaces URLs with the word URL
    tweet = re.sub(URL, '', tweet)
# Replace @handle with the word USER_MENTION
#     tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    tweet = re.sub(r'@[\S]+', '', tweet)
# Replaces #hashtag with hashtag
    if keep_hashtag:
        tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    else:
        tweet = re.sub(r'#(\S+)', '', tweet)
# Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
# Remove emoji with the word EMOJI
#     tweet = re.sub(EMOJIS, '', tweet)
    tweet = remove_emoji(tweet)
# Add spacs into camel case sentences
    tweet = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r'\1', tweet))
# Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
# Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')
# Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)
# # Convert to lower case
    if lower_case:
        tweet = tweet.lower()
        
        
    if keep_special_symbols is False: 
        words = tweet.split()
        processed_tweet = []
        for word in words:
            word = preprocess_word(word)
            processed_tweet.append(word)

        return ' '.join(processed_tweet)
    else:
        return tweet

In [3]:
df_news_tweets = pd.read_pickle('news_tweets_english.pkl')

In [7]:
clean_text = []  # text for summarization
processed_text = [] # text for embedding and clustering
processed_text_lower_case = []
for index, row in df_news_tweets.iterrows():
    clean_text.append(pre_process(row['tweet_text'], keep_hashtag = True, keep_special_symbols = True))
    processed_text.append(pre_process(row['tweet_text']))
    processed_text_lower_case.append(pre_process(row['tweet_text'], lower_case = True))

df_news_tweets['clean_text'] = clean_text
df_news_tweets['processed_text'] = processed_text
df_news_tweets['processed_text_lower_case'] = processed_text_lower_case

### Obtain news summary

The dataset has two candidate for news summary: claims and statements. Based on observation, claim is more suitable than statememts, as most news pieces have empty statements, and those that are not empty are shorter than claims.

In [10]:
news_summary = []
for index, row in df_news_tweets.iterrows():
    if pd.isna(row['claim']) :
        news_summary.append(row['statement'])
    else:
        claim = re.sub(r'\s+', ' ', row['claim'])
        if claim == ' ':
            news_summary.append(row['statement'])
        else:
            news_summary.append(row['claim'])

In [11]:
df_news_tweets['news_summary'] = news_summary

### Remove less revelant tweets based on bertscore

In [8]:
from datasets import load_metric
# import bert_score
metric = load_metric("bertscore")

In [12]:
results = metric.compute(predictions=df_news_tweets['processed_text'].tolist(), references=df_news_tweets['news_summary'].tolist(), lang='en', rescale_with_baseline=True)

In [13]:
df_news_tweets['bertscore'] = results['f1']

In [21]:
df_news_tweets.to_pickle('news_tweets_preprocessed_en.pkl')

In [14]:
df_filtered = df_news_tweets[df_news_tweets['bertscore']>=0.1] # relevence threshold is 0.1

In [15]:
len(df_filtered)

28947

### Remove short tweets (shorter than 5 words)

In [17]:
from nltk import word_tokenize

In [18]:
df_filtered['totalwords'] = [len(word_tokenize(x)) for x in df_filtered['processed_text'].tolist()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
df_filtered.drop(df_filtered[df_filtered.totalwords < 5].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [20]:
len(df_filtered)

28818

In [22]:
df_filtered.to_pickle('news_tweets_filtered_en.pkl')