# This file includes all the helper functions used in data_processing.ipynb.

In [2]:
import math
import re

# Section 2: Process

In [None]:
# Regular-expression fragments, one per kind of element that can show up in a
# tweet. They are tried in list order, so earlier fragments take precedence.
regex_str = [
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)+',  # anything else
]

# One alternation over every fragment above, wrapped in a single capturing
# group so that findall() yields plain strings.
tokens_re = re.compile(
    '({})'.format('|'.join(regex_str)),
    re.IGNORECASE | re.VERBOSE,
)

In [None]:
# Token classifiers, compiled once. They are matched against the lower-cased
# form of each token, which is why none of them needs re.IGNORECASE.
_HTML_TOKEN_RE = re.compile(r'<[^>]+>')
_MENTION_TOKEN_RE = re.compile(r'(?:@[\w_]+)')
_URL_TOKEN_RE = re.compile(
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
)
_HASHTAG_TOKEN_RE = re.compile(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)")
# "@user" and ":" are always emitted in this exact form; only the "rt" marker
# can keep its original case (when lowercase=False), hence the scoped flag.
_RETWEET_PREFIX_RE = re.compile(r'(?i:rt) @user : ')


def preprocess(s, lowercase=True):
    """
    This function preprocesses a text and deals with urls, mentions, and hashtags

    HTML tags and hashtags are removed, every @-mention becomes '@user' and
    every URL becomes '!url'. A text in which an 'rt' token is immediately
    followed by a mention is treated as a retweet and yields ''.

    Inputs:
    s (string): text to preprocess; None or a float NaN is treated as missing
    lowercase (boolean): if True then convert string to all lowercase

    Outputs:
    preprocessed (string): preprocessed string, or NaN when s is missing
    """
    if not isinstance(s, str):
        # Missing values propagate as NaN. Any other non-string falls through
        # to tokenize(), which raises TypeError as before.
        if s is None or math.isnan(s):
            return math.nan

    tokens = []
    for token in tokenize(s):
        # Classification is always done on the lower-cased token so that it
        # does not depend on the `lowercase` flag.
        lowered = token.lower()
        if _HTML_TOKEN_RE.match(lowered):
            continue
        if _MENTION_TOKEN_RE.match(lowered):
            tokens.append('@user')
        elif _URL_TOKEN_RE.match(lowered):
            tokens.append('!url')
        elif _HASHTAG_TOKEN_RE.match(lowered):
            # Keep an empty placeholder: a hashtag sitting between 'rt' and a
            # mention must still break the retweet check below.
            tokens.append('')
        else:
            tokens.append(lowered if lowercase else token)

    # A retweet is an 'rt' token directly followed by a mention.
    is_retweet = any(
        first.lower() == 'rt' and second == '@user'
        for first, second in zip(tokens, tokens[1:])
    )
    if is_retweet:
        return ''

    joined = ' '.join(t for t in tokens if t)
    # Strip any remaining "rt @user : " marker (plain substring removal).
    preprocessed = _RETWEET_PREFIX_RE.sub('', joined)

    return preprocessed

In [None]:
def tokenize(s):
    """
    This function splits a single string into tokens

    Inputs:
    s (string): text to tokenize

    Outputs:
    tokenized (list): every match of the module-level tokens_re pattern in s,
    in order of appearance
    """
    return tokens_re.findall(s)

# Section 3: Exclusions

In [1]:
def apply_exclusions(df):
    """
    This function applies relevant exclusions to the input dataframe

    Rows are dropped when any column whose name starts with 'CRT' is NaN or a
    single space, when 'screen_name' is NaN or a single space, when 'age' is
    below 18, or when 'followers_count' is 7,000 or more. The input dataframe
    itself is not modified.

    Inputs:
    df (pandas df): input dataframe

    Outputs:
    df (pandas df): dataframe with all exclusions applied
    """

    # drop NaN and empty CRT score
    for c in df.columns:
        if c.startswith('CRT'):
            # Index with the loop variable: attribute access (df.c) would look
            # up a column literally named "c" instead of the CRT column.
            df = df[df[c].notna()]
            df = df[df[c] != ' ']

    # drop participants with NaN or empty screen name
    df = df[df['screen_name'].notna()]
    df = df[df['screen_name'] != ' ']

    # drop participants with age < 18
    df = df[df['age'] >= 18]

    # drop participants with followers >= 7,000 followers
    df = df[df['followers_count'] < 7000]

    return df

In [3]:
def is_ordered(df_1, df_2):
    """
    This function checks if the order of text in df_1 is the same as that of df_2

    Inputs:
    df_1 (pandas df): dataframe from t2v_single.txt
    df_2 (pandas df): dataframe from t2v_mappings.txt

    Outputs:
    is_ordered (boolean): if True, the 'text' columns of df_1 and df_2 compare
    equal according to Series.equals
    """
    left_text = df_1['text']
    right_text = df_2['text']
    return left_text.equals(right_text)