## Extracting Tweet Metadata <a class="anchor"  id="chapter4"></a>

In [3]:
def add_tweet_metadata(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extracts metadata from a DataFrame of tweets and adds the information as columns to the DataFrame.

    The DataFrame is modified in place and is also returned. The added columns are
    ``word_count``, ``unique_word_count``, ``character_count``, ``hashtag_count``,
    ``mention_count``, ``url_count``, ``capitalized_word_count`` and
    ``capitalized_word_proportion``.

    Parameters:
        df (pandas.DataFrame): The DataFrame of tweets to add metadata to. It must
            have a 'text' column whose values are strings.

    Returns:
        pandas.DataFrame: The original DataFrame with added columns for tweet metadata.

    """
    # Compile every pattern once instead of re-resolving it for each tweet.
    # A "word" is a run of word characters that is not directly preceded by
    # '#' or '@', so hashtag and mention tokens are skipped.
    # NOTE: the pieces of a URL ("http", "t", "co", ...) are NOT skipped by this
    # pattern and are therefore included in the word counts.
    word_re = re.compile(r'\b(?<![#@])\w+\b')
    hashtag_re = re.compile(r'#\w+')
    mention_re = re.compile(r'@\w+')
    url_re = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # An uppercase letter followed only by lowercase letters up to the next word
    # boundary: matches "Forest" or "A", but not all-caps words such as "SAFETY".
    capitalized_re = re.compile(r'\b[A-Z][a-z]*\b')

    text = df['text']

    def count_matches(pattern):
        """Return, for every tweet, the number of non-overlapping matches of pattern."""
        return text.apply(lambda tweet: len(pattern.findall(tweet)))

    # Number of words in each tweet (excluding hashtags and mentions)
    df['word_count'] = count_matches(word_re)

    # Number of distinct words in each tweet (case-sensitive, same word definition)
    df['unique_word_count'] = text.apply(lambda tweet: len(set(word_re.findall(tweet))))

    # Number of characters
    df['character_count'] = text.apply(len)

    # Number of hashtags
    df['hashtag_count'] = count_matches(hashtag_re)

    # Number of mentions
    df['mention_count'] = count_matches(mention_re)

    # Number of URLs
    df['url_count'] = count_matches(url_re)

    # Number of capitalized words
    df['capitalized_word_count'] = count_matches(capitalized_re)

    # Proportion of capitalized words. A tweet with no counted words (empty text,
    # or only hashtags/mentions) would otherwise divide by zero and produce
    # NaN or inf; the proportion is defined as 0.0 for such tweets.
    word_counts = df['word_count']
    proportion = df['capitalized_word_count'] / word_counts.where(word_counts != 0)
    df['capitalized_word_proportion'] = proportion.fillna(0.0)

    return df

# Add the metadata columns to df. The function modifies df in place, so the
# return value does not need to be assigned; as the last expression of the
# cell, the returned DataFrame is displayed.
add_tweet_metadata(df)

Unnamed: 0,id,keyword,location,text,target,word_count,unique_word_count,character_count,hashtag_count,mention_count,url_count,capitalized_word_count,capitalized_word_proportion
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0,12,12,69,1,0,0,5,0.416667
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,7,7,38,0,0,0,5,0.714286
2,5,,,All residents asked to 'shelter in place' are ...,1.0,22,18,133,0,0,0,2,0.090909
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,8,8,65,1,0,0,1,0.125000
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0,14,13,88,2,0,0,3,0.214286
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,,8,7,55,0,0,0,0,0.000000
3259,10865,,,Storm in RI worse than last hurricane. My city...,,25,24,139,0,0,0,4,0.160000
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,,9,9,55,0,0,1,3,0.333333
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,,10,10,65,0,0,1,3,0.300000
