In [22]:
# Prepare data for regressing engagement on sponsorship

import pandas as pd
from pathlib import Path
from ast import literal_eval
import re

# Let pandas display up to 500 rows before truncating a frame's output
pd.set_option('display.max_rows', 500)

In [None]:
# Directories: the project root is two levels above the working directory,
# and the data folder sits directly under the root
cwd = Path.cwd()
root = cwd.joinpath('..', '..')
data = root.joinpath('data')

In [None]:
# Post data
# Replace NaN with the string '[]' in the columns that are converted to lists later,
# so that conversion never sees a missing value
posts = (
    pd.read_csv(data / 'data_initial_regressions.csv')
      .fillna(dict.fromkeys(['caption_hashtags',
                             'sponsors',
                             'caption_mention',
                             'owner_comment_hashtags'], '[]'))
)
posts

In [None]:
# Give two columns more descriptive names:
#   "sponsored" -> "paid_partnership": true if the post has an explicit
#                  "paid partnership with @xyz" disclosure
#   "sponsors"  -> "paid_partners": the list of partners
posts = posts.rename(columns=dict(sponsored='paid_partnership',
                                  sponsors='paid_partners'))
posts

In [None]:
# Parse columns whose values are lists stored as strings.
# Only strings are passed to literal_eval: it raises ValueError on a value that is
# already a list, so without the guard this cell could not be run a second time.
for col in ['caption_hashtags', 'caption_mention', 'paid_partners', 'owner_comment_hashtags']:
    posts[col] = posts[col].apply(lambda v: literal_eval(v) if isinstance(v, str) else v)

In [20]:
# Terms to use to classify sponsored posts

# Terms that must match exactly. Matching all words that start with these terms would
# give a lot of false positives
exact_match = ['ad', 'sp', 'spon']

# Terms to match at the beginning. We don't want to match foodnetwork, postworkout, preworkout, etc.
start_match = ['work']

# Terms to match anywhere
any_match = ['partner', 'sponsor', 'paid']

In [51]:
# Determine whether a word w indicates a sponsored post
def classify_sponsored_word(w, exact_terms=None, start_terms=None, any_terms=None):
    """Return True if the word ``w`` matches any of the sponsorship terms.

    Matching is case-insensitive and terms are treated as literal text.

    Parameters
    ----------
    w : str
        The word (e.g. a hashtag) to classify.
    exact_terms : list of str, optional
        Terms that must equal the whole word. Defaults to the notebook-level
        ``exact_match`` list.
    start_terms : list of str, optional
        Terms that must appear at the beginning of the word. Defaults to the
        notebook-level ``start_match`` list.
    any_terms : list of str, optional
        Terms that may appear anywhere in the word. Defaults to the
        notebook-level ``any_match`` list.

    Returns
    -------
    bool
        True if at least one rule matches, False otherwise.
    """
    def _alternation(terms):
        # Joining an empty list gives the empty pattern, which matches every string;
        # return None so that an empty term list matches nothing instead.
        if not terms:
            return None
        return re.compile('|'.join(re.escape(t) for t in terms), re.IGNORECASE)

    # Fall back to the notebook-level lists only when no explicit list was passed
    re_exact = _alternation(exact_match if exact_terms is None else exact_terms)
    re_start = _alternation(start_match if start_terms is None else start_terms)
    re_any = _alternation(any_match if any_terms is None else any_terms)

    return bool(
        (re_exact and re_exact.fullmatch(w))
        or (re_start and re_start.match(w))
        # search stops at the first hit; there is no need to collect every match
        or (re_any and re_any.search(w))
    )

In [52]:
# For every post, keep only the caption hashtags that classify_sponsored_word flags
posts['caption_hashtags_sponsored'] = [
    list(filter(classify_sponsored_word, tags))
    for tags in posts['caption_hashtags']
]
posts[['caption_hashtags', 'caption_hashtags_sponsored']]

Unnamed: 0,caption_hashtags,caption_hashtags_sponsored
0,[],[]
1,"[letsgo, foodblogger, concertblogger, photodum...",[]
2,"[hellomarch, march, dayinthelife, yearinreview...",[]
3,"[oatmeal, oats, bakedoats, healthybreakfast, b...",[]
4,"[plantbased, plantbasedfood, plantpower, vegan...",[]
5,[girldad],[]
6,"[cheese, macandcheese, nycfoodcoma]",[]
7,"[healthylifestyle, balancedlifestyle, focusony...",[]
8,"[smorescookies, smoresseason, oatmealcookies, ...",[]
9,"[overnightoats, porridge, oatmeal, frühstücksr...",[]
