# Making Bachelorette Demo Dataset
### Combining Demographics and Elimination Data
### Adding Additional Created Variables

In [1]:
# load up libraries
import pandas as pd
import numpy as np
%matplotlib inline

In [7]:
# bring in data
# read in bachelorette contestant demographic data
ette_demo = pd.read_csv("ette_cont_occ_group.csv")

# read in elimination data
elim = pd.read_csv("bachelorette_538.csv")
elim = elim.drop([0])

# read in bachelorette data
ettes = pd.read_csv("bachelorettes.csv")

In [8]:
# only select bachelorette elimination data
elim_ette = elim[elim.SHOW == 'Bachelorette'].copy()
# keep only numbered seasons
elim_ette = elim_ette[elim_ette.SEASON != 'SEASON'].copy()

elim_ette['SEASON'] = elim_ette['SEASON'].astype(str).astype(int)
elim_ette = elim_ette.merge(ettes, left_on=['SEASON'], right_on=['Season'])

In [9]:
elim_ette.columns

Index(['SHOW', 'SEASON', 'CONTESTANT', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10',
       'DATES-1', 'DATES-2', 'DATES-3', 'DATES-4', 'DATES-5', 'DATES-6',
       'DATES-7', 'DATES-8', 'DATES-9', 'DATES-10', 'Name', 'Age',
       'Occupation', 'Hometown', 'Season'],
      dtype='object')

In [10]:
# setting up data to combine the two sets

# split elim CONTESTANT on "_"
elim_ette[['CON_SEASON', 'CON_FIRST', 'CON_INIT']] = elim_ette['CONTESTANT'].str.split('_',expand=True)
#elim_ette.head()

# split bach CONTESTANT on " ",
ette_demo[['CON_FIRST', 'CON_LAST', 'CON_OTHER']] = ette_demo['Name'].str.split(' ', expand=True)
#ette_demo.head()

In [11]:
# make everything uppercase 
ette_demo.CON_FIRST = ette_demo.CON_FIRST.str.upper()
ette_demo.CON_LAST = ette_demo.CON_LAST.str.upper()
ette_demo.CON_OTHER = ette_demo.CON_OTHER.str.upper()

# keep only first initial of last name
ette_demo['CON_LAST2'] = ette_demo.CON_LAST.str[0]
ette_demo['CON_LAST3'] = ette_demo.CON_LAST.str[1]
ette_demo['CON_OTHER2'] = ette_demo.CON_OTHER.str[0]
# iterate through rows in order to keep correct letter
# logic will be: 
# if CON_LAST2 == '"' or if CON_LAST3 == '.' then keep CON_OTHER2
# if CON_OTHER2 == '(' or if CON_OTHER2.isnull then keep CON_LAST2, otherwise keep CON_OTHER2

ette_demo['INITIAL'] = np.where(ette_demo.CON_OTHER2.isnull or ette_demo.CON_OTHER2 == '(', ette_demo.CON_LAST2, ette_demo.CON_OTHER2)
ette_demo['INITIAL_FINAL'] = np.where((ette_demo.CON_LAST2 == '"') | (ette_demo.CON_LAST3 == '.'), ette_demo.CON_OTHER2, ette_demo.INITIAL)

#ette_demo.INITIAL_FINAL.value_counts()
#ette_demo.head()

In [13]:
# remove excess columns
ette_demo = ette_demo[['Name', 'Age', 'Occupation', 'Agreement_1', 'Hometown', 'ElimWeek', 'Season', 'CON_FIRST', 'INITIAL_FINAL']].copy()
#ette_demo.head()

# make data types on merge columns the same
#print(elim_ette.dtypes)
ette_demo.dtypes
elim_ette['SEASON'] = elim_ette['SEASON'].astype(str).astype(int)

In [14]:
# merge/match on first name, last initial season number
all_data = elim_ette.merge(ette_demo, left_on = ['SEASON', 'CON_FIRST', 'CON_INIT'], right_on = ['Season', 'CON_FIRST', 'INITIAL_FINAL'])
#all_data.head()

In [15]:
# keep seasons with full contestant data
# only keep seasons 4 5 6 8 9 10 11 12
bachelorette = all_data[all_data.SEASON.isin([4, 5, 6, 8, 9, 10, 11, 12])].copy()

In [17]:
bachelorette.columns
bachelorette.head()

Unnamed: 0,SHOW,SEASON,CONTESTANT,ELIMINATION-1,ELIMINATION-2,ELIMINATION-3,ELIMINATION-4,ELIMINATION-5,ELIMINATION-6,ELIMINATION-7,...,CON_FIRST,CON_INIT,Name_y,Age_y,Occupation_y,Agreement_1,Hometown_y,ElimWeek,Season_y,INITIAL_FINAL
0,Bachelorette,12,12_JORDAN_R,R1,,,R,R,,,...,JORDAN,R,Jordan Rodgers,27,Former Pro Quarterback,Sports,"Chico, California",,12,R
1,Bachelorette,12,12_ROBBY_H,,,,,R,,R,...,ROBBY,H,Robby Hayes,27,Former Competitive Swimmer,Sports,"St. Augustine, Florida",10.0,12,H
2,Bachelorette,12,12_CHASE_M,,,R,,,R,,...,CHASE,M,Chase McNary,27,Medical Sales Rep,Sales,"Castle Rock, Colorado",9.0,12,M
3,Bachelorette,12,12_LUKE_P,,,,R,,R,,...,LUKE,P,Luke Pell,31,War Veteran,Military,"Burnet, Texas",8.0,12,P
4,Bachelorette,12,12_JAMES_T,,R,R,,,,E,...,JAMES,T,James Taylor,29,Singer-Songwriter,Entertainment and Media,"Katy, Texas",7.0,12,T


In [18]:
bachelorette = bachelorette[['SHOW', 'SEASON', 'CONTESTANT', 'Name_y', 'Age_y', 'Occupation_y', 'Agreement_1',
       'Hometown_y', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10',
       'DATES-1', 'DATES-2', 'DATES-3', 'DATES-4', 'DATES-5', 'DATES-6',
       'DATES-7', 'DATES-8', 'DATES-9', 'DATES-10', 'Name_x', 'Age_x',
       'Occupation_x', 'Hometown_x']].copy()
bachelorette.columns = ['SHOW', 'SEASON', 'CONTESTANT', 'NAME', 'AGE', 'OCCUPATION', 'OCCUPATION_GROUP',
       'HOMETOWN', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10',
       'DATES-1', 'DATES-2', 'DATES-3', 'DATES-4', 'DATES-5', 'DATES-6',
       'DATES-7', 'DATES-8', 'DATES-9', 'DATES-10', 'ETTE_NAME', 'ETTE_AGE',
       'ETTE_OCCUPATION', 'ETTE_HOMETOWN']
bachelorette.head()

Unnamed: 0,SHOW,SEASON,CONTESTANT,NAME,AGE,OCCUPATION,OCCUPATION_GROUP,HOMETOWN,ELIMINATION-1,ELIMINATION-2,...,DATES-5,DATES-6,DATES-7,DATES-8,DATES-9,DATES-10,ETTE_NAME,ETTE_AGE,ETTE_OCCUPATION,ETTE_HOMETOWN
0,Bachelorette,12,12_JORDAN_R,Jordan Rodgers,27,Former Pro Quarterback,Sports,"Chico, California",R1,,...,D1,D5,D1,D1,D1,D1,"Joelle ""JoJo"" Fletcher",25.0,Real Estate Developer,"Dallas, Texas"
1,Bachelorette,12,12_ROBBY_H,Robby Hayes,27,Former Competitive Swimmer,Sports,"St. Augustine, Florida",,,...,D9,D5,D3,D1,D1,D1,"Joelle ""JoJo"" Fletcher",25.0,Real Estate Developer,"Dallas, Texas"
2,Bachelorette,12,12_CHASE_M,Chase McNary,27,Medical Sales Rep,Sales,"Castle Rock, Colorado",,,...,D9,D2,D3,D1,D1,,"Joelle ""JoJo"" Fletcher",25.0,Real Estate Developer,"Dallas, Texas"
3,Bachelorette,12,12_LUKE_P,Luke Pell,31,War Veteran,Military,"Burnet, Texas",,,...,D9,D5,D1,D1,,,"Joelle ""JoJo"" Fletcher",25.0,Real Estate Developer,"Dallas, Texas"
4,Bachelorette,12,12_JAMES_T,James Taylor,29,Singer-Songwriter,Entertainment and Media,"Katy, Texas",,R,...,D9,D5,D3,,,,"Joelle ""JoJo"" Fletcher",25.0,Real Estate Developer,"Dallas, Texas"


In [19]:
bachelorette.ETTE_OCCUPATION.unique()

array(['Real Estate Developer', 'Dance Instructor',
       'Assistant District Attorney', 'Bridal stylist',
       "Children's hospital event planner", 'Advertising account manager',
       'Interior designer', 'Real estate agent'], dtype=object)

## Add in Rose Variables

In [20]:
#bachelorette['ROSE-1'] = np.where(bachelorette['ELIMINATION-1'] == 'R', 1, 0)
bachelorette['ROSE-2'] = np.where(bachelorette['ELIMINATION-2'] == 'R', 1, 0)
bachelorette['ROSE-3'] = np.where(bachelorette['ELIMINATION-3'] == 'R', 1, 0)
bachelorette['ROSE-4'] = np.where(bachelorette['ELIMINATION-4'] == 'R', 1, 0)
bachelorette['ROSE-5'] = np.where(bachelorette['ELIMINATION-5'] == 'R', 1, 0)
bachelorette['ROSE-6'] = np.where(bachelorette['ELIMINATION-6'] == 'R', 1, 0)
bachelorette['ROSE-7'] = np.where(bachelorette['ELIMINATION-7'] == 'R', 1, 0)
#bachelorette['ROSE-8'] = np.where(bachelorette['ELIMINATION-8'] == 'R', 1, 0)
#bachelorette['ROSE-9'] = np.where(bachelorette['ELIMINATION-9'] == 'R', 1, 0)
#bachelorette['ROSE-10'] = np.where(bachelorette['ELIMINATION-10'] == 'R', 1, 0)

In [21]:
bachelorette['FIR-1'] = np.where(bachelorette['ELIMINATION-1'] == 'R1', 1, 0)
#bachelorette['FIR-2'] = np.where(bachelorette['ELIMINATION-2'] == 'R1', 1, 0)
#bachelorette['FIR-3'] = np.where(bachelorette['ELIMINATION-3'] == 'R1', 1, 0)
#bachelorette['FIR-4'] = np.where(bachelorette['ELIMINATION-4'] == 'R1', 1, 0)
#bachelorette['FIR-5'] = np.where(bachelorette['ELIMINATION-5'] == 'R1', 1, 0)
#bachelorette['FIR-6'] = np.where(bachelorette['ELIMINATION-6'] == 'R1', 1, 0)
#bachelorette['FIR-7'] = np.where(bachelorette['ELIMINATION-7'] == 'R1', 1, 0)
#bachelorette['FIR-8'] = np.where(bachelorette['ELIMINATION-8'] == 'R1', 1, 0)
#bachelorette['FIR-9'] = np.where(bachelorette['ELIMINATION-9'] == 'R1', 1, 0)
#bachelorette['FIR-10'] = np.where(bachelorette['ELIMINATION-10'] == 'R1', 1, 0)

## Add in WINNER column

In [22]:
roses = bachelorette[['SHOW', 'SEASON', 'CONTESTANT', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10']].copy()
roses = pd.melt(roses, id_vars=['SHOW', 'SEASON', 'CONTESTANT'])
roses.head()
winners = roses[roses.value == 'W'].CONTESTANT.tolist()

In [23]:
bachelorette['WINNER'] = np.where(bachelorette.CONTESTANT.isin(winners), 1, 0)

In [24]:
bachelorette['WINNER'].value_counts()
bachelorette.columns

Index(['SHOW', 'SEASON', 'CONTESTANT', 'NAME', 'AGE', 'OCCUPATION',
       'OCCUPATION_GROUP', 'HOMETOWN', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10',
       'DATES-1', 'DATES-2', 'DATES-3', 'DATES-4', 'DATES-5', 'DATES-6',
       'DATES-7', 'DATES-8', 'DATES-9', 'DATES-10', 'ETTE_NAME', 'ETTE_AGE',
       'ETTE_OCCUPATION', 'ETTE_HOMETOWN', 'ROSE-2', 'ROSE-3', 'ROSE-4',
       'ROSE-5', 'ROSE-6', 'ROSE-7', 'FIR-1', 'WINNER'],
      dtype='object')

## Age Related Variables

In [25]:
# Age Difference between Contestant and Bachelorette
bachelorette['AGE_DIFF'] = bachelorette['ETTE_AGE'] - bachelorette['AGE']
# is positive if the bachelorette is older than the contestant
# is negative if the bachelorette is younger than the contestant

# Age Difference between Contestant and Mean of Contestant Pool for the Season
bachelorette = bachelorette.merge(bachelorette[['SEASON', 'AGE']].groupby(['SEASON'], as_index=False).mean(), on='SEASON', suffixes=['', '_MEAN'])
#bach.head()

# Age Difference between Contestant and the mean age of contestants in their season
bachelorette['AGE_DIFF_MEAN_POOL'] = bachelorette['AGE'] - bachelorette['AGE_MEAN']
# is positive if the contestant is older than the mean 
# is negative if the contestant is younger than the mean

# Age Difference between the Bachelorette and the Contestant Pool
bachelorette['AGE_DIFF_BACH_POOL'] = bachelorette['ETTE_AGE'] - bachelorette['AGE_MEAN']

# Create a categorical 'younger', 'same', 'older' variable for bachelor age vs. contestant age
bachelorette['AGE_DIFF_CAT'] = np.where(bachelorette['AGE_DIFF'].isnull(), 4, np.where(bachelorette['AGE_DIFF'] > 0, 2, np.where(bachelorette['AGE_DIFF'] == 0, 1, 0)))
bachelorette['AGE_DIFF_CAT'].value_counts()
# if the age difference is missing, assign value 4, otherwise [missing]
# if the age difference is greater than 0, assign value 2, otherwise [older]
# if the age difference is equal to 0, assign value 1, otherwise [same]
# assign value 0 [younger]


0    107
4     50
2     38
1     12
Name: AGE_DIFF_CAT, dtype: int64

## Geography

In [None]:
# need to split up hometown data...

In [26]:
states = {
        'AK': 'O',
        'AL': 'S',
        'AR': 'S',
        'AS': 'O',
        'AZ': 'W',
        'CA': 'W',
        'CO': 'W',
        'CT': 'N',
        'DC': 'N',
        'DE': 'N',
        'FL': 'S',
        'GA': 'S',
        'GU': 'O',
        'HI': 'O',
        'IA': 'M',
        'ID': 'W',
        'IL': 'M',
        'IN': 'M',
        'KS': 'M',
        'KY': 'S',
        'LA': 'S',
        'MA': 'N',
        'MD': 'N',
        'ME': 'N',
        'MI': 'W',
        'MN': 'M',
        'MO': 'M',
        'MP': 'O',
        'MS': 'S',
        'MT': 'W',
        'NA': 'O',
        'NC': 'S',
        'ND': 'M',
        'NE': 'W',
        'NH': 'N',
        'NJ': 'N',
        'NM': 'W',
        'NV': 'W',
        'NY': 'N',
        'OH': 'M',
        'OK': 'S',
        'OR': 'W',
        'PA': 'N',
        'PR': 'O',
        'RI': 'N',
        'SC': 'S',
        'SD': 'M',
        'TN': 'S',
        'TX': 'S',
        'UT': 'W',
        'VA': 'S',
        'VI': 'O',
        'VT': 'N',
        'WA': 'W',
        'WI': 'M',
        'WV': 'S',
        'WY': 'W'
}

In [None]:
# Binary for same city or state
bachelorette['SAME_CITY'] = np.where(bach['BACHELOR_HOMETOWN_CITY'] == bach['CONTESTANT_HOMETOWN_CITY'], 1, 0)
bachelorette['SAME_STATE'] = np.where(bach['BACHELOR_HOMETOWN_STATE'] == bach['BACHELOR_HOMETOWN_STATE'], 1, 0)

In [None]:
# Convert Bachelor state to region
StateRegion = []
for index, row in bach.iterrows():
    #print(row['c1'], row['c2'])
    if row['BACHELOR_HOMETOWN_STATE'] in states: 
        StateRegion.append(states[row['BACHELOR_HOMETOWN_STATE']])
    else: 
        StateRegion.append(row['BACHELOR_HOMETOWN_STATE'])
        
bach['BACHELOR_REGION'] = StateRegion

In [None]:
# Convert Contestant state to region
StateRegion = []
for index, row in bach.iterrows():
    #print(row['c1'], row['c2'])
    if row['CONTESTANT_HOMETOWN_STATE'] in states: 
        StateRegion.append(states[row['CONTESTANT_HOMETOWN_STATE']])
    else: 
        StateRegion.append(row['CONTESTANT_HOMETOWN_STATE'])
        
bach['CONTESTANT_REGION'] = StateRegion

In [None]:
# binary if bachelor region and contestant region are the same
bach['SAME_REGION'] = np.where(bach['BACHELOR_REGION'] == bach['CONTESTANT_REGION'], 1, 0)