# Making Bachelorette Dataset
### Combining Demographics and Elimination Data
### Adding Additional Created Variables
### Adding Twitter data

In [1]:
# load up libraries
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
# Load the three input tables used to build the dataset.

# contestant demographics (read as latin-1)
ette_demo = pd.read_csv("ette_cont_occ_group.csv", encoding='latin-1')

# weekly elimination / date codes; the row labelled 0 is discarded right away
elim = pd.read_csv("bachelorette_538.csv").drop([0])

# bachelorette-level data (joined to the contestants by season further down)
ettes = pd.read_csv("bachelorettes.csv")

In [3]:
# Keep Bachelorette rows only, and drop rows whose SEASON cell holds the
# literal text 'SEASON' instead of a season number.
is_bachelorette = elim['SHOW'] == 'Bachelorette'
is_literal_season = elim['SEASON'] == 'SEASON'
elim_ette = elim.loc[is_bachelorette & ~is_literal_season].copy()

# cast SEASON to int so it can be matched against the Season key of `ettes`
elim_ette['SEASON'] = elim_ette['SEASON'].astype(str).astype(int)
elim_ette = pd.merge(elim_ette, ettes, left_on=['SEASON'], right_on=['Season'])

In [4]:
# inspect the columns available after joining the bachelorette-level table
elim_ette.columns

Index(['SHOW', 'SEASON', 'CONTESTANT', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10',
       'DATES-1', 'DATES-2', 'DATES-3', 'DATES-4', 'DATES-5', 'DATES-6',
       'DATES-7', 'DATES-8', 'DATES-9', 'DATES-10', 'Name', 'Age',
       'Occupation', 'OCC_AGREE', 'Hometown', 'Season'],
      dtype='object')

In [5]:
# Build the name pieces used to match the two tables.

# Contestant ids look like '14_GARRETT_Y'; split them into three pieces.
# n=2 caps the split at three columns, so an id containing an extra '_'
# cannot produce a fourth column and break the three-column assignment.
elim_ette[['CON_SEASON', 'CON_FIRST', 'CON_INIT']] = elim_ette['CONTESTANT'].str.split('_', n=2, expand=True)

# Split the demographic Name on spaces into first token, second token and
# "everything else". n=2 likewise keeps names with four or more tokens from
# raising a column-count mismatch; the remainder stays together in CON_OTHER.
ette_demo[['CON_FIRST', 'CON_LAST', 'CON_OTHER']] = ette_demo['Name'].str.split(' ', n=2, expand=True)

In [6]:
# Upper-case the name pieces so they compare equal to the upper-case
# contestant ids in the elimination data.
ette_demo['CON_FIRST'] = ette_demo['CON_FIRST'].str.upper()
ette_demo['CON_LAST'] = ette_demo['CON_LAST'].str.upper()
ette_demo['CON_OTHER'] = ette_demo['CON_OTHER'].str.upper()

# Single characters needed to decide which token carries the last initial
ette_demo['CON_LAST2'] = ette_demo['CON_LAST'].str[0]    # 1st char of the second token
ette_demo['CON_LAST3'] = ette_demo['CON_LAST'].str[1]    # 2nd char of the second token
ette_demo['CON_OTHER2'] = ette_demo['CON_OTHER'].str[0]  # 1st char of the remaining text

# Rule 1: if there is no third token, or it starts with '(', take the initial
# from the second token; otherwise take it from the third token.
# (The previous `isnull or ...` never called isnull(): the bound method is
# always truthy, so every row silently received CON_LAST2.)
no_usable_third = ette_demo['CON_OTHER2'].isnull() | (ette_demo['CON_OTHER2'] == '(')
ette_demo['INITIAL'] = np.where(no_usable_third, ette_demo['CON_LAST2'], ette_demo['CON_OTHER2'])

# Rule 2: if the second token starts with '"' or has '.' as its second
# character, take the initial from the third token regardless of rule 1.
second_not_surname = (ette_demo['CON_LAST2'] == '"') | (ette_demo['CON_LAST3'] == '.')
ette_demo['INITIAL_FINAL'] = np.where(second_not_surname, ette_demo['CON_OTHER2'], ette_demo['INITIAL'])

In [7]:
# Keep the demographic fields plus the two name keys (CON_FIRST, INITIAL_FINAL)
# that the merge with the elimination data relies on.
demo_keep = ['Name', 'Age', 'Occupation', 'Agreement_1', 'Hometown', 'ElimWeek',
             'Season', 'CON_FIRST', 'INITIAL_FINAL']
ette_demo = ette_demo.loc[:, demo_keep].copy()

# SEASON is cast to int so it can be compared with ette_demo's Season key
elim_ette['SEASON'] = elim_ette['SEASON'].astype(str).astype(int)

In [8]:
# Inner join of the elimination rows with the demographic rows on
# (season, first name, last initial). Rows without a partner on the other side
# are dropped; several rows sharing the same key on one side multiply the matches.
elim_keys = ['SEASON', 'CON_FIRST', 'CON_INIT']
demo_keys = ['Season', 'CON_FIRST', 'INITIAL_FINAL']
all_data = pd.merge(elim_ette, ette_demo, how='inner', left_on=elim_keys, right_on=demo_keys)

In [9]:
# sanity check: number of contestants per season in the elimination data
elim_ette.SEASON.value_counts()

13    31
5     30
14    28
12    26
11    26
10    25
9     25
8     25
7     25
6     25
4     25
3     25
2     25
1     25
Name: SEASON, dtype: int64

In [10]:
# sanity check: number of contestants per season in the demographic data;
# a season absent here cannot survive the inner merge
ette_demo.Season.value_counts()

13    31
5     30
14    28
12    26
11    26
10    25
9     25
8     25
7     25
6     25
4     25
2     25
1     25
Name: Season, dtype: int64

In [11]:
# sanity check: rows per season after the merge; a count that differs from
# both inputs points at unmatched or duplicated name matches for that season
all_data.SEASON.value_counts()

13    31
5     30
14    28
7     27
12    26
11    26
10    25
9     25
8     25
6     25
4     25
2     25
1     25
Name: SEASON, dtype: int64

In [12]:
# Seasons 3 and 7 are excluded; the seasons retained are
# 1 2 4 5 6 8 9 10 11 12 13 14.
excluded_seasons = [3, 7]
bachelorette = all_data.loc[~all_data['SEASON'].isin(excluded_seasons)].copy()

In [13]:
# confirm that seasons 3 and 7 no longer appear
bachelorette.SEASON.value_counts()

13    31
5     30
14    28
12    26
11    26
10    25
9     25
8     25
6     25
4     25
2     25
1     25
Name: SEASON, dtype: int64

In [14]:
# list the merged columns; names that existed in both inputs carry _x / _y suffixes
bachelorette.columns
#bachelorette.head()

Index(['SHOW', 'SEASON', 'CONTESTANT', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10',
       'DATES-1', 'DATES-2', 'DATES-3', 'DATES-4', 'DATES-5', 'DATES-6',
       'DATES-7', 'DATES-8', 'DATES-9', 'DATES-10', 'Name_x', 'Age_x',
       'Occupation_x', 'OCC_AGREE', 'Hometown_x', 'Season_x', 'CON_SEASON',
       'CON_FIRST', 'CON_INIT', 'Name_y', 'Age_y', 'Occupation_y',
       'Agreement_1', 'Hometown_y', 'ElimWeek', 'Season_y', 'INITIAL_FINAL'],
      dtype='object')

In [15]:
# Pick the columns to carry forward (in output order) and give them their
# final names. After the merge, `_y` columns hold the contestant's
# demographics and `_x` columns hold the bachelorette's details.
elimination_cols = ['ELIMINATION-' + str(wk) for wk in range(1, 11)]
date_cols = ['DATES-' + str(wk) for wk in range(1, 11)]

contestant_pairs = [
    ('Name_y', 'NAME'),
    ('Age_y', 'AGE'),
    ('Occupation_y', 'OCCUPATION'),
    ('Agreement_1', 'OCCUPATION_GROUP'),
    ('Hometown_y', 'HOMETOWN'),
]
ette_pairs = [
    ('Name_x', 'ETTE_NAME'),
    ('Age_x', 'ETTE_AGE'),
    ('Occupation_x', 'ETTE_OCCUPATION'),
    ('OCC_AGREE', 'ETTE_OCCUPATION_GROUP'),
    ('Hometown_x', 'ETTE_HOMETOWN'),
]

ordered_cols = (
    ['SHOW', 'SEASON', 'CONTESTANT']
    + [old for old, _ in contestant_pairs]
    + elimination_cols
    + date_cols
    + [old for old, _ in ette_pairs]
)
bachelorette = bachelorette[ordered_cols].rename(columns=dict(contestant_pairs + ette_pairs))
bachelorette.head()

Unnamed: 0,SHOW,SEASON,CONTESTANT,NAME,AGE,OCCUPATION,OCCUPATION_GROUP,HOMETOWN,ELIMINATION-1,ELIMINATION-2,...,DATES-6,DATES-7,DATES-8,DATES-9,DATES-10,ETTE_NAME,ETTE_AGE,ETTE_OCCUPATION,ETTE_OCCUPATION_GROUP,ETTE_HOMETOWN
0,Bachelorette,14,14_GARRETT_Y,Garrett Yrigoyen,29,Medical Sales Representative,Sales,"Manteca, California",R1,,...,D7,D1,D1,D1,D1,Becca Kufrin,28.0,Publicist,Entertainment and Media,"Prior Lake, Minnesota"
1,Bachelorette,14,14_BLAKE_H,Blake Horstmann,28,Sales Rep,Sales,"Bailey, Colorado",,R,...,D7,D1,D1,D1,D1,Becca Kufrin,28.0,Publicist,Entertainment and Media,"Prior Lake, Minnesota"
2,Bachelorette,14,14_JASON_T,Jason Tartick,29,Senior Corporate Banker,FinanceInvestmentBanking,"Buffalo, New York",,,...,D1,D3,D1,D1,,Becca Kufrin,28.0,Publicist,Entertainment and Media,"Prior Lake, Minnesota"
3,Bachelorette,14,14_COLTON_U,Colton Underwood,26,Former Pro Football Player,Sports,"Washington, Illinois",,,...,D7,D1,D1,,,Becca Kufrin,28.0,Publicist,Entertainment and Media,"Prior Lake, Minnesota"
4,Bachelorette,14,14_WILLS_R,Wills Reid,29,Graphic Designer,Art and Design,"Los Angeles, California",,R,...,D7,D3,,,,Becca Kufrin,28.0,Publicist,Entertainment and Media,"Prior Lake, Minnesota"


In [16]:
# distinct occupations recorded for the bachelorettes in the retained seasons
bachelorette.ETTE_OCCUPATION.unique()

array(['Publicist', 'Attorney', 'Real Estate Developer',
       'Dance Instructor', 'Assistant District Attorney',
       'Bridal stylist', "Children's hospital event planner",
       'Advertising account manager', 'Interior designer',
       'Real estate agent', 'Makeup artist', 'Physical therapist'],
      dtype=object)

In [17]:
# re-check the per-season row counts after the column selection / rename
bachelorette.SEASON.value_counts()

13    31
5     30
14    28
12    26
11    26
10    25
9     25
8     25
6     25
4     25
2     25
1     25
Name: SEASON, dtype: int64

## Add in Rose Variables

In [18]:
# ROSE-<week> is 1 when the contestant's elimination code for that week is 'R', else 0
for wk in range(1, 11):
    got_rose = bachelorette['ELIMINATION-' + str(wk)] == 'R'
    bachelorette['ROSE-' + str(wk)] = np.where(got_rose, 1, 0)

In [19]:
# FIR-<week> is 1 when the contestant's elimination code for that week is 'R1', else 0
for wk in range(1, 11):
    got_r1 = bachelorette['ELIMINATION-' + str(wk)] == 'R1'
    bachelorette['FIR-' + str(wk)] = np.where(got_r1, 1, 0)

## Add in WINNER column

In [20]:
# Reshape the weekly elimination codes to long format (one row per contestant
# and week), then collect every contestant that carries the code 'W'.
id_cols = ['SHOW', 'SEASON', 'CONTESTANT']
weekly_code_cols = ['ELIMINATION-' + str(wk) for wk in range(1, 11)]

roses = bachelorette[id_cols + weekly_code_cols].copy().melt(id_vars=id_cols)
winners = roses.loc[roses['value'] == 'W', 'CONTESTANT'].tolist()

In [21]:
# WINNER is 1 for contestants found in the `winners` list, 0 for everyone else
bachelorette['WINNER'] = bachelorette['CONTESTANT'].isin(winners).astype(int)

In [22]:
# NOTE: a cell only displays its last expression, so the value_counts() result
# is computed but not shown; the column index is what gets rendered.
bachelorette['WINNER'].value_counts()
bachelorette.columns

Index(['SHOW', 'SEASON', 'CONTESTANT', 'NAME', 'AGE', 'OCCUPATION',
       'OCCUPATION_GROUP', 'HOMETOWN', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10',
       'DATES-1', 'DATES-2', 'DATES-3', 'DATES-4', 'DATES-5', 'DATES-6',
       'DATES-7', 'DATES-8', 'DATES-9', 'DATES-10', 'ETTE_NAME', 'ETTE_AGE',
       'ETTE_OCCUPATION', 'ETTE_OCCUPATION_GROUP', 'ETTE_HOMETOWN', 'ROSE-1',
       'ROSE-2', 'ROSE-3', 'ROSE-4', 'ROSE-5', 'ROSE-6', 'ROSE-7', 'ROSE-8',
       'ROSE-9', 'ROSE-10', 'FIR-1', 'FIR-2', 'FIR-3', 'FIR-4', 'FIR-5',
       'FIR-6', 'FIR-7', 'FIR-8', 'FIR-9', 'FIR-10', 'WINNER'],
      dtype='object')

## Add in Top 3 column

In [48]:
# For each season, step through the weeks keeping only contestants whose code
# for that week is blank, 'R' or 'R1'. Whenever exactly three contestants
# remain, the season number is printed and those three are recorded in `top3s`.
seasons = [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14]
still_in_codes = ['R', 'R1']
top3s = []

for season_no in seasons:
    remaining = bachelorette[bachelorette['SEASON'] == season_no].copy()
    for wk in range(1, 11):
        week_codes = remaining['ELIMINATION-' + str(wk)]
        remaining = remaining[week_codes.isnull() | week_codes.isin(still_in_codes)]
        if len(remaining) == 3:
            print(season_no)
            top3s.extend(remaining['CONTESTANT'].unique())

top3s

1
2
4
5
6
8
9
10
11
12
13
14


['01_RYAN_S',
 '01_CHARLIE_M',
 '01_RUSS_X',
 '02_IAN_M',
 '02_MATTHEW_H',
 '02_CHAD_X',
 '04_JESSE_C',
 '04_JASON_M',
 '04_JEREMY_A',
 '05_ED_S',
 '05_KIPTYN_L',
 '05_REID_R',
 '06_ROBERT_M',
 '06_CHRIS_L',
 '06_FRANK_N',
 '08_JEF_H',
 '08_ARIE_L',
 '08_SEAN_L',
 '09_CHRIS_S',
 '09_DREW_K',
 '09_BROOKS_F',
 '10_JOSH_M',
 '10_NICK_V',
 '10_CHRIS_S',
 '11_SHAWN_B',
 '11_NICK_V',
 '11_BEN_H',
 '12_JORDAN_R',
 '12_ROBBY_H',
 '12_CHASE_M',
 '13_BRYAN_A',
 '13_PETER_K',
 '13_ERIC_B',
 '14_GARRETT_Y',
 '14_BLAKE_H',
 '14_JASON_T']

In [49]:
# TOP_THREE is 1 for contestants recorded in `top3s`, 0 otherwise
bachelorette['TOP_THREE'] = bachelorette['CONTESTANT'].isin(top3s).astype(int)

## Age Related Variables

In [50]:
# AGE_DIFF = bachelorette's age minus contestant's age
#   positive -> the bachelorette is older than the contestant
#   negative -> the bachelorette is younger than the contestant
bachelorette['AGE_DIFF'] = bachelorette['ETTE_AGE'] - bachelorette['AGE']

# Attach each season's mean contestant age as AGE_MEAN
season_mean_age = bachelorette[['SEASON', 'AGE']].groupby(['SEASON'], as_index=False).mean()
bachelorette = bachelorette.merge(season_mean_age, on='SEASON', suffixes=['', '_MEAN'])

# Contestant's age relative to the season's contestant pool
#   positive -> older than the pool mean, negative -> younger
bachelorette['AGE_DIFF_MEAN_POOL'] = bachelorette['AGE'] - bachelorette['AGE_MEAN']

# Bachelorette's age relative to the season's contestant pool
bachelorette['AGE_DIFF_BACH_POOL'] = bachelorette['ETTE_AGE'] - bachelorette['AGE_MEAN']

# Categorical version of AGE_DIFF (the first matching rule wins):
#   4 -> age difference missing
#   2 -> AGE_DIFF > 0  (bachelorette older than the contestant)
#   1 -> AGE_DIFF == 0 (same age)
#   0 -> otherwise     (bachelorette younger than the contestant)
age_gap = bachelorette['AGE_DIFF']
bachelorette['AGE_DIFF_CAT'] = np.select(
    [age_gap.isnull(), age_gap > 0, age_gap == 0],
    [4, 2, 1],
    default=0,
)
bachelorette['AGE_DIFF_CAT'].value_counts()


0    133
2     85
4     75
1     23
Name: AGE_DIFF_CAT, dtype: int64

In [51]:
# list the columns after adding the age-related variables
bachelorette.columns

Index(['SHOW', 'SEASON', 'CONTESTANT', 'NAME', 'AGE', 'OCCUPATION',
       'OCCUPATION_GROUP', 'HOMETOWN', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10',
       'DATES-1', 'DATES-2', 'DATES-3', 'DATES-4', 'DATES-5', 'DATES-6',
       'DATES-7', 'DATES-8', 'DATES-9', 'DATES-10', 'ETTE_NAME', 'ETTE_AGE',
       'ETTE_OCCUPATION', 'ETTE_OCCUPATION_GROUP', 'ETTE_HOMETOWN', 'ROSE-1',
       'ROSE-2', 'ROSE-3', 'ROSE-4', 'ROSE-5', 'ROSE-6', 'ROSE-7', 'ROSE-8',
       'ROSE-9', 'ROSE-10', 'FIR-1', 'FIR-2', 'FIR-3', 'FIR-4', 'FIR-5',
       'FIR-6', 'FIR-7', 'FIR-8', 'FIR-9', 'FIR-10', 'WINNER', 'TOP_THREE',
       'AGE_DIFF', 'AGE_MEAN', 'AGE_DIFF_MEAN_POOL', 'AGE_DIFF_BACH_POOL',
       'AGE_DIFF_CAT'],
      dtype='object')

## Geography

In [52]:
# Full state name -> two-letter postal abbreviation for the 50 US states.
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA',
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}

In [53]:
# Split the bachelorette's hometown on ', ' into city, state and a third piece
ette_home_parts = bachelorette['ETTE_HOMETOWN'].str.split(', ', expand=True)
bachelorette[['ETTE_HOMECITY', 'ETTE_HOMESTATE', 'Other']] = ette_home_parts

# Use the postal abbreviation when the state name is a known US state;
# any other value is passed through unchanged.
StateAbbrev = [us_state_abbrev.get(state, state) for state in bachelorette['ETTE_HOMESTATE']]
bachelorette['ETTE_STATESHORT'] = StateAbbrev

In [54]:
# Split the contestant's hometown on ', ' into city and state
home_parts = bachelorette['HOMETOWN'].str.split(', ', expand=True)
bachelorette[['HOMETOWN_CITY', 'HOMETOWN_STATE']] = home_parts

# Use the postal abbreviation when the state name is a known US state;
# any other value is passed through unchanged.
StateAbbrev = [us_state_abbrev.get(state, state) for state in bachelorette['HOMETOWN_STATE']]
bachelorette['HOMESTATE_SHORT'] = StateAbbrev

In [55]:
# Postal abbreviation -> coarse region code used for the SAME_REGION feature.
# Region codes: N = Northeast, S = South, M = Midwest, W = West,
# O = other (Alaska, Hawaii and the non-state codes).
# Fix: Michigan (MI) and Nebraska (NE) were coded 'W'; both are Midwest states
# and are now 'M', consistent with their neighbours (OH/IN/WI, KS/SD/ND).
states = {
        'AK': 'O',
        'AL': 'S',
        'AR': 'S',
        'AS': 'O',
        'AZ': 'W',
        'CA': 'W',
        'CO': 'W',
        'CT': 'N',
        'DC': 'N',
        'DE': 'N',
        'FL': 'S',
        'GA': 'S',
        'GU': 'O',
        'HI': 'O',
        'IA': 'M',
        'ID': 'W',
        'IL': 'M',
        'IN': 'M',
        'KS': 'M',
        'KY': 'S',
        'LA': 'S',
        'MA': 'N',
        'MD': 'N',
        'ME': 'N',
        'MI': 'M',  # was 'W'
        'MN': 'M',
        'MO': 'M',
        'MP': 'O',
        'MS': 'S',
        'MT': 'W',
        'NA': 'O',
        'NC': 'S',
        'ND': 'M',
        'NE': 'M',  # was 'W'
        'NH': 'N',
        'NJ': 'N',
        'NM': 'W',
        'NV': 'W',
        'NY': 'N',
        'OH': 'M',
        'OK': 'S',
        'OR': 'W',
        'PA': 'N',
        'PR': 'O',
        'RI': 'N',
        'SC': 'S',
        'SD': 'M',
        'TN': 'S',
        'TX': 'S',
        'UT': 'W',
        'VA': 'S',
        'VI': 'O',
        'VT': 'N',
        'WA': 'W',
        'WI': 'M',
        'WV': 'S',
        'WY': 'W'
}

In [56]:
# 1 when the bachelorette and the contestant share a hometown city / state, else 0.
# NOTE(review): the city comparison looks at the city name only, so equally
# named cities in different states would count as the same city.
same_city = bachelorette['ETTE_HOMECITY'].eq(bachelorette['HOMETOWN_CITY'])
same_state = bachelorette['ETTE_STATESHORT'].eq(bachelorette['HOMESTATE_SHORT'])
bachelorette['SAME_CITY'] = same_city.astype(int)
bachelorette['SAME_STATE'] = same_state.astype(int)

In [57]:
# Map the bachelorette's state abbreviation to its region code; values that are
# not keys of `states` are kept as they are.
StateRegion = [states.get(abbrev, abbrev) for abbrev in bachelorette['ETTE_STATESHORT']]
bachelorette['ETTE_REGION'] = StateRegion

In [58]:
# Map the contestant's state abbreviation to its region code; values that are
# not keys of `states` are kept as they are.
StateRegion = [states.get(abbrev, abbrev) for abbrev in bachelorette['HOMESTATE_SHORT']]
bachelorette['REGION'] = StateRegion

In [59]:
# 1 when the bachelorette's region and the contestant's region are equal, else 0
regions_match = bachelorette['ETTE_REGION'].eq(bachelorette['REGION'])
bachelorette['SAME_REGION'] = regions_match.astype(int)

## Occupation

In [60]:
#bachelorette['ETTE_OCCUPATION'].unique()

In [61]:
# The bachelorette's occupation group is read from ETTE_OCCUPATION_GROUP.
# (An earlier, commented-out hand-written mapping into ETTE_OCCGROUP was not in use.)
# 1 when contestant and bachelorette fall in the same occupation group, else 0
groups_match = bachelorette['OCCUPATION_GROUP'].eq(bachelorette['ETTE_OCCUPATION_GROUP'])
bachelorette['SAME_OCCUPATIONGROUP'] = groups_match.astype(int)

## Cut down to only necessary variables

In [62]:
# preview the first rows with all engineered columns attached
bachelorette.head()

Unnamed: 0,SHOW,SEASON,CONTESTANT,NAME,AGE,OCCUPATION,OCCUPATION_GROUP,HOMETOWN,ELIMINATION-1,ELIMINATION-2,...,ETTE_STATESHORT,HOMETOWN_CITY,HOMETOWN_STATE,HOMESTATE_SHORT,SAME_CITY,SAME_STATE,ETTE_REGION,REGION,SAME_REGION,SAME_OCCUPATIONGROUP
0,Bachelorette,14,14_GARRETT_Y,Garrett Yrigoyen,29,Medical Sales Representative,Sales,"Manteca, California",R1,,...,MN,Manteca,California,CA,0,0,M,W,0,0
1,Bachelorette,14,14_BLAKE_H,Blake Horstmann,28,Sales Rep,Sales,"Bailey, Colorado",,R,...,MN,Bailey,Colorado,CO,0,0,M,W,0,0
2,Bachelorette,14,14_JASON_T,Jason Tartick,29,Senior Corporate Banker,FinanceInvestmentBanking,"Buffalo, New York",,,...,MN,Buffalo,New York,NY,0,0,M,N,0,0
3,Bachelorette,14,14_COLTON_U,Colton Underwood,26,Former Pro Football Player,Sports,"Washington, Illinois",,,...,MN,Washington,Illinois,IL,0,0,M,M,1,0
4,Bachelorette,14,14_WILLS_R,Wills Reid,29,Graphic Designer,Art and Design,"Los Angeles, California",,R,...,MN,Los Angeles,California,CA,0,0,M,W,0,0


In [63]:
# full column list, used to pick the variables kept in the final table
bachelorette.columns

Index(['SHOW', 'SEASON', 'CONTESTANT', 'NAME', 'AGE', 'OCCUPATION',
       'OCCUPATION_GROUP', 'HOMETOWN', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10',
       'DATES-1', 'DATES-2', 'DATES-3', 'DATES-4', 'DATES-5', 'DATES-6',
       'DATES-7', 'DATES-8', 'DATES-9', 'DATES-10', 'ETTE_NAME', 'ETTE_AGE',
       'ETTE_OCCUPATION', 'ETTE_OCCUPATION_GROUP', 'ETTE_HOMETOWN', 'ROSE-1',
       'ROSE-2', 'ROSE-3', 'ROSE-4', 'ROSE-5', 'ROSE-6', 'ROSE-7', 'ROSE-8',
       'ROSE-9', 'ROSE-10', 'FIR-1', 'FIR-2', 'FIR-3', 'FIR-4', 'FIR-5',
       'FIR-6', 'FIR-7', 'FIR-8', 'FIR-9', 'FIR-10', 'WINNER', 'TOP_THREE',
       'AGE_DIFF', 'AGE_MEAN', 'AGE_DIFF_MEAN_POOL', 'AGE_DIFF_BACH_POOL',
       'AGE_DIFF_CAT', 'ETTE_HOMECITY', 'ETTE_HOMESTATE', 'Other',
       'ETTE_STATESHORT', 'HOMETOWN_CITY', 'HOMETOWN_STATE', 'HOMESTATE_SHORT',
       'SAME_CITY', 'SAME_STATE'

In [65]:
# Final modelling table: identifiers, the four families of weekly columns
# (weeks 1-10 each), the engineered comparison features and the two targets.
id_cols = ['SHOW', 'SEASON', 'ETTE_NAME', 'CONTESTANT', 'NAME', 'AGE']
weekly_cols = [
    prefix + str(wk)
    for prefix in ('ELIMINATION-', 'DATES-', 'FIR-', 'ROSE-')
    for wk in range(1, 11)
]
feature_cols = ['AGE_DIFF', 'AGE_DIFF_MEAN_POOL', 'AGE_DIFF_CAT',
                'SAME_CITY', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP']
target_cols = ['TOP_THREE', 'WINNER']

ette_final = bachelorette[id_cols + weekly_cols + feature_cols + target_cols].copy()

## Need to change how DATES are formatted

In [66]:
# Drop the first character of every DATES-<week> value (e.g. 'D7' -> '7')
for wk in range(1, 11):
    dates_col = 'DATES-' + str(wk)
    ette_final[dates_col] = ette_final[dates_col].str[1:]

In [67]:
# Convert every DATES-<week> column to numbers; values that cannot be parsed become NaN
for wk in range(1, 11):
    dates_col = 'DATES-' + str(wk)
    ette_final[dates_col] = pd.to_numeric(ette_final[dates_col], errors='coerce')

In [68]:
# Spot-check one engineered feature at a time, including missing values.
# Candidates: 'AGE', 'AGE_DIFF', 'AGE_DIFF_MEAN_POOL', 'AGE_DIFF_CAT', 'SAME_CITY', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP'
ette_final.SAME_OCCUPATIONGROUP.value_counts(dropna=False)

0    293
1     23
Name: SAME_OCCUPATIONGROUP, dtype: int64

Not using AGE_DIFF, AGE_DIFF_MEAN_POOL because of NaNs
Not using SAME_CITY because they are all zeros

In [69]:
# Replace missing values with 0 in the DATES, FIR and ROSE columns of every week
for n in range(1, 11):
    for prefix in ('DATES-', 'FIR-', 'ROSE-'):
        weekly_col = prefix + str(n)
        ette_final[weekly_col] = ette_final[weekly_col].fillna(0)

### Add in Twitter Sentiment Data

In [None]:
# load the Twitter feature table (merged with ette_final on name and season below)
sentiment = pd.read_csv('twitterfeatures3.csv')

In [None]:
# (disabled) restrict the Twitter table to seasons with matching data:
#sentiment = sentiment[(sentiment.Season != 13)&(sentiment.Season != 14)].copy()

# row counts before the join, for comparison with the merged size printed last
print(len(sentiment))
print(len(ette_final))

# inner join on contestant name and season: rows without a partner in the
# other table are dropped
ette_all = pd.merge(
    ette_final,
    sentiment,
    how='inner',
    left_on=['NAME', 'SEASON'],
    right_on=['Name', 'Season'],
)

print(len(ette_all))

In [None]:
# NOTE: only the last expression is displayed, so the unique() result is
# computed but not shown; the column index is what gets rendered.
ette_all.SEASON.unique()
ette_all.columns

In [None]:
# Keep only the variables of interest, in a fixed order: identifiers, the weekly
# show columns, the weekly Twitter columns, engineered features and targets.
# The week-1 Twitter columns ('1.0-Negative', '1.0-Neutral', '1.0-Positive',
# '1.0-Total', '1.0-choose', '1.0-win') are not selected.
id_cols = ['SHOW', 'SEASON', 'ETTE_NAME', 'CONTESTANT', 'NAME', 'AGE']
weekly_cols = [
    prefix + str(wk)
    for prefix in ('ELIMINATION-', 'DATES-', 'FIR-', 'ROSE-')
    for wk in range(1, 11)
]
twitter_cols = [
    '{}.0-{}'.format(wk, metric)
    for wk in (10, 11, 2, 3, 4, 5, 6, 7, 8, 9)
    for metric in ('Negative', 'Neutral', 'Positive', 'Total', 'choose', 'win')
]
tail_cols = ['AGE_DIFF', 'AGE_DIFF_MEAN_POOL', 'AGE_DIFF_CAT',
             'SAME_CITY', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP',
             'TOP_THREE', 'WINNER']

ette_all = ette_all[id_cols + weekly_cols + twitter_cols + tail_cols].copy()

In [None]:
# check the dtype of every retained column
ette_all.dtypes

In [None]:
## fill missing data with zeros
# Applies to the Twitter columns of weeks 2-11; the week-1 columns
# ('1.0-Negative', '1.0-Neutral', '1.0-Positive', '1.0-Total', '1.0-choose',
# '1.0-win') are not touched.
twitter_cols = [
    '{}.0-{}'.format(wk, metric)
    for wk in (10, 11, 2, 3, 4, 5, 6, 7, 8, 9)
    for metric in ('Negative', 'Neutral', 'Positive', 'Total', 'choose', 'win')
]
ette_all[twitter_cols] = ette_all[twitter_cols].fillna(0)

### Create Condensed Past/Future Variable

In [None]:
def past_current(newvarhead, full_set, refvarhead, first_week=2, last_week=11):
    """Add running "past" averages of a weekly variable to ``full_set``.

    For every week ``w`` from ``first_week + 1`` to ``last_week`` a column
    ``newvarhead + str(w + 1)`` is created:

    * for the first step it is the mean of the ``first_week`` and
      ``first_week + 1`` reference columns;
    * afterwards it is the mean of the previous "past" column
      (``newvarhead + str(w)``) and the week-``w`` reference column, so older
      weeks are halved in weight at every step.

    With the defaults this creates ``<newvarhead>4`` ... ``<newvarhead>12``
    from the reference columns of weeks 2 ... 11.

    Parameters
    ----------
    newvarhead : str
        Prefix of the columns to create, e.g. ``'PAST_POS_'``.
    full_set : pandas.DataFrame
        Table holding the reference columns. It is modified in place.
    refvarhead : str
        Suffix of the reference columns; a reference column is named
        ``str(week) + refvarhead``, e.g. ``'.0-Positive'`` -> ``'2.0-Positive'``.
    first_week : int, optional
        First week that has a reference column (default 2).
    last_week : int, optional
        Last week whose reference column is folded in (default 11).

    Returns
    -------
    pandas.DataFrame
        The same ``full_set`` object, with the new columns added. Nothing is
        added when ``last_week <= first_week``.

    Raises
    ------
    KeyError
        If a required reference column is missing from ``full_set``.
    """
    first_step = first_week + 1
    for week in range(first_step, last_week + 1):
        current = full_set[str(week) + refvarhead]
        if week == first_step:
            # seed the recursion with the earliest available week
            previous = full_set[str(first_week) + refvarhead]
        else:
            # running average produced by the previous iteration
            previous = full_set[newvarhead + str(week)]
        full_set[newvarhead + str(week + 1)] = (previous + current) / 2
    return full_set

In [None]:
# Build the running "past" columns for each Twitter measure:
# (prefix of the new columns, suffix of the weekly reference columns)
past_specs = [
    ('PAST_POS_', ".0-Positive"),
    ('PAST_NEG_', ".0-Negative"),
    ('PAST_NEU_', ".0-Neutral"),
    ('PAST_TOT_', ".0-Total"),
    ('PAST_WIN_', ".0-win"),
    ('PAST_CHOOSE_', ".0-choose"),
]
for new_prefix, ref_suffix in past_specs:
    ette_all = past_current(new_prefix, ette_all, ref_suffix)

In [70]:
# write out to csv file
# NOTE(review): to_csv also writes the row index as an unnamed first column
# unless index=False is passed - confirm downstream readers expect that.
ette_all.to_csv('bachelorette_forprediction.csv')