# Data Consolidation
#### Building a Dataset 

In [1]:
# library load
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
# Load the elimination table. The first data row (index label 0) merely
# repeats the header information, so it is dropped right after reading.
elim_data = pd.read_csv('bachelorette_538.csv').drop(index=0)
elim_data.head()

Unnamed: 0,SHOW,SEASON,CONTESTANT,ELIMINATION-1,ELIMINATION-2,ELIMINATION-3,ELIMINATION-4,ELIMINATION-5,ELIMINATION-6,ELIMINATION-7,...,DATES-1,DATES-2,DATES-3,DATES-4,DATES-5,DATES-6,DATES-7,DATES-8,DATES-9,DATES-10
1,Bachelorette,13,13_BRYAN_A,R1,,,R,R,,R,...,,,D6,D13,D1,D7,D1,D1,D1,D1
2,Bachelorette,13,13_PETER_K,,R,,,,R,R,...,,D1,D6,D13,D9,D7,D1,D1,D1,D1
3,Bachelorette,13,13_ERIC_B,,,R,,,R,R,...,,D10,D8,D13,D9,D1,D3,D1,D1,
4,Bachelorette,13,13_DEAN_U,,R,,R,,,R,...,,D8,D8,D1,D9,D7,D1,D1,,
5,Bachelorette,13,13_ADAM_G,,,,,,,ED,...,,D10,D8,D13,D9,D7,D3,,,


In [3]:
# Reshape from wide form to long form.
# Target layout: SHOW, SEASON, CONTESTANT, 'TYPE', 'VALUE',
# where 'TYPE' holds what are currently the column names.
id_columns = ['SHOW', 'SEASON', 'CONTESTANT']
elim1 = elim_data.melt(id_vars=id_columns)
elim1.head()

Unnamed: 0,SHOW,SEASON,CONTESTANT,variable,value
0,Bachelorette,13,13_BRYAN_A,ELIMINATION-1,R1
1,Bachelorette,13,13_PETER_K,ELIMINATION-1,
2,Bachelorette,13,13_ERIC_B,ELIMINATION-1,
3,Bachelorette,13,13_DEAN_U,ELIMINATION-1,
4,Bachelorette,13,13_ADAM_G,ELIMINATION-1,


In [4]:
# Give melt's default 'variable' / 'value' columns their final names;
# the three identifier columns keep their existing names.
elim1 = elim1.rename(columns={'variable': 'TYPE', 'value': 'VALUE'})
elim1.head()

Unnamed: 0,SHOW,SEASON,CONTESTANT,TYPE,VALUE
0,Bachelorette,13,13_BRYAN_A,ELIMINATION-1,R1
1,Bachelorette,13,13_PETER_K,ELIMINATION-1,
2,Bachelorette,13,13_ERIC_B,ELIMINATION-1,
3,Bachelorette,13,13_DEAN_U,ELIMINATION-1,
4,Bachelorette,13,13_ADAM_G,ELIMINATION-1,


In [5]:
# Break TYPE (e.g. 'ELIMINATION-1') at the hyphen into its label and its number.
type_parts = elim1['TYPE'].str.split('-', expand=True)
elim1['ELIM_DATE'] = type_parts[0]
elim1['NUMBER'] = type_parts[1]

In [6]:
# Preview the first rows of elim1, which now carries the ELIM_DATE and NUMBER columns
elim1.head()

Unnamed: 0,SHOW,SEASON,CONTESTANT,TYPE,VALUE,ELIM_DATE,NUMBER
0,Bachelorette,13,13_BRYAN_A,ELIMINATION-1,R1,ELIMINATION,1
1,Bachelorette,13,13_PETER_K,ELIMINATION-1,,ELIMINATION,1
2,Bachelorette,13,13_ERIC_B,ELIMINATION-1,,ELIMINATION,1
3,Bachelorette,13,13_DEAN_U,ELIMINATION-1,,ELIMINATION,1
4,Bachelorette,13,13_ADAM_G,ELIMINATION-1,,ELIMINATION,1


### This is the information we need to model 'survival'

# Need to Bring In the Contestant & Bachelor/ette Data
#### Modify region/state and occupation
### Then combine them to create a possible demographic predictive set
#### And make some other possibly useful variables

In [12]:
# Read the four cast tables: the leads (bachelors / bachelorettes) and the
# contestants of each show.
bach, bach_cont, ette, ette_cont = (
    pd.read_csv(csv_name)
    for csv_name in (
        "bachelors.csv",
        "bachelor-contestants.csv",
        "bachelorettes.csv",
        "bachelorette-contestants.csv",
    )
)

In [14]:
# Lookup table: full U.S. state name -> two-letter abbreviation.
# Holds the 50 states only, grouped below by first letter; any other name
# (e.g. a Canadian province or a foreign country) has no entry.
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA',
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}

### Bachelors First

In [11]:
# Preview the first rows of the bachelors table read from bachelors.csv
bach.head()

Unnamed: 0,Name,Age,Hometown,Height,Season,City,State
0,Alex Michel,32,"Charlottesville, Virginia",,1,Charlottesville,Virginia
1,Aaron Buerge,28,"Butler, Missouri",,2,Butler,Missouri
2,Jesse Palmer,34,"Toronto, Ontario",,5,Toronto,Ontario
3,Lorenzo Borghese,34,"Milan, Italy",,9,Milan,Italy
4,Andy Baldwin,30,"Lancaster, Pennsylvania",,10,Lancaster,Pennsylvania


In [13]:
# Split Hometown ("City, State") into its city and state/region parts.
# n=1 splits on the first ", " only, so the result never has more than two
# columns: a hometown with extra commas (e.g. "City, Region, Country") keeps
# everything after the first separator in Hometown_State instead of making
# the two-column assignment fail with a length mismatch.
hometown_parts = bach['Hometown'].str.split(', ', n=1, expand=True)
bach[['Hometown_City', 'Hometown_State']] = hometown_parts
bach.head()

Unnamed: 0,Name,Age,Hometown,Height,Season,Hometown_City,Hometown_State
0,Alex Michel,32,"Charlottesville, Virginia",,1,Charlottesville,Virginia
1,Aaron Buerge,28,"Butler, Missouri",,2,Butler,Missouri
2,Jesse Palmer,34,"Toronto, Ontario",,5,Toronto,Ontario
3,Lorenzo Borghese,34,"Milan, Italy",,9,Milan,Italy
4,Andy Baldwin,30,"Lancaster, Pennsylvania",,10,Lancaster,Pennsylvania


In [16]:
# Convert Hometown_State to its U.S. state abbreviation where one exists.
# dict.get(state, state) returns the abbreviation for a known state name and
# otherwise hands back the original value unchanged, so non-U.S. entries
# (e.g. 'Ontario', 'Italy') and missing values pass straight through.
# Iterating the column directly avoids a row-by-row iterrows() loop.
StateAbbrev = [
    us_state_abbrev.get(state, state) for state in bach['Hometown_State']
]

bach['Hometown_StateAbbrev'] = StateAbbrev
bach.head()

Unnamed: 0,Name,Age,Hometown,Height,Season,Hometown_City,Hometown_State,Hometown_StateAbbrev
0,Alex Michel,32,"Charlottesville, Virginia",,1,Charlottesville,Virginia,VA
1,Aaron Buerge,28,"Butler, Missouri",,2,Butler,Missouri,MO
2,Jesse Palmer,34,"Toronto, Ontario",,5,Toronto,Ontario,Ontario
3,Lorenzo Borghese,34,"Milan, Italy",,9,Milan,Italy,Italy
4,Andy Baldwin,30,"Lancaster, Pennsylvania",,10,Lancaster,Pennsylvania,PA


### Variables We'll Keep:
#### Name
#### Age
#### Height
#### Season
#### Hometown_City
#### Hometown_StateAbbrev

###### Removing the summary Hometown column (we've broken it up) and Hometown_State (because we're using the abbreviated version instead)

In [23]:
# Keep only the lead's modelling columns and rename them with a BACHELOR_
# prefix (Season becomes the shared SEASON key).
final_bach = bach[
    ['Name', 'Age', 'Height', 'Season', 'Hometown_City', 'Hometown_StateAbbrev']
].rename(
    columns={
        'Name': 'BACHELOR',
        'Age': 'BACHELOR_AGE',
        'Height': 'BACHELOR_HEIGHT',
        'Season': 'SEASON',
        'Hometown_City': 'BACHELOR_HOMETOWN_CITY',
        'Hometown_StateAbbrev': 'BACHELOR_HOMETOWN_STATE',
    }
)
final_bach.head()

Unnamed: 0,BACHELOR,BACHELOR_AGE,BACHELOR_HEIGHT,SEASON,BACHELOR_HOMETOWN_CITY,BACHELOR_HOMETOWN_STATE
0,Alex Michel,32,,1,Charlottesville,VA
1,Aaron Buerge,28,,2,Butler,MO
2,Jesse Palmer,34,,5,Toronto,Ontario
3,Lorenzo Borghese,34,,9,Milan,Italy
4,Andy Baldwin,30,,10,Lancaster,PA


### Bachelor Contestants

In [20]:
# Preview the first rows of the contestants table read from bachelor-contestants.csv
bach_cont.head()

Unnamed: 0,Name,Age,Occupation,Hometown,Height,ElimWeek,Season
0,Amanda Marsh,23.0,Event Planner,"Chanute, Kansas",,,1
1,Trista Rehn,29.0,Miami Heat Dancer,"Miami, Florida",,6.0,1
2,Shannon Oliver,24.0,Financial Management Consultant,"Dallas, Texas",,5.0,1
3,Kim,24.0,Nanny,"Tempe, Arizona",,4.0,1
4,Cathy Grimes,22.0,Graduate Student,"Terra Haute, Indiana",,3.0,1


In [22]:
# Tally how often each recorded Height value occurs among the contestants
# (value_counts skips missing values by default, so only known heights appear)
bach_cont['Height'].value_counts()

67.00    6
64.00    5
65.00    3
62.00    3
66.00    3
68.00    2
61.00    2
67.75    1
63.50    1
63.00    1
62.50    1
70.00    1
67.50    1
Name: Height, dtype: int64

##### Merge final_bach and final_bach_cont on SEASON