# Iterative Random Forest Prediction
### Uses Bachelorette data to predict the winner each week

In [13]:
# load in libraries
import pandas as pd
import numpy as np
%matplotlib inline
import scipy as sp
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import sklearn.ensemble as skens
import sklearn.metrics as skmetric
import sklearn.naive_bayes as sknb
import sklearn.tree as sktree
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', color_codes=True, font_scale=1.3)
import sklearn.externals.six as sksix
import IPython.display as ipd
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import os

In [14]:
# Input tables, read relative to the notebook's working directory.
BACHELORETTE_CSV = 'bachelorette_forprediction.csv'
TWITTER_FEATURES_CSV = 'twitterfeatures.csv'

# show-level contestant table, and the table later used as the "sentiment" features
bachelorette = pd.read_csv(BACHELORETTE_CSV)
sentiment = pd.read_csv(TWITTER_FEATURES_CSV)

In [15]:
# Keep only the seasons for which we have matching data (11 and 12).
sentiment = sentiment[sentiment.Season.isin([11, 12])].copy()

# Row counts before the merge, to compare against the merged frame below.
print(len(sentiment))
print(len(bachelorette))

# Merge with bachelorette on contestant name + season. `merge` defaults to an
# inner join, so contestants missing from either table are dropped.
ette_all = bachelorette.merge(
    sentiment,
    left_on=['NAME', 'SEASON'],
    right_on=['Name', 'Season'],
)

print(len(ette_all))

52
207
52


In [22]:
# Sanity check: which seasons are present in the merged frame.
ette_all.SEASON.unique()

array([12, 11], dtype=int64)

In [23]:
# List the merged column names (both tables' columns are present after the merge).
ette_all.columns

Index(['Unnamed: 0', 'SHOW', 'SEASON', 'ETTE_NAME', 'CONTESTANT', 'NAME',
       'AGE', 'ELIMINATION-1', 'ELIMINATION-2', 'ELIMINATION-3',
       ...
       '9.0-Total', '9.0-choose', '9.0-win', 'Age', 'ElimWeek', 'Hometown',
       'Name', 'Occupation', 'Season', 'firstname'],
      dtype='object', length=128)

In [25]:
# Restrict the merged frame to the columns used downstream. The column order
# below is deliberate and matches the explicit list this replaces.

# Contestant / show identifiers.
id_cols = ['SHOW', 'SEASON', 'ETTE_NAME', 'CONTESTANT', 'NAME', 'AGE']

# Weekly show columns: <PREFIX>-1 ... <PREFIX>-10, one full run per prefix.
show_week_cols = [
    prefix + '-' + str(week_no)
    for prefix in ('ELIMINATION', 'DATES', 'FIR', 'ROSE')
    for week_no in range(1, 11)
]

# Weekly '<week>.0-<measure>' columns. Weeks appear in string-sorted order
# (1, 10, 11, 2, ...), which is how they are laid out in the merged frame's listing.
tweet_week_order = (1, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9)
tweet_measures = ('Negative', 'Neutral', 'Positive', 'Total', 'choose', 'win')
tweet_week_cols = [
    str(week_no) + '.0-' + measure
    for week_no in tweet_week_order
    for measure in tweet_measures
]

# Similarity features between contestant and lead, plus the target column.
derived_cols = [
    'AGE_DIFF', 'AGE_DIFF_MEAN_POOL', 'AGE_DIFF_CAT',
    'SAME_CITY', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP',
    'WINNER',
]

ette_all = ette_all[id_cols + show_week_cols + tweet_week_cols + derived_cols].copy()

In [26]:
# Inspect column dtypes of the reduced frame before modelling.
ette_all.dtypes

SHOW                     object
SEASON                    int64
ETTE_NAME                object
CONTESTANT               object
NAME                     object
AGE                       int64
ELIMINATION-1            object
ELIMINATION-2            object
ELIMINATION-3            object
ELIMINATION-4            object
ELIMINATION-5            object
ELIMINATION-6            object
ELIMINATION-7            object
ELIMINATION-8            object
ELIMINATION-9            object
ELIMINATION-10           object
DATES-1                 float64
DATES-2                 float64
DATES-3                 float64
DATES-4                 float64
DATES-5                 float64
DATES-6                 float64
DATES-7                 float64
DATES-8                 float64
DATES-9                 float64
DATES-10                float64
FIR-1                     int64
FIR-2                     int64
FIR-3                     int64
FIR-4                     int64
                         ...   
6.0-Posi

## Prediction System

In [29]:
# Hold out season 12 as the test season; train on every other season.
test_bach = ette_all[ette_all.SEASON == 12].copy()
train_bach = ette_all[ette_all.SEASON != 12].copy()

# Collects one PREDICTED<week> column per week for the test-season contestants.
keep_set = test_bach[['NAME', 'WINNER']].copy()

# Fixed seed so the forest (bootstrap samples, split choices) is identical on
# every run; without it the weekly predictions change from run to run.
RANDOM_SEED = 42

# Weeks 1-9 are modelled. Columns for week 10 exist in the frame but are not
# used by this loop.
N_WEEKS = 9

# ELIMINATION-<week> codes (besides null) that keep a contestant in the pool.
SURVIVED_CODES = ['R', 'R1']


def still_in_contest(frame, week):
    """Return the rows of `frame` that stay in the pool after `week`.

    A row is kept when its ELIMINATION-<week> value is null or is one of
    SURVIVED_CODES ('R' or 'R1').
    """
    status = frame['ELIMINATION-' + str(week)]
    return frame[status.isnull() | status.isin(SURVIVED_CODES)]


# The first two entries (target and name) are carried along but are not
# features; everything from index 2 onward is fed to the model.
# Listed in an earlier comment, presumably as candidate predictors not yet
# used: 'Occupation', 'Agreement_1', 'Hometown'
vars_to_use = ['WINNER', 'NAME', 'AGE', 'AGE_DIFF_CAT', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP'] ### will need  to add other relevant predictor variables

for week in range(1, N_WEEKS + 1):
    # Features are cumulative: each week adds that week's FIR / ROSE / DATES
    # columns on top of everything used in earlier weeks.
    vars_to_use.extend(['FIR-' + str(week), 'ROSE-' + str(week), 'DATES-' + str(week)])
    feature_cols = vars_to_use[2:]

    # create model
    # oob_score=True is kept so rf_model.oob_score_ stays available; with only
    # 10 trees some rows are never out-of-bag, which is what the
    # "Some inputs do not have OOB scores" warning reports.
    rf_model = skens.RandomForestClassifier(
        n_estimators=10,
        oob_score=True,
        criterion='entropy',
        random_state=RANDOM_SEED,
    )

    # train on the contestants still in the pool, using this week's variables;
    # the binary WINNER column is the target
    this_train = train_bach[vars_to_use].copy()

    print(vars_to_use)

    rf_model.fit(this_train[feature_cols], this_train.WINNER)

    # predict for the test-season contestants still in the pool
    # (WINNER is excluded from the features passed to the model)
    this_test = test_bach[vars_to_use].copy()
    predicted_labels = rf_model.predict(this_test[feature_cols])

    # add in predictions; the left merge leaves NaN for contestants who are no
    # longer in the test pool this week
    pred_week = 'PREDICTED' + str(week)
    this_test[pred_week] = predicted_labels
    to_output = this_test[['NAME', 'WINNER', pred_week]].copy()
    keep_set = keep_set.merge(to_output, on=['NAME', 'WINNER'], how='left')

    # remove contestants based on this week's ELIMINATION value, so next
    # week's model only sees those still in the pool
    test_bach = still_in_contest(test_bach, week)
    train_bach = still_in_contest(train_bach, week)


['WINNER', 'NAME', 'AGE', 'AGE_DIFF_CAT', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP', 'FIR-1', 'ROSE-1', 'DATES-1']
['WINNER', 'NAME', 'AGE', 'AGE_DIFF_CAT', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP', 'FIR-1', 'ROSE-1', 'DATES-1', 'FIR-2', 'ROSE-2', 'DATES-2']
['WINNER', 'NAME', 'AGE', 'AGE_DIFF_CAT', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP', 'FIR-1', 'ROSE-1', 'DATES-1', 'FIR-2', 'ROSE-2', 'DATES-2', 'FIR-3', 'ROSE-3', 'DATES-3']
['WINNER', 'NAME', 'AGE', 'AGE_DIFF_CAT', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP', 'FIR-1', 'ROSE-1', 'DATES-1', 'FIR-2', 'ROSE-2', 'DATES-2', 'FIR-3', 'ROSE-3', 'DATES-3', 'FIR-4', 'ROSE-4', 'DATES-4']
['WINNER', 'NAME', 'AGE', 'AGE_DIFF_CAT', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP', 'FIR-1', 'ROSE-1', 'DATES-1', 'FIR-2', 'ROSE-2', 'DATES-2', 'FIR-3', 'ROSE-3', 'DATES-3', 'FIR-4', 'ROSE-4', 'DATES-4', 'FIR-5', 'ROSE-5', 'DATES-5']


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


['WINNER', 'NAME', 'AGE', 'AGE_DIFF_CAT', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP', 'FIR-1', 'ROSE-1', 'DATES-1', 'FIR-2', 'ROSE-2', 'DATES-2', 'FIR-3', 'ROSE-3', 'DATES-3', 'FIR-4', 'ROSE-4', 'DATES-4', 'FIR-5', 'ROSE-5', 'DATES-5', 'FIR-6', 'ROSE-6', 'DATES-6']
['WINNER', 'NAME', 'AGE', 'AGE_DIFF_CAT', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP', 'FIR-1', 'ROSE-1', 'DATES-1', 'FIR-2', 'ROSE-2', 'DATES-2', 'FIR-3', 'ROSE-3', 'DATES-3', 'FIR-4', 'ROSE-4', 'DATES-4', 'FIR-5', 'ROSE-5', 'DATES-5', 'FIR-6', 'ROSE-6', 'DATES-6', 'FIR-7', 'ROSE-7', 'DATES-7']
['WINNER', 'NAME', 'AGE', 'AGE_DIFF_CAT', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATIONGROUP', 'FIR-1', 'ROSE-1', 'DATES-1', 'FIR-2', 'ROSE-2', 'DATES-2', 'FIR-3', 'ROSE-3', 'DATES-3', 'FIR-4', 'ROSE-4', 'DATES-4', 'FIR-5', 'ROSE-5', 'DATES-5', 'FIR-6', 'ROSE-6', 'DATES-6', 'FIR-7', 'ROSE-7', 'DATES-7', 'FIR-8', 'ROSE-8', 'DATES-8']
['WINNER', 'NAME', 'AGE', 'AGE_DIFF_CAT', 'SAME_STATE', 'SAME_REGION', 'SAME_OCCUPATI

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


In [30]:
# Weekly predictions per test-season contestant; a PREDICTED<week> value is
# NaN when the contestant was not in the test pool for that week.
keep_set

Unnamed: 0,NAME,WINNER,PREDICTED1,PREDICTED2,PREDICTED3,PREDICTED4,PREDICTED5,PREDICTED6,PREDICTED7,PREDICTED8,PREDICTED9
0,Jordan Rodgers,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Robby Hayes,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Chase McNary,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Luke Pell,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,James Taylor,0,0,0.0,0.0,0.0,0.0,0.0,1.0,,
5,Alex Woytkiw,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,
6,Derek Peth,0,0,0.0,0.0,0.0,0.0,0.0,,,
7,Wells Adams,0,0,0.0,0.0,0.0,0.0,0.0,,,
8,"Vincent ""Vinny"" Ventiera",0,0,0.0,0.0,0.0,0.0,,,,
9,Grant Kemp,0,0,0.0,0.0,0.0,0.0,,,,


In [39]:
# accuracy  = (true positive + true negative) / total examples
# precision = true positive / (true positive + false positive)
# recall    = true positive / (true positive + false negative)
# true positive  = WINNER == 1 and PREDICTED == 1
# true negative  = WINNER == 0 and PREDICTED == 0
# false positive = WINNER == 0 and PREDICTED == 1
# false negative = WINNER == 1 and PREDICTED == 0
# total examples = count(PREDICTED), i.e. rows with a non-null prediction that week


def safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    A week with no positive predictions has tp + fp == 0; returning NaN
    explicitly marks the metric as undefined instead of relying on a 0/0
    division (which also emits a runtime warning).
    """
    if denominator == 0:
        return np.nan
    return numerator / denominator


# NOTE(review): this rebinds `metrics`, which the import cell binds to
# sklearn.metrics (`from sklearn import metrics`). The name is kept so code
# that reads this dict keeps working; after this cell runs, use the
# `skmetric` alias to reach the sklearn module.
metrics = {}

for week in range(1, 10):
    suffix = str(week)
    predict_use = 'PREDICTED' + suffix
    tp = 'TRUE_POSITIVE_' + suffix
    tn = 'TRUE_NEGATIVE_' + suffix
    fp = 'FALSE_POSITIVE_' + suffix
    fn = 'FALSE_NEGATIVE_' + suffix

    # Rows with a NaN prediction compare False against both 0 and 1, so they
    # fall into none of the four cells.
    actual_win = keep_set.WINNER == 1
    actual_loss = keep_set.WINNER == 0
    said_win = keep_set[predict_use] == 1
    said_loss = keep_set[predict_use] == 0

    # Per-row 0/1 indicator columns are stored on keep_set for inspection.
    keep_set[tp] = np.where(actual_win & said_win, 1, 0)
    keep_set[tn] = np.where(actual_loss & said_loss, 1, 0)
    keep_set[fp] = np.where(actual_loss & said_win, 1, 0)
    keep_set[fn] = np.where(actual_win & said_loss, 1, 0)

    tp_count = keep_set[tp].sum()
    tn_count = keep_set[tn].sum()
    fp_count = keep_set[fp].sum()
    fn_count = keep_set[fn].sum()

    # count() ignores NaN, so only rows predicted this week are counted
    total_examples = keep_set[predict_use].count()

    metrics['ACCURACY' + suffix] = safe_ratio(tp_count + tn_count, total_examples)
    metrics['PRECISION' + suffix] = safe_ratio(tp_count, tp_count + fp_count)
    metrics['RECALL' + suffix] = safe_ratio(tp_count, tp_count + fn_count)

print(metrics)


{'ACCURACY1': 0.9615384615384616, 'PRECISION1': nan, 'RECALL1': 0.0, 'ACCURACY2': 0.95, 'PRECISION2': nan, 'RECALL2': 0.0, 'ACCURACY3': 0.9411764705882353, 'PRECISION3': nan, 'RECALL3': 0.0, 'ACCURACY4': 0.9285714285714286, 'PRECISION4': nan, 'RECALL4': 0.0, 'ACCURACY5': 0.9090909090909091, 'PRECISION5': nan, 'RECALL5': 0.0, 'ACCURACY6': 0.875, 'PRECISION6': nan, 'RECALL6': 0.0, 'ACCURACY7': 0.6666666666666666, 'PRECISION7': 0.0, 'RECALL7': 0.0, 'ACCURACY8': 0.75, 'PRECISION8': nan, 'RECALL8': 0.0, 'ACCURACY9': 0.6666666666666666, 'PRECISION9': nan, 'RECALL9': 0.0}




Unnamed: 0,NAME,WINNER,PREDICTED1,PREDICTED2,PREDICTED3,PREDICTED4,PREDICTED5,PREDICTED6,PREDICTED7,PREDICTED8,...,FALSE_POSITIVE_7,FALSE_NEGATIVE_7,TRUE_POSITIVE_8,TRUE_NEGATIVE_8,FALSE_POSITIVE_8,FALSE_NEGATIVE_8,TRUE_POSITIVE_9,TRUE_NEGATIVE_9,FALSE_POSITIVE_9,FALSE_NEGATIVE_9
0,Jordan Rodgers,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,1,0,0,0,1
1,Robby Hayes,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,1,0,0
2,Chase McNary,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,1,0,0
3,Luke Pell,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
4,James Taylor,0,0,0.0,0.0,0.0,0.0,0.0,1.0,,...,1,0,0,0,0,0,0,0,0,0
5,Alex Woytkiw,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,...,0,0,0,0,0,0,0,0,0,0
6,Derek Peth,0,0,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0,0,0,0,0,0
7,Wells Adams,0,0,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0,0,0,0,0,0
8,"Vincent ""Vinny"" Ventiera",0,0,0.0,0.0,0.0,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
9,Grant Kemp,0,0,0.0,0.0,0.0,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
