In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import metrics as mt

#SET ENVIRO
pd.set_option('display.max_columns', None)

# RuPaul's Drag Race Analysis: Part 2

Below is the continuation of the RuPauls Drag Race Python Analysis. The "Challenge Outcomes" have now been cleaned and are prepared for one-hot encoded and the final creation of the Queen's table which contains their total stats for each type. This final table is what will be used when looking at the majority of my analysis. 

In [2]:
#Import the files 
ru1 = pd.read_csv('ru_theremix.csv')#cleaned base for Stats table 
ru2 = pd.read_csv('rpdr_contestants.csv')#city & age merge into later after groupby

In [3]:
#check columns ru1 columns
ru1.columns

Index(['index', 'season', 'episode', 'episode_num', 'snatch_game_episode',
       'ru_rank', 'missc', 'queen', 'challenge_outcome', 'eliminated',
       'participant', 'minichalw', 'finale', 'penultimate'],
      dtype='object')

In [4]:
#Drop unnecessary cols 
colsdel= ['index','missc','eliminated','participant','finale','penultimate']
ru1= ru1.drop(colsdel, axis=1)
ru1 #check

Unnamed: 0,season,episode,episode_num,snatch_game_episode,ru_rank,queen,challenge_outcome,minichalw
0,S01,1,S01E1,0,1,BeBe Zahara Benet,SAFE,0
1,S01,1,S01E1,0,2,Nina Flowers,WIN,0
2,S01,1,S01E1,0,3,Rebecca Glasscock,LOW,0
3,S01,1,S01E1,0,4,Shannel,SAFE,0
4,S01,1,S01E1,0,5,Ongina,HIGH,0
...,...,...,...,...,...,...,...,...
1134,S12,12,S12E12,0,DISQ,Sherry Pie,SAFE,0
1135,S12,13,S12E13,0,DISQ,Sherry Pie,SAFE,0
1136,S12,14,S12E14,0,1,Jaida Essence Hall,WIN,0
1137,S12,14,S12E14,0,2,Crystal Methyd,FINALE,0


## STATS ENCODE 
Each stat will be split out into an individual column to then be SUM() together for each Queen. The Snatch Game stats have also now been added. These stats are represented by 'SG_STATNAME' and will have a 1 or 0 depending on the Queens performance. There was no Snatch Game performed on Season 1 so these Queens will not have this stat listed for them. 

In [5]:
#Create Stats cols
stats_encoded = pd.get_dummies(ru1['challenge_outcome']) #one hot encode stats 
stats_encoded #check

Unnamed: 0,BTM,ELIM,FINALE,HIGH,LOW,OUT,QUEEN SUPREME,SAFE,SG BTM,SG ELIM,SG HIGH,SG LOW,SG SAFE,SG WIN,WIN
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1134,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1135,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1137,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
#Merge stats back with Queens
ru1= pd.merge(ru1,stats_encoded, left_index=True, right_index=True)
ru1 #check

Unnamed: 0,season,episode,episode_num,snatch_game_episode,ru_rank,queen,challenge_outcome,minichalw,BTM,ELIM,FINALE,HIGH,LOW,OUT,QUEEN SUPREME,SAFE,SG BTM,SG ELIM,SG HIGH,SG LOW,SG SAFE,SG WIN,WIN
0,S01,1,S01E1,0,1,BeBe Zahara Benet,SAFE,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,S01,1,S01E1,0,2,Nina Flowers,WIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,S01,1,S01E1,0,3,Rebecca Glasscock,LOW,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,S01,1,S01E1,0,4,Shannel,SAFE,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,S01,1,S01E1,0,5,Ongina,HIGH,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1134,S12,12,S12E12,0,DISQ,Sherry Pie,SAFE,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1135,S12,13,S12E13,0,DISQ,Sherry Pie,SAFE,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1136,S12,14,S12E14,0,1,Jaida Essence Hall,WIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1137,S12,14,S12E14,0,2,Crystal Methyd,FINALE,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


## GROUP BY QUEENS - SUM STATS 

In [7]:
cols= stats_encoded.columns.to_list()
ru1 = ru1.groupby(['queen','season'])[cols].sum().reset_index() #create total stats for each queen 
ru1

Unnamed: 0,queen,season,BTM,ELIM,FINALE,HIGH,LOW,OUT,QUEEN SUPREME,SAFE,SG BTM,SG ELIM,SG HIGH,SG LOW,SG SAFE,SG WIN,WIN
0,A'Keria C. Davenport,S11,2,0,1,2,0,0,0,5,0,0,0,0,1,0,2
1,Acid Betty,S08,0,0,0,3,0,0,0,1,0,1,0,0,0,0,0
2,Adore Delano,S06,2,0,1,1,2,0,0,1,0,0,1,0,0,0,4
3,Aiden Zhane,S12,0,0,0,2,1,0,0,1,0,1,0,0,0,0,0
4,Aja,S09,1,1,0,0,0,0,0,4,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,Widow Von'Du,S12,1,1,0,1,0,0,0,3,0,0,0,0,1,0,1
153,Willam,S04,1,1,0,0,0,0,0,4,0,0,1,0,0,0,1
154,Yara Sofia,S03,2,0,0,4,1,0,0,3,0,0,0,1,0,0,1
155,Yuhua Hamasaki,S10,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0


In [8]:
#Check df before merge to gain back queen info
ru2

Unnamed: 0,season,ru_rank,queen,age,dob,city,state
0,S01,1,BeBe Zahara Benet,28,1981-03-20,Minneapolis,Minnesota
1,S01,2,Nina Flowers,34,1974-02-22,Bayamon,Puerto Rico
2,S01,3,Rebecca Glasscock,26,1983-05-25,Fort Lauderdale,Florida
3,S01,4,Shannel,26,1979-07-03,Las Vegas,Nevada
4,S01,5,Ongina,26,1982-01-06,Los Angeles,California
...,...,...,...,...,...,...,...
152,S12,8,Brita,34,1985-09-16,New York,New York
153,S12,9,Aiden Zhane,29,1990-07-16,Acworth,Georgia
154,S12,10,Nicky Doll,28,1991-03-14,New York,New York
155,S12,11,Rock M. Sakura,28,1990-10-15,San Francisco,California


In [9]:
#Merge frames
ru= pd.merge(ru2, ru1,left_on=['queen','season'],right_on=['queen','season'])
ru #check

Unnamed: 0,season,ru_rank,queen,age,dob,city,state,BTM,ELIM,FINALE,HIGH,LOW,OUT,QUEEN SUPREME,SAFE,SG BTM,SG ELIM,SG HIGH,SG LOW,SG SAFE,SG WIN,WIN
0,S01,1,BeBe Zahara Benet,28,1981-03-20,Minneapolis,Minnesota,1,0,0,1,0,0,1,2,0,0,0,0,0,0,2
1,S01,2,Nina Flowers,34,1974-02-22,Bayamon,Puerto Rico,0,0,1,3,1,0,0,1,0,0,0,0,0,0,1
2,S01,3,Rebecca Glasscock,26,1983-05-25,Fort Lauderdale,Florida,2,0,1,1,1,0,0,1,0,0,0,0,0,0,1
3,S01,4,Shannel,26,1979-07-03,Las Vegas,Nevada,1,1,0,1,1,0,0,2,0,0,0,0,0,0,0
4,S01,5,Ongina,26,1982-01-06,Los Angeles,California,0,1,0,2,0,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,S12,8,Brita,34,1985-09-16,New York,New York,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0
153,S12,9,Aiden Zhane,29,1990-07-16,Acworth,Georgia,0,0,0,2,1,0,0,1,0,1,0,0,0,0,0
154,S12,10,Nicky Doll,28,1991-03-14,New York,New York,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0
155,S12,11,Rock M. Sakura,28,1990-10-15,San Francisco,California,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0


### Due to the nature of the show, there are a few queens who have appeared on multiple seasons of the show.

In [10]:
ru['queen'].value_counts() #check queens 

Cynthia Lee Fontaine    2
Eureka O'Hara           2
Shangela                2
Vanessa Vanjie Mateo    2
Alexis Michelle         1
                       ..
Ivy Winters             1
Jade Jolie              1
Lineysha Sparx          1
Vivienne Pinay          1
Dahlia Sin              1
Name: queen, Length: 153, dtype: int64

In [11]:
ru.loc[ru['queen']=='Cynthia Lee Fontaine'] #Competed on two different seasons 

Unnamed: 0,season,ru_rank,queen,age,dob,city,state,BTM,ELIM,FINALE,HIGH,LOW,OUT,QUEEN SUPREME,SAFE,SG BTM,SG ELIM,SG HIGH,SG LOW,SG SAFE,SG WIN,WIN
98,S08,10,Cynthia Lee Fontaine,34,1981-02-16,Austin,Texas,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0
110,S09,10,Cynthia Lee Fontaine,34,1981-02-16,Austin,Texas,1,0,0,0,0,0,0,3,0,1,0,0,0,0,0


In [12]:
ru.columns #check

Index(['season', 'ru_rank', 'queen', 'age', 'dob', 'city', 'state', 'BTM',
       'ELIM', 'FINALE', 'HIGH', 'LOW', 'OUT', 'QUEEN SUPREME', 'SAFE',
       'SG BTM', 'SG ELIM', 'SG HIGH', 'SG LOW', 'SG SAFE', 'SG WIN', 'WIN'],
      dtype='object')

In [13]:
#Drop unnesscary cols
ru.drop(['dob','OUT'], axis=1, inplace=True)
ru

Unnamed: 0,season,ru_rank,queen,age,city,state,BTM,ELIM,FINALE,HIGH,LOW,QUEEN SUPREME,SAFE,SG BTM,SG ELIM,SG HIGH,SG LOW,SG SAFE,SG WIN,WIN
0,S01,1,BeBe Zahara Benet,28,Minneapolis,Minnesota,1,0,0,1,0,1,2,0,0,0,0,0,0,2
1,S01,2,Nina Flowers,34,Bayamon,Puerto Rico,0,0,1,3,1,0,1,0,0,0,0,0,0,1
2,S01,3,Rebecca Glasscock,26,Fort Lauderdale,Florida,2,0,1,1,1,0,1,0,0,0,0,0,0,1
3,S01,4,Shannel,26,Las Vegas,Nevada,1,1,0,1,1,0,2,0,0,0,0,0,0,0
4,S01,5,Ongina,26,Los Angeles,California,0,1,0,2,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,S12,8,Brita,34,New York,New York,1,1,0,1,1,0,1,1,0,0,0,0,0,0
153,S12,9,Aiden Zhane,29,Acworth,Georgia,0,0,0,2,1,0,1,0,1,0,0,0,0,0
154,S12,10,Nicky Doll,28,New York,New York,1,1,0,1,1,0,0,0,0,0,0,0,0,0
155,S12,11,Rock M. Sakura,28,San Francisco,California,0,1,0,0,1,0,1,0,0,0,0,0,0,0


## RENAME & REORDER COLUMNS 

In [14]:
ru.columns= ru.columns.str.lower() #change to lower case
ru.head()

Unnamed: 0,season,ru_rank,queen,age,city,state,btm,elim,finale,high,low,queen supreme,safe,sg btm,sg elim,sg high,sg low,sg safe,sg win,win
0,S01,1,BeBe Zahara Benet,28,Minneapolis,Minnesota,1,0,0,1,0,1,2,0,0,0,0,0,0,2
1,S01,2,Nina Flowers,34,Bayamon,Puerto Rico,0,0,1,3,1,0,1,0,0,0,0,0,0,1
2,S01,3,Rebecca Glasscock,26,Fort Lauderdale,Florida,2,0,1,1,1,0,1,0,0,0,0,0,0,1
3,S01,4,Shannel,26,Las Vegas,Nevada,1,1,0,1,1,0,2,0,0,0,0,0,0,0
4,S01,5,Ongina,26,Los Angeles,California,0,1,0,2,0,0,0,0,0,0,0,0,0,2


In [15]:
ru.columns

Index(['season', 'ru_rank', 'queen', 'age', 'city', 'state', 'btm', 'elim',
       'finale', 'high', 'low', 'queen supreme', 'safe', 'sg btm', 'sg elim',
       'sg high', 'sg low', 'sg safe', 'sg win', 'win'],
      dtype='object')

In [16]:
rucols= {'season':'season','ru_rank':'season_rank',
        'queen':'queen_name','age':'age',
        'city':'city', 'state':'state',
        'btm':'btm', 'elim':'elim',
        'finale':'finale', 'high':'high',
        'low':'low', 'queen supreme':'queen_supreme',
        'safe':'safe','sg btm':'sg_btm',
        'sg elim':'sg_elim','sg high':'sg_high',
        'sg low':'sg_low','sg safe':'sg_safe',
        'sg win':'sg_win','win':'chall_wins'}

In [17]:
ru.rename(columns=rucols,inplace=True)
ru.head()

Unnamed: 0,season,season_rank,queen_name,age,city,state,btm,elim,finale,high,low,queen_supreme,safe,sg_btm,sg_elim,sg_high,sg_low,sg_safe,sg_win,chall_wins
0,S01,1,BeBe Zahara Benet,28,Minneapolis,Minnesota,1,0,0,1,0,1,2,0,0,0,0,0,0,2
1,S01,2,Nina Flowers,34,Bayamon,Puerto Rico,0,0,1,3,1,0,1,0,0,0,0,0,0,1
2,S01,3,Rebecca Glasscock,26,Fort Lauderdale,Florida,2,0,1,1,1,0,1,0,0,0,0,0,0,1
3,S01,4,Shannel,26,Las Vegas,Nevada,1,1,0,1,1,0,2,0,0,0,0,0,0,0
4,S01,5,Ongina,26,Los Angeles,California,0,1,0,2,0,0,0,0,0,0,0,0,0,2


In [18]:
ru=ru[['season','season_rank','queen_name','age','city','state','queen_supreme','finale','chall_wins',
       'high','safe','low','btm','elim','sg_win','sg_high','sg_safe','sg_low','sg_btm','sg_elim']]
ru.head()

Unnamed: 0,season,season_rank,queen_name,age,city,state,queen_supreme,finale,chall_wins,high,safe,low,btm,elim,sg_win,sg_high,sg_safe,sg_low,sg_btm,sg_elim
0,S01,1,BeBe Zahara Benet,28,Minneapolis,Minnesota,1,0,2,1,2,0,1,0,0,0,0,0,0,0
1,S01,2,Nina Flowers,34,Bayamon,Puerto Rico,0,1,1,3,1,1,0,0,0,0,0,0,0,0
2,S01,3,Rebecca Glasscock,26,Fort Lauderdale,Florida,0,1,1,1,1,1,2,0,0,0,0,0,0,0
3,S01,4,Shannel,26,Las Vegas,Nevada,0,0,0,1,2,1,1,1,0,0,0,0,0,0
4,S01,5,Ongina,26,Los Angeles,California,0,0,2,2,0,0,0,1,0,0,0,0,0,0


In [19]:
ru.dtypes

season           object
season_rank      object
queen_name       object
age               int64
city             object
state            object
queen_supreme     uint8
finale            uint8
chall_wins        uint8
high              uint8
safe              uint8
low               uint8
btm               uint8
elim              uint8
sg_win            uint8
sg_high           uint8
sg_safe           uint8
sg_low            uint8
sg_btm            uint8
sg_elim           uint8
dtype: object

I noticed some of the Queen Supremes where still missing so I have cleaned them up again. 

In [20]:
winners= ru.loc[ru['season_rank']=='1']
ru.loc[ru['season_rank']=='1','queen_supreme']=1
winners

Unnamed: 0,season,season_rank,queen_name,age,city,state,queen_supreme,finale,chall_wins,high,safe,low,btm,elim,sg_win,sg_high,sg_safe,sg_low,sg_btm,sg_elim
0,S01,1,BeBe Zahara Benet,28,Minneapolis,Minnesota,1,0,2,1,2,0,1,0,0,0,0,0,0,0
9,S02,1,Tyra Sanchez,21,Orlando,Florida,1,0,3,2,2,1,0,0,0,0,1,0,0,0
21,S03,1,Raja,36,Los Angeles,California,1,0,3,3,3,1,1,0,0,1,0,0,0,0
34,S04,1,Sharon Needles,29,Pittsburgh,Pennsylvania,1,0,4,1,3,1,1,0,0,1,0,0,0,0
47,S05,1,Jinkx Monsoon,24,Seattle,Washington,1,0,1,7,1,0,1,0,1,0,0,0,0,0
61,S06,1,Bianca Del Rio,36,New Orleans,Louisiana,1,0,4,3,3,0,0,0,0,1,0,0,0,0
75,S07,1,Violet Chachki,22,Atlanta,Georgia,1,0,4,2,3,2,0,0,0,0,1,0,0,0
89,S08,1,Bob the Drag Queen,29,Brooklyn,New York,1,0,3,0,4,0,1,0,1,0,0,0,0,0
101,S09,1,Sasha Velour,29,Brooklyn,New York,0,0,3,4,3,1,0,0,0,1,0,0,0,0
115,S10,1,Aquaria,21,Brooklyn,New York,0,0,3,1,5,2,0,0,1,0,0,0,0,0


Since Queen Supreme was missing I decided to double check the Finale Queens as well. Looked good. 

In [21]:
ru.loc[(ru['season_rank']=='2') | (ru['season_rank']=='3')|(ru['season_rank']=='4')]

Unnamed: 0,season,season_rank,queen_name,age,city,state,queen_supreme,finale,chall_wins,high,safe,low,btm,elim,sg_win,sg_high,sg_safe,sg_low,sg_btm,sg_elim
1,S01,2,Nina Flowers,34,Bayamon,Puerto Rico,0,1,1,3,1,1,0,0,0,0,0,0,0,0
2,S01,3,Rebecca Glasscock,26,Fort Lauderdale,Florida,0,1,1,1,1,1,2,0,0,0,0,0,0,0
3,S01,4,Shannel,26,Las Vegas,Nevada,0,0,0,1,2,1,1,1,0,0,0,0,0,0
10,S02,2,Raven,30,Riverside,California,0,1,2,3,1,0,2,0,0,0,1,0,0,0
11,S02,3,Jujubee,25,Boston,Massachusetts,0,1,0,2,3,0,3,0,0,0,0,1,0,0
12,S02,4,Tatianna,21,Falls Church,Virginia,0,0,0,1,4,1,1,1,1,0,0,0,0,0
22,S03,2,Manila Luzon,28,Cottage Grove,Minnesota,0,1,3,4,2,1,1,0,0,0,1,0,0,0
23,S03,3,Alexis Mateo,30,Saint Petersburg,Florida,0,1,3,1,2,2,3,0,0,1,0,0,0,0
24,S03,4,Yara Sofia,26,Manati,Puerto Rico,0,0,1,4,3,1,2,0,0,0,0,1,0,0
35,S04,2,Chad Michaels,40,San Diego,California,0,1,1,3,4,1,1,0,1,0,0,0,0,0


# DFL SCORE
The DFL has a unique system for awarded Queens points based on their performances on the show. For this particular project the dataset was missing idenitifiers for 'Lip Sync for Your Life Winners' as well as 'Mini Challenge Winners'. Traditionally, these are included in a Queens DFL Score but for this project we have decided to work with the stats given in the data. In the future, I will look into how to include these metrics in the analysis. Below is a breakdown of how the DFL Score is calculated. 

## DFL Regular Points
- ru['queen_supreme'] = 10 pts
- ru['chall_wins'] = 8 pts
- ru['high'] = 5 pts
- ru['safe'] = 3 pts
- ru['low'] = 2 pts
- ru['btm'] = 1 pts
- ru['elim']= 0 pts

## Snatch Game Points

- ru['sg_win'] = 8 pts
- ru['sg_high'] = 5 pts
- ru['sg_safe'] = 3 pts
- ru['sg_low'] = 2 pts
- ru['sg_btm'] = 1 pts
- ru['sg_elim'] = 0 pts

In [22]:
#Calculate DFL Score for each queen based on total stats 
regstatscore=(ru['queen_supreme']*10)+(ru['chall_wins']*8)+(ru['high']*5)+(ru['safe']*3)+(ru['low']*2)+(ru['btm']*1)
snatchscore=(ru['sg_win']*8)+(ru['sg_high']*5)+(ru['sg_safe']*3)+(ru['sg_low']*2)+(ru['sg_btm']*1)

In [23]:
ru['dfl_score']=regstatscore+snatchscore #create DFL Score col
ru #check

Unnamed: 0,season,season_rank,queen_name,age,city,state,queen_supreme,finale,chall_wins,high,safe,low,btm,elim,sg_win,sg_high,sg_safe,sg_low,sg_btm,sg_elim,dfl_score
0,S01,1,BeBe Zahara Benet,28,Minneapolis,Minnesota,1,0,2,1,2,0,1,0,0,0,0,0,0,0,38
1,S01,2,Nina Flowers,34,Bayamon,Puerto Rico,0,1,1,3,1,1,0,0,0,0,0,0,0,0,28
2,S01,3,Rebecca Glasscock,26,Fort Lauderdale,Florida,0,1,1,1,1,1,2,0,0,0,0,0,0,0,20
3,S01,4,Shannel,26,Las Vegas,Nevada,0,0,0,1,2,1,1,1,0,0,0,0,0,0,14
4,S01,5,Ongina,26,Los Angeles,California,0,0,2,2,0,0,0,1,0,0,0,0,0,0,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,S12,8,Brita,34,New York,New York,0,0,0,1,1,1,1,1,0,0,0,0,1,0,12
153,S12,9,Aiden Zhane,29,Acworth,Georgia,0,0,0,2,1,1,0,0,0,0,0,0,0,1,15
154,S12,10,Nicky Doll,28,New York,New York,0,0,0,1,0,1,1,1,0,0,0,0,0,0,8
155,S12,11,Rock M. Sakura,28,San Francisco,California,0,0,0,0,1,1,0,1,0,0,0,0,0,0,5


## DFL RANK

The DFL score is then used to assign each queen a DFL Rank per season. I am interested to see how the DFL Rank and the Season Rank compare to each other and will be exploring it further in the future of this project. Queens who have a tied DFL Rank will receive a '0.5' rank. For example, Alaska and Roxxxy Andrews both tied for 2nd in the DFL Rank so they are both assigned as '2.5'.In the future, I would like to look at using the Season Rank to possibly break these ties and get a more accurate DFL Rank. 

In [24]:
ru['dfl_rank']= ru.groupby('season')['dfl_score'].rank(ascending=False)
ru

Unnamed: 0,season,season_rank,queen_name,age,city,state,queen_supreme,finale,chall_wins,high,safe,low,btm,elim,sg_win,sg_high,sg_safe,sg_low,sg_btm,sg_elim,dfl_score,dfl_rank
0,S01,1,BeBe Zahara Benet,28,Minneapolis,Minnesota,1,0,2,1,2,0,1,0,0,0,0,0,0,0,38,1.0
1,S01,2,Nina Flowers,34,Bayamon,Puerto Rico,0,1,1,3,1,1,0,0,0,0,0,0,0,0,28,2.0
2,S01,3,Rebecca Glasscock,26,Fort Lauderdale,Florida,0,1,1,1,1,1,2,0,0,0,0,0,0,0,20,4.0
3,S01,4,Shannel,26,Las Vegas,Nevada,0,0,0,1,2,1,1,1,0,0,0,0,0,0,14,5.0
4,S01,5,Ongina,26,Los Angeles,California,0,0,2,2,0,0,0,1,0,0,0,0,0,0,26,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,S12,8,Brita,34,New York,New York,0,0,0,1,1,1,1,1,0,0,0,0,1,0,12,10.0
153,S12,9,Aiden Zhane,29,Acworth,Georgia,0,0,0,2,1,1,0,0,0,0,0,0,0,1,15,9.0
154,S12,10,Nicky Doll,28,New York,New York,0,0,0,1,0,1,1,1,0,0,0,0,0,0,8,11.0
155,S12,11,Rock M. Sakura,28,San Francisco,California,0,0,0,0,1,1,0,1,0,0,0,0,0,0,5,12.0


In [25]:
ru.value_counts('dfl_rank')

dfl_rank
1.0     12
2.0     11
6.0     11
7.0     11
12.0    10
3.0     10
11.0    10
4.0     10
5.0     10
10.0     9
9.0      9
8.0      9
13.0     7
14.0     5
8.5      4
7.5      2
5.5      2
9.5      2
4.5      2
10.5     2
3.5      2
2.5      2
12.5     2
13.5     2
15.0     1
dtype: int64

### TIED DFL RANK EXAMPLE BELOW

In [26]:
ru.loc[ru['dfl_rank']==2.5]

Unnamed: 0,season,season_rank,queen_name,age,city,state,queen_supreme,finale,chall_wins,high,safe,low,btm,elim,sg_win,sg_high,sg_safe,sg_low,sg_btm,sg_elim,dfl_score,dfl_rank
48,S05,2,Alaska,27,Pittsburgh,Pennsylvania,0,1,2,3,3,2,0,0,0,1,0,0,0,0,49,2.5
49,S05,2,Roxxxy Andrews,28,Orlando,Florida,0,1,2,4,1,2,1,0,0,1,0,0,0,0,49,2.5


# EXPORT TO EXCEL FOR TABLEAU

In [27]:
ru.to_csv('ru_final2_wrank.csv')

# LETS PLAY THE SNATCH GAME!

Here I will be exploring modelling to help determine if a WIN in Snatch Game will predict if a Queen will make it into the Finale of the show. 
- does a snatchgame win: [ru['sg_win']==1] determine if the queen will make it to the finale [ru['finale']==1]


## SNATCH GAME EXPLORING

In [28]:
#Finding all the Winners of Snatch Game across all Seasons. 
#REMINDER: S1 did not have a Snatch Game so S1 Queens will not be included 
sgwinners= ru.loc[ru['sg_win']==1]
sgwinners

Unnamed: 0,season,season_rank,queen_name,age,city,state,queen_supreme,finale,chall_wins,high,safe,low,btm,elim,sg_win,sg_high,sg_safe,sg_low,sg_btm,sg_elim,dfl_score,dfl_rank
12,S02,4,Tatianna,21,Falls Church,Virginia,0,0,0,1,4,1,1,1,1,0,0,0,0,0,28,3.5
28,S03,8,Stacy Layne Matthews,25,Back Swamp,North Carolina,0,0,0,1,2,0,1,1,1,0,0,0,0,0,20,7.0
35,S04,2,Chad Michaels,40,San Diego,California,0,1,1,3,4,1,1,0,1,0,0,0,0,0,46,2.0
47,S05,1,Jinkx Monsoon,24,Seattle,Washington,1,0,1,7,1,0,1,0,1,0,0,0,0,0,65,1.0
65,S06,5,BenDeLaCreme,31,Seattle,Washington,0,0,1,4,1,1,2,0,1,0,0,0,0,0,43,4.0
76,S07,2,Ginger Minj,29,Orlando,Florida,0,1,3,2,4,0,2,0,1,0,0,0,0,0,56,2.0
78,S07,4,Kennedy Davenport,33,Dallas,Texas,0,0,1,3,4,0,2,1,1,0,0,0,0,0,45,5.0
89,S08,1,Bob the Drag Queen,29,Brooklyn,New York,1,0,3,0,4,0,1,0,1,0,0,0,0,0,55,1.0
105,S09,5,Alexis Michelle,33,New York,New York,0,0,0,1,5,2,1,1,1,0,0,0,0,0,33,5.5
115,S10,1,Aquaria,21,Brooklyn,New York,1,0,3,1,5,2,0,0,1,0,0,0,0,0,66,1.0


In [29]:
#Comparing the Snatch Game Winners to the Queens who have won the most challenges overall 
ru.sort_values(['chall_wins'], ascending=False).head(12) #12 becuase there are 12 winners of snatch game 

Unnamed: 0,season,season_rank,queen_name,age,city,state,queen_supreme,finale,chall_wins,high,safe,low,btm,elim,sg_win,sg_high,sg_safe,sg_low,sg_btm,sg_elim,dfl_score,dfl_rank
62,S06,2,Adore Delano,23,Azusa,California,0,1,4,1,1,2,2,0,0,1,0,0,0,0,51,2.0
146,S12,1,Jaida Essence Hall,32,Milwaukee,Wisconsin,1,0,4,2,4,0,1,0,0,0,1,0,0,0,68,1.0
103,S09,3,Shea Couleé,27,Chicago,Illinois,0,1,4,3,2,0,1,0,0,0,1,0,0,0,57,2.0
34,S04,1,Sharon Needles,29,Pittsburgh,Pennsylvania,1,0,4,1,3,1,1,0,0,1,0,0,0,0,64,1.0
145,S12,2,Gigi Goode,21,Los Angeles,California,0,1,4,1,3,2,0,0,1,0,0,0,0,0,58,2.0
61,S06,1,Bianca Del Rio,36,New Orleans,Louisiana,1,0,4,3,3,0,0,0,0,1,0,0,0,0,71,1.0
75,S07,1,Violet Chachki,22,Atlanta,Georgia,1,0,4,2,3,2,0,0,0,0,1,0,0,0,68,1.0
147,S12,DISQ,Sherry Pie,27,New York,New York,0,0,3,1,6,1,0,0,0,1,0,0,0,0,54,3.0
130,S11,2,Brooke Lynn Hytes,32,Toronto,Ontario,0,1,3,4,2,1,1,0,0,0,0,0,1,0,54,2.0
104,S09,3,Trinity Taylor,31,Orlando,Florida,0,1,3,2,2,2,1,0,0,0,1,0,0,0,48,3.0


### Snatch Game Table 
- All the queens except STACY LANE MATTHEWS (S03) scored in the Top 5 in both their Season Rank as well as their DFL Rank. 
- I noticed a few Snatch Game Winners also had alot of ['chall_wins'] so I was curious to see the Top 12 Queens who had won the most challenges during their season. 
- The Queen who scored the most amount of ['chall_wins'] ending their seasons ranking in the Top 3 Queens for both the season and the DFL ranks. 

In [30]:
ru.dtypes

season            object
season_rank       object
queen_name        object
age                int64
city              object
state             object
queen_supreme      uint8
finale             uint8
chall_wins         uint8
high               uint8
safe               uint8
low                uint8
btm                uint8
elim               uint8
sg_win             uint8
sg_high            uint8
sg_safe            uint8
sg_low             uint8
sg_btm             uint8
sg_elim            uint8
dfl_score          uint8
dfl_rank         float64
dtype: object

CONVERT SEASON_RANK FROM OBJECT TO INT 

In [31]:
ru['season_rank'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', 'DISQ'], dtype=object)

In [32]:
ru.loc[ru['season_rank']=='DISQ','season_rank']='0' #this is due to Sherry Pies DISQ in her Season Rank 

In [33]:
ru['season_rank'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '0'], dtype=object)

In [34]:
ru['season_rank'] = ru['season_rank'].astype(str).astype(int) #Convert here

In [35]:
ru.dtypes

season            object
season_rank        int64
queen_name        object
age                int64
city              object
state             object
queen_supreme      uint8
finale             uint8
chall_wins         uint8
high               uint8
safe               uint8
low                uint8
btm                uint8
elim               uint8
sg_win             uint8
sg_high            uint8
sg_safe            uint8
sg_low             uint8
sg_btm             uint8
sg_elim            uint8
dfl_score          uint8
dfl_rank         float64
dtype: object

# ROUND 1 - REGRESSION
To begin my modelling I chose to use the regression model and systematically removed unnecessary columns that have the lowest statistical significance before re-running the model each time. 

In [36]:
ru.columns

Index(['season', 'season_rank', 'queen_name', 'age', 'city', 'state',
       'queen_supreme', 'finale', 'chall_wins', 'high', 'safe', 'low', 'btm',
       'elim', 'sg_win', 'sg_high', 'sg_safe', 'sg_low', 'sg_btm', 'sg_elim',
       'dfl_score', 'dfl_rank'],
      dtype='object')

In [37]:
regressData = ru[['season_rank','age','queen_supreme','finale','chall_wins','high','low','btm',
                  'sg_win','sg_high','sg_safe','sg_low','sg_btm']].copy()

In [38]:
d_var=regressData['finale']
i_var=regressData.drop(columns='finale')
i_var['constant'] = np.ones(i_var.shape[0])

In [39]:
lin_reg = sm.OLS(d_var,i_var) #OLS 
reg_result = lin_reg.fit() #store the result instead a variable 
print(reg_result.summary())

                            OLS Regression Results                            
Dep. Variable:                 finale   R-squared:                       0.370
Model:                            OLS   Adj. R-squared:                  0.327
Method:                 Least Squares   F-statistic:                     8.574
Date:                Mon, 25 Apr 2022   Prob (F-statistic):           6.28e-11
Time:                        19:49:32   Log-Likelihood:                -33.500
No. Observations:                 157   AIC:                             89.00
Df Residuals:                     146   BIC:                             122.6
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
age            0.0015      0.005      0.316      0.7

### RE-RUN MODEL - REMOVE AGE

In [40]:
i_var2 = i_var.drop(columns=['age'])

In [41]:
lin_reg2 = sm.OLS(d_var,i_var2) #OLS 
reg_result2 = lin_reg2.fit() #store the result instead a variable 
print(reg_result2.summary())

                            OLS Regression Results                            
Dep. Variable:                 finale   R-squared:                       0.605
Model:                            OLS   Adj. R-squared:                  0.575
Method:                 Least Squares   F-statistic:                     20.20
Date:                Mon, 25 Apr 2022   Prob (F-statistic):           3.21e-24
Time:                        17:29:44   Log-Likelihood:                 3.1679
No. Observations:                 157   AIC:                             17.66
Df Residuals:                     145   BIC:                             54.34
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
season_rank      -0.0336      0.010     -3.364

### RE-RUN MODEL - REMOVE LOW

In [42]:
i_var3 = i_var2.drop(columns=['low'])

lin_reg3 = sm.OLS(d_var,i_var3) #OLS 
reg_result3 = lin_reg3.fit() #store the result instead a variable 
print(reg_result3.summary())

                            OLS Regression Results                            
Dep. Variable:                 finale   R-squared:                       0.605
Model:                            OLS   Adj. R-squared:                  0.578
Method:                 Least Squares   F-statistic:                     22.35
Date:                Mon, 25 Apr 2022   Prob (F-statistic):           6.79e-25
Time:                        17:29:45   Log-Likelihood:                 3.1327
No. Observations:                 157   AIC:                             15.73
Df Residuals:                     146   BIC:                             49.35
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
season_rank      -0.0345      0.009     -3.685

# CONCLUSION

Overall, through this modelling we can see that a Snatch Game Win ['sg_win'] is negatively correlated with a Queen making it to the Finale. However, I would need several more seasons of RPDR to determine if a Snatch Game actually negatively effects a Queens chances of making it to the end of the show. Additionaly, ['chall_win'] has the highest correlation to being statistically significant to a Queen reaching the Finale of the show. This disproves my hypothesis as I felt like Snatch Game would have a high impact on determining which Queens make it to the end.


