In [74]:
import pandas as pd
import numpy as np
import ast

In [2]:
data = pd.read_csv('rr_func_output.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name of the event   37 non-null     object 
 1   Date                37 non-null     object 
 2   Promotion           37 non-null     object 
 3   Type                37 non-null     object 
 4   Location            37 non-null     object 
 5   Arena               37 non-null     object 
 6   Attendance          35 non-null     object 
 7   Broadcast type      37 non-null     object 
 8   Broadcast date      37 non-null     object 
 9   TV station/network  20 non-null     object 
 10  Commentary by       37 non-null     object 
 11  results             37 non-null     object 
 12  winners             37 non-null     object 
 13  notes               37 non-null     object 
 14  Theme               20 non-null     object 
 15  Buyrate (absolute)  6 non-null      float64
 16  Buyrate (r

In [4]:
data.head()

Unnamed: 0,Name of the event,Date,Promotion,Type,Location,Arena,Attendance,Broadcast type,Broadcast date,TV station/network,Commentary by,results,winners,notes,Theme,Buyrate (absolute),Buyrate (relative),TV rating
0,WWE Royal Rumble 2024,27.01.2024,World Wrestling Entertainment,Premium Live Event,"St. Petersburg, Florida, USA",Tropicana Field,46.082,Live,27.01.2024,WWE Network,"Corey Graves, Michael Cole & Pat McAfee",['Bayley defeats Alba Fyre and Asuka and Becky...,"['Bayley', 'Cody Rhodes']",['- Bayley eliminates Indi Hartwell (10:58)\n-...,,,,
1,WWE Royal Rumble 2023,28.01.2023,World Wrestling Entertainment,Premium Live Event,"San Antonio, Texas, USA",Alamodome,47.585,Live,28.01.2023,WWE Network,"Corey Graves, Michael Cole & Pat McAfee",['Cody Rhodes defeats Angelo Dawkins and Austi...,"['Cody Rhodes', 'Rhea Ripley']",['- Sheamus eliminates The Miz (6:58)\n- Drew ...,Sold Out (HARDY),,,
2,WWE Royal Rumble 2022,29.01.2022,World Wrestling Entertainment,Premium Live Event,"St. Louis, Missouri, USA",The Dome at America's Center,39.417,Live,29.01.2022,WWE Network,"Byron Saxton, Corey Graves, Jimmy Smith, Mich...",['Ronda Rousey defeats Alicia Fox and Aliyah a...,"['Ronda Rousey', 'Brock Lesnar']",['- Sasha Banks eliminates Melina (0:54)\n- Sa...,,,,
3,WWE Royal Rumble 2021,31.01.2021,World Wrestling Entertainment,Premium Live Event,"St. Petersburg, Florida, USA",WWE ThunderDome (Tropicana Field),,Live,31.01.2021,WWE Network,"Byron Saxton, Samoa Joe, Tom Phillips, Corey ...",['Bianca Belair defeats Alexa Bliss and Alicia...,"['Bianca Belair', 'Edge']",['- Shayna Baszler eliminates Shotzi Blackhear...,,,,
4,WWE Royal Rumble 2020,26.01.2020,World Wrestling Entertainment,Premium Live Event,"Houston, Texas, USA",Minute Maid Park,42.715,Live,26.01.2020,WWE Network,"Michael Cole, Corey Graves, Tom Phillips, Jer...",['Charlotte Flair defeats Alexa Bliss and Beth...,"['Charlotte Flair', 'Drew McIntyre']",['- Liv Morgan eliminates Lana\n- Lana elimina...,Rumble (Zayde Wolf),,,


In [5]:
# dropping columns that I will not analyse
data.drop(columns = ['Buyrate (absolute)','Buyrate (relative)','TV rating'], inplace=True)

# EDA

## About the event

In [6]:
data['Name of the event'][0]

' WWE Royal Rumble 2024'

In [7]:
data['Name of the event'][len(data)-1]

' WWF Royal Rumble 1988'

In [8]:
data.shape[0]

37

There have been 37 Royal Rumble Annual events starting. 
The first one was in 1988
The latest one was in 2024

## Date

In [9]:
# convert date to datetype
data['Date'] = pd.to_datetime(data['Date'].str.strip(),format ="%d.%m.%Y")

In [10]:
# promotion and type columns not needed 
data.drop(columns = ['Promotion','Type'], inplace=True)

## Location

In [11]:
data['Location'] = data['Location'].str.strip() # remove spaces

In [12]:
data['Location'].value_counts()

San Antonio, Texas, USA            4
Philadelphia, Pennsylvania, USA    3
St. Petersburg, Florida, USA       2
Boston, Massachusetts, USA         2
Fresno, California, USA            2
Miami, Florida, USA                2
Atlanta, Georgia, USA              2
New York City, New York, USA       2
Orlando, Florida, USA              2
Phoenix, Arizona, USA              2
Houston, Texas, USA                2
St. Louis, Missouri, USA           2
Pittsburgh, Pennsylvania, USA      1
Detroit, Michigan, USA             1
New Orleans, Louisiana, USA        1
Anaheim, California, USA           1
San Jose, California, USA          1
Tampa, Florida, USA                1
Providence, Rhode Island, USA      1
Sacramento, California, USA        1
Albany, New York, USA              1
Hamilton, Ontario, Canada          1
Name: Location, dtype: int64

In [13]:
# finding the cities and states that have hosted the event 
location = data['Location'].apply(lambda x:x.split(","))

In [14]:
# Cities
location.apply(lambda x: x[0]).str.strip().value_counts()

San Antonio       4
Philadelphia      3
St. Petersburg    2
Boston            2
Fresno            2
Miami             2
Atlanta           2
New York City     2
Orlando           2
Phoenix           2
Houston           2
St. Louis         2
Pittsburgh        1
Detroit           1
New Orleans       1
Anaheim           1
San Jose          1
Tampa             1
Providence        1
Sacramento        1
Albany            1
Hamilton          1
Name: Location, dtype: int64

In [15]:
# States
location.apply(lambda x: x[1]).str.strip().value_counts()

Florida          7
Texas            6
California       5
Pennsylvania     4
New York         3
Missouri         2
Arizona          2
Massachusetts    2
Georgia          2
Michigan         1
Louisiana        1
Rhode Island     1
Ontario          1
Name: Location, dtype: int64

In [16]:
# Countries
location.apply(lambda x: x[2]).str.strip().value_counts()

USA       36
Canada     1
Name: Location, dtype: int64

In [17]:
# Create a function where if I pass a location (any of city, state, country), return the details of the events

def data_given_location(location_name):
    '''
    Pass a location and get the details of the event that have taken place at the location
    '''
    idx = data['Location'][data['Location'].apply(lambda x:location_name in x) == True].index
    
    return data.iloc[idx]

In [18]:
data_given_location("Georgia")

Unnamed: 0,Name of the event,Date,Location,Arena,Attendance,Broadcast type,Broadcast date,TV station/network,Commentary by,results,winners,notes,Theme
14,WWE Royal Rumble 2010,2010-01-31,"Atlanta, Georgia, USA",Philips Arena,16.697,Live,31.01.2010,PPV Sender,"Jerry Lawler, Matt Striker & Michael Cole",['Edge defeats Batista and Beth Phoenix and Ca...,['Edge'],['- CM Punk eliminates Evan Bourne\n- CM Punk ...,"Hero (Skillet), Martyr No More (Fozzy)"
22,WWF Royal Rumble 2002,2002-01-20,"Atlanta, Georgia, USA",Phillips Arena,12.915,Live,20.01.2002,,Jim Ross & Jerry Lawler,['Triple H defeats Albert and Al Snow and Bill...,['Triple H'],['- Rikishi eliminates The Big Bossman\n- Al S...,Cocky (Kid Rock)


## Arenas

In [19]:
# There are 2 instances of the Philips Arena due to incorrect spelling
# Updating the name to the "State Farm Arena"

In [20]:
# trimming white spaces

In [21]:
data['Arena'] = data['Arena'].str.strip()

In [22]:
data['Arena'][data['Arena']=="Phillips Arena"] = "State Farm Arena"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Arena'][data['Arena']=="Phillips Arena"] = "State Farm Arena"


In [23]:
data['Arena'][data['Arena']=="Philips Arena"] = "State Farm Arena"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Arena'][data['Arena']=="Philips Arena"] = "State Farm Arena"


In [24]:
data['Arena'][22]

'State Farm Arena'

In [25]:
# Arenas who have hosted the event more than once 
data['Arena'].value_counts()[data['Arena'].value_counts()!=1]

Alamodome                3
Madison Square Garden    2
Wells Fargo Center       2
State Farm Arena         2
Name: Arena, dtype: int64

## Attendence

In [26]:
idx1 = data['Attendance'][data['Attendance'].isnull()==True].index
data.loc[idx1]

Unnamed: 0,Name of the event,Date,Location,Arena,Attendance,Broadcast type,Broadcast date,TV station/network,Commentary by,results,winners,notes,Theme
3,WWE Royal Rumble 2021,2021-01-31,"St. Petersburg, Florida, USA",WWE ThunderDome (Tropicana Field),,Live,31.01.2021,WWE Network,"Byron Saxton, Samoa Joe, Tom Phillips, Corey ...",['Bianca Belair defeats Alexa Bliss and Alicia...,"['Bianca Belair', 'Edge']",['- Shayna Baszler eliminates Shotzi Blackhear...,
15,WWE Royal Rumble 2009,2009-01-25,"Detroit, Michigan, USA",Joe Louis Arena,,Live,25.01.2009,PPV Sender,"Jerry Lawler, Jim Ross, Michael Cole, Tazz, M...",['Randy Orton defeats Carlito and Chris Jerich...,['Randy Orton'],['- Vladimir Kozlov eliminates The Great Khali...,Let It Rock (Kevin Rudolf)


In [27]:
# from internet search -> 2021 attendence 0 (covid) 
# from internet search -> 2009 attendence 16,685

In [28]:
data['Attendance'].loc[15] = str(16685)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Attendance'].loc[15] = str(16685)


In [29]:
data['Attendance'].loc[3] = str(0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Attendance'].loc[3] = str(0)


In [30]:
data['Attendance']

0          46.082
1          47.585
2          39.417
3               0
4          42.715
5          48.193
6          17.629
7          52.020
8          15.170
9          17.164
10         15.715
11         15.103
12         18.121
13         15.113
14         16.697
15          16685
16         20.798
17     ca. 16.000
18         16.178
19     ca. 12.000
20         17.289
21         14.712
22         12.915
23         17.137
24         19.231
25         14.816
26         18.542
27         60.525
28      ca. 9.600
29     ca. 10.000
30     ca. 14.500
31     ca. 16.000
32     ca. 17.000
33     ca. 16.000
34     ca. 16.000
35     ca. 19.000
36     ca. 18.000
Name: Attendance, dtype: object

In [31]:
# ca. -> capped at -> meaning max capacity.
# The data is not available for that. Assuming the ca as the actual capacity

In [32]:
# cleaning attendance
data['Attendance'] = data['Attendance'].str.replace('ca.','').str.replace('.','')

  data['Attendance'] = data['Attendance'].str.replace('ca.','').str.replace('.','')
  data['Attendance'] = data['Attendance'].str.replace('ca.','').str.replace('.','')


In [33]:
data['Attendance'] = data['Attendance'].astype('int')

In [34]:
# removing cols brodcast type and date and TV station/network
data.drop(columns = ['Broadcast type','Broadcast date'],inplace = True)

In [35]:
data.drop(columns = ['TV station/network'],inplace = True)

In [36]:
data.head()

Unnamed: 0,Name of the event,Date,Location,Arena,Attendance,Commentary by,results,winners,notes,Theme
0,WWE Royal Rumble 2024,2024-01-27,"St. Petersburg, Florida, USA",Tropicana Field,46082,"Corey Graves, Michael Cole & Pat McAfee",['Bayley defeats Alba Fyre and Asuka and Becky...,"['Bayley', 'Cody Rhodes']",['- Bayley eliminates Indi Hartwell (10:58)\n-...,
1,WWE Royal Rumble 2023,2023-01-28,"San Antonio, Texas, USA",Alamodome,47585,"Corey Graves, Michael Cole & Pat McAfee",['Cody Rhodes defeats Angelo Dawkins and Austi...,"['Cody Rhodes', 'Rhea Ripley']",['- Sheamus eliminates The Miz (6:58)\n- Drew ...,Sold Out (HARDY)
2,WWE Royal Rumble 2022,2022-01-29,"St. Louis, Missouri, USA",The Dome at America's Center,39417,"Byron Saxton, Corey Graves, Jimmy Smith, Mich...",['Ronda Rousey defeats Alicia Fox and Aliyah a...,"['Ronda Rousey', 'Brock Lesnar']",['- Sasha Banks eliminates Melina (0:54)\n- Sa...,
3,WWE Royal Rumble 2021,2021-01-31,"St. Petersburg, Florida, USA",WWE ThunderDome (Tropicana Field),0,"Byron Saxton, Samoa Joe, Tom Phillips, Corey ...",['Bianca Belair defeats Alexa Bliss and Alicia...,"['Bianca Belair', 'Edge']",['- Shayna Baszler eliminates Shotzi Blackhear...,
4,WWE Royal Rumble 2020,2020-01-26,"Houston, Texas, USA",Minute Maid Park,42715,"Michael Cole, Corey Graves, Tom Phillips, Jer...",['Charlotte Flair defeats Alexa Bliss and Beth...,"['Charlotte Flair', 'Drew McIntyre']",['- Liv Morgan eliminates Lana\n- Lana elimina...,Rumble (Zayde Wolf)


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Name of the event  37 non-null     object        
 1   Date               37 non-null     datetime64[ns]
 2   Location           37 non-null     object        
 3   Arena              37 non-null     object        
 4   Attendance         37 non-null     int32         
 5   Commentary by      37 non-null     object        
 6   results            37 non-null     object        
 7   winners            37 non-null     object        
 8   notes              37 non-null     object        
 9   Theme              20 non-null     object        
dtypes: datetime64[ns](1), int32(1), object(8)
memory usage: 2.9+ KB


# Commentary

In [38]:
data['Commentary by'] = data['Commentary by'].str.replace('&',',')

In [39]:
comm_text = ','.join(data['Commentary by'])

In [40]:
comm_ls = comm_text.split(',')

In [41]:
comm_ls2 = [i.strip() for i in comm_ls]

In [42]:
comm_ls3 = set(comm_ls2)

In [43]:
# Total no of commentators over the events' history
len(comm_ls3)

33

In [44]:
# Most events commentated by, top 10
pd.Series(comm_ls2).value_counts()[:10]

Jerry Lawler              24
Michael Cole              22
Jim Ross                  11
Corey Graves               8
John Bradshaw Layfield     7
Tazz                       7
Byron Saxton               6
Vince McMahon              6
Tom Phillips               5
Gorilla Monsoon            4
dtype: int64

# Winners

In [54]:
data.rename(columns = {'results':'Results','winners':'Winners','notes':'Notes'}, inplace=True)

In [77]:
# list got converted to string; using this library to transform it to list
data['Winners'] = data['Winners'].apply(ast.literal_eval)

In [83]:
# In 1994 2 people won the event, editing the data
data['Winners'].loc[30] = ['Bret Hart','Lex Luger']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Winners'].loc[30] = ['Bret Hart','Lex Luger']


In [93]:
# Total number of winners from 1988 to 2024
data['Winners'].apply(lambda x:len(x)).sum()

45

In [97]:
# Since the RR event for women started in 2018; looking at the data for total women winners
len(data['Winners'][data['Date']>="01-01-2018"])

7

In [99]:
# total number of winners who are Men = 45-7 = 38

# Results