# **Roller Coaster Database**
<div style="text-align: center;">
  <img src="../assets/flipflap.jpg" width="30%" />
</div>

In [1]:
# Import and setup libraries
import pandas as pd
import numpy as np

# default options for pandas rows and cols
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# plotting style
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
df = pd.read_csv("data/coaster_db.csv")

## **EDA**

### 1. Estructira general del dataset

In [5]:
# ¿Cuál es la forma (shape) del dataset?
df.shape

(1087, 56)

In [7]:
# ¿Cuáles son las primeras 5 filas del dataset?
df.head(5)

Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,Height,Inversions,Lift/launch system,Cost,Trains,Park section,Duration,Capacity,G-force,Designer,Max vertical angle,Drop,Soft opening date,Fast Lane available,Replaced,Track layout,Fastrack available,Soft opening date.1,Closing date,Opened,Replaced by,Website,Flash Pass Available,Must transfer from wheelchair,Theme,Single rider line available,Restraint Style,Flash Pass available,Acceleration,Restraints,Name,year_introduced,latitude,longitude,Type_Main,opening_date_clean,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
0,Switchback Railway,600 ft (180 m),6 mph (9.7 km/h),Coney Island,Removed,"June 16, 1884",Wood,LaMarcus Adna Thompson,,Lift Packed,50 ft (15 m),,gravity,,,Coney Island Cyclone Site,1:00,1600 riders per hour,2.9,LaMarcus Adna Thompson,30°,43 ft (13 m),,,,Gravity pulled coaster,,,,,,,,,,,,,,,,1884,40.574,-73.978,Wood,1884-06-16,6 mph,9.7 km/h,6.0,mph,6.0,50.0,ft,,0,2.9
1,Flip Flap Railway,,,Sea Lion Park,Removed,1895,Wood,Lina Beecher,,,,1.0,,,a single car. Riders are arranged 1 across in ...,,,,12.0,Lina Beecher,,,,,,,,,1902.0,,,,,,,,,,,,,1895,40.578,-73.979,Wood,1895-01-01,,,,,,,,,1,12.0
2,Switchback Railway (Euclid Beach Park),,,"Cleveland, Ohio, United States",Closed,,Other,,,,,,,,,,,,,,,,,,,,,,,1895.0,,,,,,,,,,,,1896,41.58,-81.57,Other,,,,,,,,,,0,
3,Loop the Loop (Coney Island),,,Other,Removed,1901,Steel,Edwin Prescott,,,,1.0,,,a single car. Riders are arranged 2 across in ...,,,,,Edward A. Green,,,,,Switchback Railway,,,,1910.0,,Giant Racer,,,,,,,,,,,1901,40.5745,-73.978,Steel,1901-01-01,,,,,,,,,1,
4,Loop the Loop (Young's Pier),,,Other,Removed,1901,Steel,Edwin Prescott,,,,1.0,,,,,,,,Edward A. Green,,,,,,,,,1912.0,,,,,,,,,,,,,1901,39.3538,-74.4342,Steel,1901-01-01,,,,,,,,,1,


In [11]:
# ¿Cuáles son los nombres de las columnas?
df.columns  # equiv: df.axes[1]

Index(['coaster_name', 'Length', 'Speed', 'Location', 'Status', 'Opening date',
       'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height',
       'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
       'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
       'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
       'Track layout', 'Fastrack available', 'Soft opening date.1',
       'Closing date', 'Opened', 'Replaced by', 'Website',
       'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
       'Single rider line available', 'Restraint Style',
       'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
       'year_introduced', 'latitude', 'longitude', 'Type_Main',
       'opening_date_clean', 'speed1', 'speed2', 'speed1_value', 'speed1_unit',
       'speed_mph', 'height_value', 'height_unit', 'height_ft',
       'Inversions_clean', 'Gforce_clean'],
      dtype='object')

In [18]:
# ¿Cuáles son los tipos de datos de cada columna?
# type(df.dtypes)  # pd.Series
df.dtypes

coaster_name                      object
Length                            object
Speed                             object
Location                          object
Status                            object
Opening date                      object
Type                              object
Manufacturer                      object
Height restriction                object
Model                             object
Height                            object
Inversions                       float64
Lift/launch system                object
Cost                              object
Trains                            object
Park section                      object
Duration                          object
Capacity                          object
G-force                           object
Designer                          object
Max vertical angle                object
Drop                              object
Soft opening date                 object
Fast Lane available               object
Replaced        

In [21]:
df.dtypes.value_counts()

object     46
float64     8
int64       2
Name: count, dtype: int64

In [36]:
# ¿Qué columnas contienen valores nulos y cuántos?
print('total cols with null values: ', df.isnull().any().sum())
print()
print('Columns with null values: \n', df.isnull().any().axes[0])
print()
print('NaN values per col: \n', df.isnull().sum())

total cols with null values:  50

Columns with null values: 
 Index(['coaster_name', 'Length', 'Speed', 'Location', 'Status', 'Opening date',
       'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height',
       'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
       'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
       'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
       'Track layout', 'Fastrack available', 'Soft opening date.1',
       'Closing date', 'Opened', 'Replaced by', 'Website',
       'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
       'Single rider line available', 'Restraint Style',
       'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
       'year_introduced', 'latitude', 'longitude', 'Type_Main',
       'opening_date_clean', 'speed1', 'speed2', 'speed1_value', 'speed1_unit',
       'speed_mph', 'height_value', 'height_unit', 'height_ft',
       'Inversions_clean', '

In [49]:
# ¿Existen filas o columnas completamente vacías?
print('Columns with more than 75% of null values: ')
percent_of_nulls = 100 * df.isnull().mean().round(4)
print(percent_of_nulls[percent_of_nulls > 75])

Columns with more than 75% of null values: 
Soft opening date                91.17
Fast Lane available              93.65
Replaced                         84.08
Fastrack available               98.25
Soft opening date.1              91.17
Closing date                     78.29
Opened                           97.52
Replaced by                      91.90
Website                          92.00
Flash Pass Available             95.40
Must transfer from wheelchair    90.25
Theme                            95.95
Single rider line available      92.55
Restraint Style                  97.98
Flash Pass available             95.77
Acceleration                     94.48
Restraints                       97.79
Name                             96.78
height_ft                        84.27
dtype: float64


#### Calidad de datos

In [55]:
# ¿Hay filas duplicadas? ¿Cuántas?
# df.duplicated().any()
print('Total of duplicates rows: ', df.duplicated().sum())

Total of duplicates rows:  0


In [61]:
# ¿Qué columnas tienen valores nulos y en qué proporción?
null_percent_per_col = 100 * df.isnull().mean().round(2)
print('cols with null values: ')
print(null_percent_per_col[null_percent_per_col > 0])

cols with null values: 
Length                           12.0
Speed                            14.0
Status                           20.0
Opening date                     23.0
Manufacturer                      5.0
Height restriction               24.0
Model                            32.0
Height                           11.0
Inversions                       14.0
Lift/launch system               27.0
Cost                             65.0
Trains                           34.0
Park section                     55.0
Duration                         30.0
Capacity                         47.0
G-force                          67.0
Designer                         47.0
Max vertical angle               67.0
Drop                             55.0
Soft opening date                91.0
Fast Lane available              94.0
Replaced                         84.0
Track layout                     69.0
Fastrack available               98.0
Soft opening date.1              91.0
Closing date              

In [76]:
most_null_cols = percent_of_nulls[percent_of_nulls > 75].sort_values(ascending=False).index
most_null_cols

Index(['Fastrack available', 'Restraint Style', 'Restraints', 'Opened', 'Name',
       'Theme', 'Flash Pass available', 'Flash Pass Available', 'Acceleration',
       'Fast Lane available', 'Single rider line available', 'Website',
       'Replaced by', 'Soft opening date.1', 'Soft opening date',
       'Must transfer from wheelchair', 'height_ft', 'Replaced',
       'Closing date'],
      dtype='object')

In [83]:
# ¿Se pueden imputar, eliminar o transformar esos valores?
most_nulls_df = df[most_null_cols]

In [84]:
most_nulls_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Fastrack available             19 non-null     object 
 1   Restraint Style                22 non-null     object 
 2   Restraints                     24 non-null     object 
 3   Opened                         27 non-null     object 
 4   Name                           35 non-null     object 
 5   Theme                          44 non-null     object 
 6   Flash Pass available           46 non-null     object 
 7   Flash Pass Available           50 non-null     object 
 8   Acceleration                   60 non-null     object 
 9   Fast Lane available            69 non-null     object 
 10  Single rider line available    81 non-null     object 
 11  Website                        87 non-null     object 
 12  Replaced by                    88 non-null     o

In [94]:
def nans(df: pd.DataFrame): return df[df.isnull().any(axis=1)]
def Nonans(df: pd.DataFrame): return df[df.notnull().any(axis=1)]

In [103]:
# ¿Qué columnas tienen datos inconsistentes o atípicos (e.g., alturas negativas)?
df.columns

Index(['coaster_name', 'Length', 'Speed', 'Location', 'Status', 'Opening date',
       'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height',
       'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
       'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
       'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
       'Track layout', 'Fastrack available', 'Soft opening date.1',
       'Closing date', 'Opened', 'Replaced by', 'Website',
       'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
       'Single rider line available', 'Restraint Style',
       'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
       'year_introduced', 'latitude', 'longitude', 'Type_Main',
       'opening_date_clean', 'speed1', 'speed2', 'speed1_value', 'speed1_unit',
       'speed_mph', 'height_value', 'height_unit', 'height_ft',
       'Inversions_clean', 'Gforce_clean'],
      dtype='object')

In [121]:
df[['Length', 'Speed', 'Height', 'Cost', 'G-force', 'year_introduced', 'Acceleration']]

Unnamed: 0,Length,Speed,Height,Cost,G-force,year_introduced,Acceleration
0,600 ft (180 m),6 mph (9.7 km/h),50 ft (15 m),,2.9,1884,
1,,,,,12,1895,
2,,,,,,1896,
3,,,,,,1901,
4,,,,,,1901,
...,...,...,...,...,...,...,...
1082,"3,444 ft (1,050 m)",53 mph (85 km/h),111 ft (34 m),,4.7,2022,
1083,"3,328 ft (1,014 m)",73 mph (117 km/h),178 ft (54 m),,,2022,
1084,"3,169.3 ft (966.0 m)",59.3[1] mph (95.4 km/h),78.1 ft (23.8 m),,4,2022,
1085,770 ft (230 m),34 mph (55 km/h),112 ft (34 m),,,2022,


#### Resumen estadístico

#### Exploración de variables categóricas

In [10]:
# ¿Cuáles son las categorías únicas en columnas como Type, Material Type, Status, Seating Type o Location?
# for col in df: 
#     print(df[col].unique())
list(df.dtypes.unique())

[dtype('O'), dtype('float64'), dtype('int64')]

In [23]:
df.columns.sort_values()

Index(['Acceleration', 'Capacity', 'Closing date', 'Cost', 'Designer', 'Drop',
       'Duration', 'Fast Lane available', 'Fastrack available',
       'Flash Pass Available', 'Flash Pass available', 'G-force',
       'Gforce_clean', 'Height', 'Height restriction', 'Inversions',
       'Inversions_clean', 'Length', 'Lift/launch system', 'Location',
       'Manufacturer', 'Max vertical angle', 'Model',
       'Must transfer from wheelchair', 'Name', 'Opened', 'Opening date',
       'Park section', 'Replaced', 'Replaced by', 'Restraint Style',
       'Restraints', 'Single rider line available', 'Soft opening date',
       'Soft opening date.1', 'Speed', 'Status', 'Theme', 'Track layout',
       'Trains', 'Type', 'Type_Main', 'Website', 'coaster_name', 'height_ft',
       'height_unit', 'height_value', 'latitude', 'longitude',
       'opening_date_clean', 'speed1', 'speed1_unit', 'speed1_value', 'speed2',
       'speed_mph', 'year_introduced'],
      dtype='object')

In [11]:
# ¿Cuál es la frecuencia de cada categoría?


In [20]:
# ¿Qué tipos de montañas rusas son más comunes?
df.Type.value_counts()

Type
Steel                                   308
Wood                                    164
Other                                    72
Steel – Launched                         63
Steel – Inverted                         60
Steel – Junior                           28
Steel – Wild Mouse                       21
Steel – Floorless Coaster                20
Steel – Spinning                         18
Steel – Mine Train                       18
Steel – Flying                           17
Steel – Suspended                        16
Steel – Enclosed                         15
Steel – Stand-up                         14
Wood – Racing                            14
Steel – Euro-Fighter                     13
Steel – Launched – Shuttle               12
Steel – Dive Coaster                     11
Steel – Shuttle                          10
Steel – Family                            9
Steel – 4th Dimension – Wing Coaster      9
Steel – Shuttle – Boomerang               8
Steel – Wing Coaster       

In [25]:
# ¿Qué países tienen más montañas rusas en el dataset?
df['Location'].unique()

array(['Coney Island', 'Sea Lion Park', 'Cleveland, Ohio, United States',
       'Other', 'Lakemont Park', 'Olentangy Park', 'Crystal Beach',
       'Kennywood', 'Revere Beach', 'Luna Park Melbourne',
       'Tolchester Beach Park', 'Lake Compounce', 'Six Flags America',
       'Clementon Park', 'Seabreeze Amusement Park', 'Dreamland Margate',
       'Lagoon Amusement Park', 'Blackpool Pleasure Beach', 'Hersheypark',
       'Riverview Park', 'Santa Cruz Beach Boardwalk',
       'Dorney Park & Wildwater Kingdom', 'Geauga Lake',
       'Rocky Glen Park', 'Savin Rock', 'Crystal Beach Park',
       'Sesquicentennial Exposition', 'Luna Park, Coney Island',
       'Palisades Amusement Park', 'Oaks Amusement Park', 'Playland',
       'Monte Igueldo Amusement Park', 'Canobie Lake Park',
       'Great Yarmouth Pleasure Beach', 'Conneaut Lake Park',
       'Idlewild and Soak Zone', 'Lakeside Amusement Park', 'Whalom Park',
       'Six Flags New England', 'Lincoln Park', 'Battersea Park',
       

In [26]:
df['Location'].value_counts()

Location
Other                              250
Kings Island                        19
Cedar Point                         19
Six Flags Magic Mountain            17
Hersheypark                         16
                                  ... 
Plopsaland De Panne                  1
Fun Spot America Atlanta             1
Glenwood Caverns Adventure Park      1
Fårup Sommerland                     1
Epcot                                1
Name: count, Length: 280, dtype: int64

In [30]:
# ¿Cuál es el estado operativo más común (e.g., Operative, Removed)?
df['Status']

0                  Removed
1                  Removed
2                   Closed
3                  Removed
4                  Removed
               ...        
1082                   NaN
1083    Under construction
1084                   NaN
1085    Under construction
1086    Under construction
Name: Status, Length: 1087, dtype: object

In [31]:
df['Status'].unique()  # Status types

array(['Removed', 'Closed', 'Operating', nan, 'Not Currently Operating',
       'In Production', 'Discontinued',
       'closed for maintenance as of july 30 no reopening date known',
       'Closed in 2021', 'SBNO December 2019', 'Under construction',
       'Temporarily Closed', 'SBNO (Standing But Not Operating)',
       'Temporarily closed',
       'Chapter 7 bankruptcy; rides dismantled and sold; property sold',
       'Under Maintenance'], dtype=object)

In [29]:
df['Status'].value_counts()

Status
Operating                                                         668
Removed                                                           137
Closed                                                             24
Under construction                                                 15
In Production                                                      11
Discontinued                                                        7
Closed in 2021                                                      2
SBNO December 2019                                                  2
Chapter 7 bankruptcy; rides dismantled and sold; property sold      2
closed for maintenance as of july 30 no reopening date known        1
Not Currently Operating                                             1
Temporarily Closed                                                  1
SBNO (Standing But Not Operating)                                   1
Temporarily closed                                                  1
Under Mainten

#### Análisis greográfico

In [4]:
from geopy.geocoders import Nominatim
from geopy.location import Location
from geopy.extra.rate_limiter import RateLimiter
geolocator = Nominatim(user_agent="my_geocoder", timeout=10)
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)  

In [5]:
def city_state_country(row):
    latitude, longitude = row['latitude'], row['longitude']

    # Saltar filas con valores nulos
    if pd.isna(latitude) or pd.isna(longitude):
        row['city'] = ''
        row['state'] = ''
        row['country'] = ''
        return row

    try:
        location = reverse((latitude, longitude))
        if location:
            address = location.raw.get('address', {})
            row['city'] = address.get('city', '') or address.get('town', '') or address.get('village', '')
            row['state'] = address.get('state', '')
            row['country'] = address.get('country', '')
        else:
            row['city'] = ''
            row['state'] = ''
            row['country'] = ''
    except Exception as e:
        print(f"Error en coordenadas ({latitude}, {longitude}): {e}")
        row['city'] = ''
        row['state'] = ''
        row['country'] = ''

    return row


In [6]:
df_locations = df.apply(city_state_country, axis=1)
df_locations[['city', 'state', 'country']].to_csv('locations.csv', index=False)

In [3]:
locations_df = pd.read_csv("data/locations.csv")
locations_df.head()

Unnamed: 0,city,state,country
0,City of New York,New York,United States
1,City of New York,New York,United States
2,Cleveland,Ohio,United States
3,City of New York,New York,United States
4,Atlantic City,New Jersey,United States


In [5]:
locations_df.country.value_counts()

country
United States                  491
United Kingdom                  63
Deutschland                     32
日本                              28
Canada                          28
Australia                       25
Nederland                       22
中国                              17
France                          13
Sverige                         12
Suomi / Finland                 12
España                          10
België / Belgique / Belgien     10
Italia                          10
México                           6
Danmark                          6
Singapore                        4
Norge                            3
Việt Nam                         3
Brasil                           2
Philippines                      2
Polska                           2
臺灣                               2
대한민국                             2
الإمارات العربية المتحدة         2
الكويت                           1
Ayiti                            1
India                            1
Éire / Irela

## **Data Cleaning**