In [5]:
# Step 1: Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Load the Dataset
# Replace 'roller_coasters.csv' with your dataset file path
df = pd.read_csv('coaster_db.csv')

# Step 3: Understand the Dataset
print("Dataset Info:")
print(df.info())

print("\nMissing Values:")
print(df.isnull().sum())

print("\nSummary Statistics:")
print(df.describe())

# Step 4: Data Cleaning
# Drop rows with missing values in key columns
df = df.dropna(subset=['Speed_mph', 'Height_ft', 'Year_Introduced'])

# Drop duplicate rows
df = df.drop_duplicates()

# Step 5: Exploratory Data Analysis (EDA)

# 5.1: Distribution of Roller Coaster Speeds
plt.figure(figsize=(10, 6))
df['Speed_mph'].hist(bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of Roller Coaster Speeds', fontsize=16)
plt.xlabel('Speed (mph)', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.show()

# 5.2: Top 10 Locations with the Fastest Roller Coasters
# Group by 'Location' and calculate the average speed
average_speed_by_location = df.groupby('Location')['Speed_mph'].mean().reset_index()

# Sort by average speed in descending order
average_speed_by_location_sorted = average_speed_by_location.sort_values(by='Speed_mph', ascending=False)

# Filter locations with at least 10 roller coasters
location_counts = df['Location'].value_counts()
locations_with_10 = location_counts[location_counts >= 10].index
average_speed_by_location_sorted = average_speed_by_location_sorted[average_speed_by_location_sorted['Location'].isin(locations_with_10)]

# Plot the top 10 locations
plt.figure(figsize=(12, 6))
sns.barplot(x='Speed_mph', y='Location', data=average_speed_by_location_sorted.head(10), palette='viridis')
plt.title('Top 10 Locations with the Fastest Roller Coasters (Minimum of 10)', fontsize=16)
plt.xlabel('Average Speed (mph)', fontsize=14)
plt.ylabel('Location', fontsize=14)
plt.show()

# 5.3: Relationship Between Speed and Height
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Height_ft', y='Speed_mph', data=df, color='blue', alpha=0.6)
plt.title('Relationship Between Speed and Height', fontsize=16)
plt.xlabel('Height (ft)', fontsize=14)
plt.ylabel('Speed (mph)', fontsize=14)
plt.show()

# 5.4: Trend of Roller Coaster Introductions Over Time
df['Year_Introduced'] = pd.to_datetime(df['Year_Introduced'], format='%Y')  # Convert to datetime
yearly_introductions = df['Year_Introduced'].dt.year.value_counts().sort_index()

plt.figure(figsize=(12, 6))
yearly_introductions.plot(kind='line', color='green', marker='o')
plt.title('Number of Roller Coasters Introduced Over Time', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Number of Roller Coasters', fontsize=14)
plt.grid(True)
plt.show()

# Step 6: Save the Results
# Save the cleaned dataset to a new CSV file
df.to_csv('cleaned_roller_coasters.csv', index=False)

# Save the top locations with the fastest roller coasters
average_speed_by_location_sorted.head(10).to_csv('top_locations_fastest_coasters.csv', index=False)

# Step 7: Summarize Findings
print("\nSummary of Findings:")
print("1. The dataset contains roller coasters from various locations, with speed and height data.")
print("2. The average speed of roller coasters is approximately {:.2f} mph.".format(df['Speed_mph'].mean()))
print("3. The tallest roller coaster is {:.2f} feet tall.".format(df['Height_ft'].max()))
print("4. The top location with the fastest roller coasters is '{}' with an average speed of {:.2f} mph.".format(
    average_speed_by_location_sorted.iloc[0]['Location'],
    average_speed_by_location_sorted.iloc[0]['Speed_mph']
))
print("5. There is a positive correlation between roller coaster speed and height.")
print("6. The number of roller coasters introduced has increased over time, peaking around the year {}.".format(
    yearly_introductions.idxmax()
))

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 56 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   coaster_name                   1087 non-null   object 
 1   Length                         953 non-null    object 
 2   Speed                          937 non-null    object 
 3   Location                       1087 non-null   object 
 4   Status                         874 non-null    object 
 5   Opening date                   837 non-null    object 
 6   Type                           1087 non-null   object 
 7   Manufacturer                   1028 non-null   object 
 8   Height restriction             831 non-null    object 
 9   Model                          744 non-null    object 
 10  Height                         965 non-null    object 
 11  Inversions                     932 non-null    float64
 12  Lift/launch system             795

KeyError: ['Speed_mph', 'Height_ft', 'Year_Introduced']