## EDA Insights from SpaceX Datasets
  

# Payload vs. Launch Site

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the launches dataset
launches_df = pd.read_csv('spacex_launches.xlsx - Launches.csv')

# Inspect the dataframe to understand its structure and columns.
print(launches_df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
launches_df.info()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Payload mass vs. launch site: one scatter point per (launch, payload) pair.
import ast  # cell-local import: parses the stringified payload-ID lists

# Load the datasets
launches_df = pd.read_csv('spacex_launches.xlsx - Launches.csv')
payloads_df = pd.read_csv('spacex_payloads.xlsx - Payloads.csv')
launchpads_df = pd.read_csv('spacex_launchpads.xlsx - Launchpads.csv')

# Each 'payloads' cell is the text of a Python list of payload IDs. Parse it
# with ast.literal_eval (literals only) instead of eval, so file content is
# never executed as code, then explode to one row per payload ID.
launches_df['payloads'] = launches_df['payloads'].apply(ast.literal_eval)
launches_payloads_flat = launches_df.explode('payloads')

# Attach each payload's mass by matching the exploded payload ID against the
# payloads table's 'id' column.
merged_df = pd.merge(
    launches_payloads_flat,
    payloads_df[['id', 'mass_kg']],
    how='left',
    left_on='payloads',
    right_on='id'
)

# Attach the launch site name. The launchpad columns selected here are just
# 'id' and 'name', so the right-hand key has to be 'id'; the previous
# right_on='id_x' is not a column of that frame and made the merge fail.
final_df = pd.merge(
    merged_df,
    launchpads_df[['id', 'name']],
    how='left',
    left_on='launchpad',
    right_on='id'
)

# 'name_y' is the launchpad name: pandas appends '_y' to the right frame's
# column when both sides carry a 'name' column.
# NOTE(review): assumes launches_df has its own 'name' column - confirm.
final_df = final_df.rename(columns={'name_y': 'launch_site_name'})
# Rows without a mass or a site cannot be placed on the plot.
final_df = final_df.dropna(subset=['mass_kg', 'launch_site_name'])

# Create the plot
plt.figure(figsize=(10, 6))
plt.scatter(final_df['launch_site_name'], final_df['mass_kg'])
plt.xlabel('Launch Site')
plt.ylabel('Payload Mass (kg)')
plt.title('Payload Mass vs. Launch Site')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save the plot
plt.savefig('payload_vs_launch_site.png')

print("Generated a scatter plot of Payload vs. Launch Site.")
print(final_df[['launch_site_name', 'mass_kg']].head())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Payload mass vs. launch site (corrected merge keys, descriptive column names).
import ast  # cell-local import: parses the stringified payload-ID lists

# Load the datasets
launches_df = pd.read_csv('spacex_launches.xlsx - Launches.csv')
payloads_df = pd.read_csv('spacex_payloads.xlsx - Payloads.csv')
launchpads_df = pd.read_csv('spacex_launchpads.xlsx - Launchpads.csv')

# Each 'payloads' cell is the text of a Python list of payload IDs. Parse it
# with ast.literal_eval (literals only) instead of eval, so file content is
# never executed as code, then explode to one row per payload ID.
launches_df['payloads'] = launches_df['payloads'].apply(ast.literal_eval)
launches_payloads_flat = launches_df.explode('payloads')

# Merge the launches with payloads data (payload ID -> payload mass).
merged_df = pd.merge(
    launches_payloads_flat,
    payloads_df[['id', 'mass_kg']],
    how='left',
    left_on='payloads',
    right_on='id'
)

# Merge with launchpad data to get launch site names.
# The launchpad id in the launches file is in the 'launchpad' column.
# The launchpad id in the launchpads file is in the 'id' column.
final_df = pd.merge(
    merged_df,
    launchpads_df[['id', 'name']],
    how='left',
    left_on='launchpad',
    right_on='id'
)

# Replace the '_x' / '_y' merge suffixes with descriptive names.
# NOTE(review): the suffixed columns only exist if launches_df has its own
# 'id' and 'name' columns - confirm against the CSV.
final_df = final_df.rename(columns={
    'name_y': 'launch_site_name',
    'id_x': 'launch_id',
    'id_y': 'payload_id',
    'name_x': 'launch_name',
})
# Rows without a mass or a site cannot be placed on the plot.
final_df = final_df.dropna(subset=['mass_kg', 'launch_site_name'])

# Create the plot
plt.figure(figsize=(12, 7))
plt.scatter(final_df['launch_site_name'], final_df['mass_kg'], alpha=0.7)
plt.xlabel('Launch Site', fontsize=12)
plt.ylabel('Payload Mass (kg)', fontsize=12)
plt.title('Payload Mass by Launch Site', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()

# Save the plot
plt.savefig('payload_vs_launch_site.png')

print("Generated a scatter plot of Payload vs. Launch Site.")
# Display the head of the relevant columns in the final dataframe for verification
print(final_df[['launch_site_name', 'mass_kg']].head())

# Flight Number vs. Launch Site

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Flight number vs. launch site: one scatter point per launch.

# Read the launches table and the launchpads lookup table.
launches_df = pd.read_csv('spacex_launches.xlsx - Launches.csv')
launchpads_df = pd.read_csv('spacex_launchpads.xlsx - Launchpads.csv')

# Left-join so every launch row is kept and gains its launchpad's name.
merged_df = launches_df.merge(
    launchpads_df[['id', 'name']],
    how='left',
    left_on='launchpad',
    right_on='id',
)

# The launchpad name arrives as 'name_y'; give it a descriptive label.
merged_df = merged_df.rename(columns={'name_y': 'launch_site'})

# Draw the scatter plot on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(merged_df['launch_site'], merged_df['flight_number'], alpha=0.7)
ax.set_title('Flight Number by Launch Site', fontsize=14)
ax.set_xlabel('Launch Site', fontsize=12)
ax.set_ylabel('Flight Number', fontsize=12)
plt.xticks(rotation=45, ha='right')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
fig.tight_layout()

# Write the figure to disk.
fig.savefig('flight_number_vs_launch_site.png')

print("Generated a scatter plot of Flight Number vs. Launch Site.")

# Success Rate vs. Orbit Type

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Success rate vs. orbit type: mean launch success per payload orbit.
import ast


def _parse_payload_ids(raw):
    """Parse one 'payloads' cell into a list of payload IDs.

    Returns the parsed list, or None when *raw* is not the text of a list
    literal (missing value, malformed string, ...), so a single bad row
    cannot prevent the rest of the column from being parsed.
    """
    if isinstance(raw, (list, tuple)):
        return list(raw)
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return None
    return list(parsed) if isinstance(parsed, (list, tuple)) else None


# Load the necessary datasets
launches_df = pd.read_csv('spacex_launches.xlsx - Launches.csv')
payloads_df = pd.read_csv('spacex_payloads.xlsx - Payloads.csv')

# Columns relied on below: 'flight_number', 'success' and 'payloads' in
# launches_df; 'id' and 'orbit' in payloads_df. 'payloads' links the two.

# --- Data Processing ---

# The 'payloads' column in launches_df is a string representation of a list.
# Convert it row by row: wrapping the whole-column conversion in try/except
# left every row unparsed when any one row failed, and the merge below then
# silently matched nothing.
launches_df['payloads'] = launches_df['payloads'].apply(_parse_payload_ids)
unparsed_rows = int(launches_df['payloads'].isna().sum())
if unparsed_rows:
    print(f"Warning: {unparsed_rows} launch row(s) had an unparseable 'payloads' value.")

# Explode the launches dataframe to have one row per payload ID
launches_exploded_df = launches_df.explode('payloads')

# Merge the exploded launches data with the payloads data.
# Merging on the payload ID links launch success with orbit type.
merged_df = pd.merge(
    launches_exploded_df[['flight_number', 'success', 'payloads']],
    payloads_df[['id', 'orbit']],
    how='left',
    left_on='payloads',
    right_on='id'
)

# --- Success Rate Calculation ---

# Drop rows where success status is not available
merged_df = merged_df.dropna(subset=['success'])
# Convert success to a numeric type (True=1, False=0)
merged_df['success'] = merged_df['success'].astype(int)

# Group by orbit and calculate the success rate (mean of the 'success' column).
# Rows are per payload, so a launch with several payloads counts once per payload.
orbit_success_rate = merged_df.groupby('orbit')['success'].mean().sort_values(ascending=False)

# --- Create Bar Chart ---

plt.figure(figsize=(12, 8))
orbit_success_rate.plot(kind='bar')
plt.title('Success Rate by Orbit Type', fontsize=16)
plt.xlabel('Orbit Type', fontsize=12)
plt.ylabel('Success Rate', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.ylim(0, 1.1)  # Headroom above 1.0 so full-height bars are not clipped
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Save the plot
plt.savefig('orbit_success_rate.png')

print("Generated a bar chart for the success rate of each orbit type.")
print(orbit_success_rate.head())

# Flight Number vs. Orbit Type

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Flight number vs. orbit type: one scatter point per (launch, payload) pair.
import ast


def _parse_payload_ids(raw):
    """Parse one 'payloads' cell into a list of payload IDs.

    Returns the parsed list, or None when *raw* is not the text of a list
    literal, so a single bad row cannot block parsing of the other rows.
    """
    if isinstance(raw, (list, tuple)):
        return list(raw)
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return None
    return list(parsed) if isinstance(parsed, (list, tuple)) else None


# Load the necessary datasets
launches_df = pd.read_csv('spacex_launches.xlsx - Launches.csv')
payloads_df = pd.read_csv('spacex_payloads.xlsx - Payloads.csv')

# --- Data Processing ---

# Convert the string representation of lists in the 'payloads' column row by
# row. A try/except around the whole-column conversion left every row
# unparsed whenever one row failed, so nothing matched in the merge below.
launches_df['payloads'] = launches_df['payloads'].apply(_parse_payload_ids)
unparsed_rows = int(launches_df['payloads'].isna().sum())
if unparsed_rows:
    print(f"Warning: {unparsed_rows} launch row(s) had an unparseable 'payloads' value.")

# Explode the launches dataframe to have one row per payload ID
launches_exploded_df = launches_df.explode('payloads')

# Merge the exploded launches data with the payloads data.
# This links flight numbers to the orbit type of their respective payloads.
merged_df = pd.merge(
    launches_exploded_df[['flight_number', 'payloads']],
    payloads_df[['id', 'orbit']],
    how='left',
    left_on='payloads',
    right_on='id'
)

# The left join leaves 'orbit' missing for payloads without a match or without
# a recorded orbit; drop those rows because a missing value has no category
# to be plotted under on the orbit axis.
merged_df = merged_df.dropna(subset=['orbit'])

# --- Create Scatter Plot ---

plt.figure(figsize=(12, 10))
plt.scatter(merged_df['flight_number'], merged_df['orbit'], alpha=0.7)
plt.ylabel('Orbit Type', fontsize=12)
plt.xlabel('Flight Number', fontsize=12)
plt.title('Flight Number vs. Orbit Type', fontsize=16)
plt.grid(True, which='both', axis='x', linestyle='--', linewidth=0.5)
plt.tight_layout()

# Save the plot
plt.savefig('flight_number_vs_orbit.png')

print("Generated a scatter plot of Flight Number vs. Orbit Type.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import ast

# Flight number vs. orbit type, restricted to payloads with a known orbit.


def _parse_payload_ids(raw):
    """Parse one 'payloads' cell into a list of payload IDs.

    Returns the parsed list, or None when *raw* is not the text of a list
    literal, so a single bad row cannot block parsing of the other rows.
    """
    if isinstance(raw, (list, tuple)):
        return list(raw)
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return None
    return list(parsed) if isinstance(parsed, (list, tuple)) else None


# Load the necessary datasets
launches_df = pd.read_csv('spacex_launches.xlsx - Launches.csv')
payloads_df = pd.read_csv('spacex_payloads.xlsx - Payloads.csv')

# --- Data Processing ---

# Convert the 'payloads' column strings to lists row by row. A try/except
# around the whole-column conversion left every row unparsed whenever one row
# failed, after which the merge matched nothing and the plot came out empty.
launches_df['payloads'] = launches_df['payloads'].apply(_parse_payload_ids)
unparsed_rows = int(launches_df['payloads'].isna().sum())
if unparsed_rows:
    print(f"Warning: {unparsed_rows} launch row(s) had an unparseable 'payloads' value.")

# Explode the launches dataframe to have one row per payload ID
launches_exploded_df = launches_df.explode('payloads')

# Merge the exploded launches data with the payloads data to link flight number with orbit type
merged_df = pd.merge(
    launches_exploded_df[['flight_number', 'payloads']],
    payloads_df[['id', 'orbit']],
    how='left',
    left_on='payloads',
    right_on='id'
)

# Drop rows where orbit is not available
merged_df = merged_df.dropna(subset=['orbit'])


# --- Create Scatter Plot ---

plt.figure(figsize=(10, 8))
plt.scatter(merged_df['flight_number'], merged_df['orbit'], alpha=0.7)

plt.title('Flight Number vs. Orbit Type', fontsize=16)
plt.xlabel('Flight Number', fontsize=12)
plt.ylabel('Orbit Type', fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Save the plot
plt.savefig('flight_number_vs_orbit.png')

print("Generated a scatter plot of Flight Number vs. Orbit Type.")

# Payload vs. Orbit Type

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Payload mass vs. orbit type, using the payloads table only.
payloads_df = pd.read_csv('spacex_payloads.xlsx - Payloads.csv')

# Keep only the payloads that have both a mass and an orbit recorded.
payloads_df = payloads_df.dropna(subset=['mass_kg', 'orbit'])

# Draw the scatter plot on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(payloads_df['mass_kg'], payloads_df['orbit'], alpha=0.7)

# Title, axis labels and vertical guide lines.
ax.set_title('Payload Mass vs. Orbit Type', fontsize=16)
ax.set_xlabel('Payload Mass (kg)', fontsize=12)
ax.set_ylabel('Orbit Type', fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()

# Write the figure to disk.
fig.savefig('payload_vs_orbit.png')

print("Generated a scatter plot of Payload Mass vs. Orbit Type.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Payload mass vs. orbit type, with an explicit missing-file check.

# Load the payloads dataset, which contains both payload mass and orbit type
try:
    payloads_df = pd.read_csv('spacex_payloads.xlsx - Payloads.csv')
except FileNotFoundError:
    print("Error: The file 'spacex_payloads.xlsx - Payloads.csv' was not found.")
    # exit() is an interactive helper provided by the `site` module and ends
    # with a success status; raise SystemExit with a non-zero code so the
    # failure is visible to whatever ran this cell or script.
    raise SystemExit(1)

# --- Data Cleaning ---
# Drop rows where 'mass_kg' or 'orbit' is missing to ensure a clean plot
cleaned_payloads_df = payloads_df.dropna(subset=['mass_kg', 'orbit'])


# --- Create Scatter Plot ---
plt.figure(figsize=(12, 8))
plt.scatter(cleaned_payloads_df['mass_kg'], cleaned_payloads_df['orbit'], alpha=0.7)

# --- Formatting ---
plt.title('Payload Mass by Orbit Type', fontsize=16)
plt.xlabel('Payload Mass (kg)', fontsize=12)
plt.ylabel('Orbit Type', fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Save the plot
plt.savefig('payload_vs_orbit_type.png')

print("Generated a scatter plot of Payload Mass vs. Orbit Type.")
print(f"Plotted {len(cleaned_payloads_df)} payloads with complete data.")

# Launch Success Yearly Trend

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Yearly launch success rate: share of successful launches per calendar year.

# Load the launches dataset
try:
    launches_df = pd.read_csv('spacex_launches.xlsx - Launches.csv')
except FileNotFoundError:
    print("Error: 'spacex_launches.xlsx - Launches.csv' not found.")
    # exit() is an interactive helper provided by the `site` module and ends
    # with a success status; raise SystemExit with a non-zero code instead.
    raise SystemExit(1)

# --- Data Preparation ---

# Convert 'date_utc' to datetime objects to extract the year.
# `errors='coerce'` turns any unparseable dates into NaT (Not a Time).
launches_df['date_utc'] = pd.to_datetime(launches_df['date_utc'], errors='coerce')

# Drop rows where the date could not be parsed
launches_df = launches_df.dropna(subset=['date_utc'])

# Create a 'year' column
launches_df['year'] = launches_df['date_utc'].dt.year

# Drop rows where 'success' is NaN to ensure accurate rate calculation
launches_df = launches_df.dropna(subset=['success'])

# Make the flag explicitly numeric (True=1, False=0) so the mean below does
# not depend on how the column was typed when the CSV was read.
launches_df['success'] = launches_df['success'].astype(int)

# --- Success Rate Calculation ---

# Group by year and calculate the mean of the 'success' column
yearly_success_rate = launches_df.groupby('year')['success'].mean()

# --- Create Line Chart ---

plt.figure(figsize=(12, 7))
plt.plot(yearly_success_rate.index, yearly_success_rate.values, marker='o', linestyle='-')

# --- Formatting ---
plt.title('Yearly Average Launch Success Rate', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Success Rate', fontsize=12)
plt.grid(True, which='both', linestyle='--', alpha=0.7)
plt.xticks(yearly_success_rate.index.astype(int), rotation=45)  # One integer tick per year
plt.ylim(0, 1.1)  # Headroom above 1.0 so points at 100% are not clipped
plt.tight_layout()

# Save the plot
plt.savefig('yearly_average_success_rate.png')

print("Generated a line chart of the yearly average success rate.")
print(yearly_success_rate)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Yearly launch success rate (variant with an explicit integer success flag).

# Load the launches dataset
try:
    launches_df = pd.read_csv('spacex_launches.xlsx - Launches.csv')
except FileNotFoundError:
    print("Error: The file 'spacex_launches.xlsx - Launches.csv' was not found.")
    # exit() is an interactive helper provided by the `site` module and ends
    # with a success status; raise SystemExit with a non-zero code instead.
    raise SystemExit(1)

# --- Data Processing ---

# Drop rows where 'success' or 'date_utc' is missing
launches_df = launches_df.dropna(subset=['success', 'date_utc'])

# Convert 'date_utc' to datetime objects (an unparseable date raises here)
launches_df['date_utc'] = pd.to_datetime(launches_df['date_utc'])

# Extract the year from the 'date_utc' column
launches_df['year'] = launches_df['date_utc'].dt.year

# Convert 'success' boolean to integer (True=1, False=0) for calculation
launches_df['success'] = launches_df['success'].astype(int)


# --- Calculation ---

# Group by year and calculate the mean of the 'success' column
yearly_success_rate = launches_df.groupby('year')['success'].mean()


# --- Create Line Chart ---
plt.figure(figsize=(12, 7))
plt.plot(yearly_success_rate.index, yearly_success_rate.values, marker='o', linestyle='-')


# --- Formatting ---
plt.title('Yearly Average Launch Success Rate', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Average Success Rate', fontsize=12)
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.xticks(yearly_success_rate.index, rotation=45)  # Ensure all years are shown as ticks
plt.ylim(0, 1.1)  # Headroom above 1.0 so points at 100% are not clipped
plt.tight_layout()


# --- Save the Plot ---
plt.savefig('yearly_success_rate.png')

print("Generated a line chart of the yearly average success rate.")
print(yearly_success_rate)