In [1]:
import pandas as pd
import sqlite3

Part 1

In [None]:
# Load the `customers` table from the Chinook SQLite database into a DataFrame.
# NOTE: using a sqlite3 connection as a context manager (`with sqlite3.connect(...)`)
# only scopes a transaction (commit/rollback); it does NOT close the connection.
# The handle is therefore closed explicitly in a `finally` clause.
try:
    con = sqlite3.connect('data/chinook.db')
    try:
        # Read customers table
        df_customers = pd.read_sql_query('SELECT * from customers', con)
    finally:
        # Release the database handle whether or not the query succeeded
        con.close()

    # Display the first 10 rows
    print(df_customers.head(10))
except Exception as e:
    # Report the failure instead of raising; df_customers is not (re)assigned in this case
    print(f"An error occurred: {e}")

In [None]:
# Read the iris records from the JSON file
df_iris = pd.read_json('data/iris.json')

# Report the frame's dimensions, then its column labels as a plain list
print("Shape of dataset: ", df_iris.shape)
print("Column names: ", list(df_iris.columns))

In [None]:
# Read the Titanic workbook into a DataFrame
df_titanic = pd.read_excel(io='data/titanic.xlsx')

# Preview the leading five rows
print("The first 5 rows: \n", df_titanic.head(n=5))

In [None]:
# Read the flights Parquet data into a DataFrame
flights_df = pd.read_parquet(path="data/flights")

# Preview the first five rows (rendered as the cell's output)
flights_df.head(n=5)

In [None]:
# Load the CSV file into a DataFrame
movie_df = pd.read_csv('data/movie.csv')

# Display a random sample of 10 rows.
# A fixed random_state keeps the sample identical across "Restart & Run All",
# so the displayed output is reproducible; change the seed to draw a different sample.
print("Random sample of 10 rows: \n", movie_df.sample(10, random_state=42))

Part 2


In [None]:
# Lower-case every column label of the iris frame (labels are replaced in place)
df_iris.columns = df_iris.columns.str.lower()

# Keep just the two sepal measurements: 'sepallength' and 'sepalwidth'
df_iris_selected = df_iris.loc[:, ['sepallength', 'sepalwidth']]

print("Selected columns from iris.json:")
print(df_iris_selected.head(n=5))

In [None]:
# Passengers strictly older than 30 (rows with a missing Age never match the comparison)
titanic_filtered = df_titanic.loc[df_titanic['Age'].gt(30)]

print("\nRows where age is above 30:")
print(titanic_filtered.head(n=5))

# Tally passengers per value of the "Sex" column
gender_counts = df_titanic.loc[:, "Sex"].value_counts()
print("\nCount of male and female passengers:")
print(gender_counts)

In [None]:
# Narrow the flights frame to the route and carrier columns
flights_selected = flights_df.loc[:, ["origin", "dest", "carrier"]]
print("\nSelected columns from flights.parquet:")
print(flights_selected.head(n=5))

# Count the distinct (non-missing) destination values
unique_destinations = flights_df.loc[:, "dest"].nunique()
print("\nNumber of unique destinations:")
print(unique_destinations)

In [None]:
# Keep only the movies whose duration exceeds 120
movie_filtered = movie_df.loc[movie_df['duration'].gt(120)]

# Order those movies from most to fewest director Facebook likes
movies_sorted = movie_filtered.sort_values(
    by="director_facebook_likes",
    ascending=False,
)
print("\nMovies with duration > 120 minutes, sorted by director_facebook_likes:")
movies_sorted.head(n=5)

Part 3


In [None]:
# Per-column mean, median and standard deviation for the iris frame:
# start from describe() (one row per described column after transposing),
# attach the median as an extra column, then keep only the three statistics of interest.
numerical_stats = (
    df_iris.describe()
    .T
    .assign(median=df_iris.median(numeric_only=True))
    .loc[:, ["mean", "median", "std"]]
)
numerical_stats


In [None]:
# Keep only passengers with a recorded age.
# NOTE(review): this rebinds df_titanic, so cells run after this one see the reduced frame.
df_titanic = df_titanic[df_titanic["Age"].notna()]

# Smallest, largest and total of the passenger ages
age_min, age_max, age_sum = (
    df_titanic["Age"].min(),
    df_titanic["Age"].max(),
    df_titanic["Age"].sum(),
)

print(f"Minimum Age: {age_min}, Maximum Age: {age_max}, Total Age Sum: {age_sum}")


In [None]:
# Sum director_facebook_likes over each director's rows, then pick the director with the largest total
likes_by_director = movie_df.groupby("director_name")["director_facebook_likes"].sum()
top_director = likes_by_director.idxmax()
print(f"Director with the highest Facebook likes: {top_director}")

# The five longest movies (by duration), shown with title, duration and director
longest_movies = movie_df[["movie_title", "duration", "director_name"]].nlargest(
    n=5, columns="duration"
)
longest_movies


In [None]:
# Count missing values per column
missing_summary = flights_df.isnull().sum()

# Show only the columns that actually contain missing values.
# A bare expression in the middle of a cell is not rendered, so it is printed explicitly.
print(missing_summary[missing_summary > 0])

# Replace missing values in one numerical column with that column's mean.
# NOTE(review): "Tail_Number" looks like an identifier column rather than a numeric
# one, and the other flights columns used in this notebook are lowercase —
# confirm the intended column name.
numerical_col = "Tail_Number"
if numerical_col not in flights_df.columns:
    print(f"Column '{numerical_col}' not found; nothing to fill.")
elif not pd.api.types.is_numeric_dtype(flights_df[numerical_col]):
    # A mean is only defined for numeric data; skip instead of raising on .mean()
    print(f"Column '{numerical_col}' is not numeric; skipping mean imputation.")
else:
    # Assign the filled column back to the frame: calling fillna(inplace=True) on
    # flights_df[col] is chained assignment, which warns in pandas 2.x and does
    # not update the frame under Copy-on-Write.
    flights_df[numerical_col] = flights_df[numerical_col].fillna(
        flights_df[numerical_col].mean()
    )
