In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler

# Sample data: a small hand-written stand-in for a dataset of 1990s action
# movies. Each key becomes a column; the lists are aligned by position.
data = dict(
    title=["Movie A", "Movie B", "Movie C", "Movie D", "Movie E"],
    duration=[85, 120, 95, 88, 75],  # movie durations in minutes
    rating=[7.5, 8.2, 6.9, 7.8, 5.4],
)

# One row per movie, with columns in the order declared above.
action_movies_1990s = pd.DataFrame(data=data)

# --- 1. Inefficient approach using iterrows() ---
# Timing uses time.perf_counter(): it is the clock intended for measuring
# short intervals, whereas time.time() is the wall clock and may be too coarse
# to register an operation this small.
start_time = time.perf_counter()
short_movie_count = 0
# Deliberately slow baseline: walk the frame row by row in Python and count
# the movies shorter than 90 minutes.
for label, row in action_movies_1990s.iterrows():
    if row["duration"] < 90:
        short_movie_count += 1
end_time = time.perf_counter()
loop_time = end_time - start_time

print(f"Short movies count (iterrows approach): {short_movie_count}")
print(f"Loop execution time: {loop_time:.6f} seconds")

# --- 2. Efficient vectorized approach ---
# Same count, measured with the same clock so the two timings are comparable.
start_time = time.perf_counter()
# The comparison yields a boolean Series; summing it counts the True entries.
short_movie_count_vectorized = (action_movies_1990s["duration"] < 90).sum()
end_time = time.perf_counter()
vectorized_time = end_time - start_time

print(f"Short movies count (vectorized approach): {short_movie_count_vectorized}")
print(f"Vectorized execution time: {vectorized_time:.6f} seconds")

# --- 3. Using apply() to categorize movies efficiently ---
def _label_by_duration(minutes):
    """Return "Short" for a duration under 90 minutes, otherwise "Long"."""
    if minutes < 90:
        return "Short"
    return "Long"


# apply() calls the helper once per duration value and collects the labels
# into the new "category" column.
action_movies_1990s["category"] = action_movies_1990s["duration"].apply(_label_by_duration)
print("\nCategorized DataFrame:")
print(action_movies_1990s)

# --- 4. Using NumPy for conditional operations ---
# Build the boolean condition first, then let np.where pick True/False for
# each row based on it.
is_under_90_minutes = action_movies_1990s["duration"] < 90
action_movies_1990s["fast_watch"] = np.where(is_under_90_minutes, True, False)
print("\nDataFrame with NumPy fast_watch column:")
print(action_movies_1990s)

# --- 5. Handling Missing Data ---
# Knock out one rating (row label 2) to simulate a missing value.
action_movies_1990s.loc[2, "rating"] = np.nan
# Fill the gap with the mean of the "rating" column. The result is assigned
# back to the column instead of calling fillna(..., inplace=True) on the
# selection df["rating"]: that chained in-place form operates on an
# intermediate object, emits a warning on pandas 2.x, and does not update the
# DataFrame when Copy-on-Write is enabled.
action_movies_1990s["rating"] = action_movies_1990s["rating"].fillna(
    action_movies_1990s["rating"].mean()
)
print("\nDataFrame after handling missing data:")
print(action_movies_1990s)

# --- 6. One-hot Encoding a Categorical Column ---
# Only "category" is expanded into indicator columns; the result is a new
# frame, so action_movies_1990s itself keeps its "category" column.
action_movies_encoded = pd.get_dummies(
    data=action_movies_1990s,
    columns=["category"],
)
print("\nDataFrame with one-hot encoding:")
print(action_movies_encoded)

# --- 7. Feature Scaling (Standardization) ---
scaler = StandardScaler()
# Double brackets select "rating" as a one-column DataFrame (2-D input for
# the scaler) rather than a 1-D Series.
rating_features = action_movies_1990s[["rating"]]
# Fit the scaler on the ratings and store the transformed values alongside
# the originals.
action_movies_1990s["scaled_rating"] = scaler.fit_transform(rating_features)
print("\nDataFrame with scaled ratings:")
print(action_movies_1990s)

# --- 8. Selected Pandas Exercises for ML Practitioners ---
# Exercise 1: Count the number of missing values per column
def count_missing_values(df):
    """Return the number of missing entries in each column of ``df``.

    The result is a Series indexed by column name.
    """
    missing_mask = df.isna()  # True wherever a cell is missing
    return missing_mask.sum()
print("\nMissing values count:")
print(count_missing_values(action_movies_1990s))

# Exercise 2: Compute the correlation matrix of numerical columns
# The frame also holds string columns ("title", "category"). Since pandas 2.0,
# corr() no longer drops non-numeric columns silently and raises on them, so
# restrict the computation to numeric columns explicitly.
# NOTE: the numeric_only keyword of DataFrame.corr requires pandas >= 1.5.
correlation_matrix = action_movies_1990s.corr(numeric_only=True)
print("\nCorrelation matrix:")
print(correlation_matrix)

# Exercise 3: Normalize the 'duration' column using Min-Max scaling
durations = action_movies_1990s["duration"]
min_duration = durations.min()
max_duration = durations.max()
# Shift so the shortest movie maps to 0, then divide by the observed range so
# the longest maps to 1.
duration_range = max_duration - min_duration
action_movies_1990s["normalized_duration"] = (durations - min_duration) / duration_range
print("\nDataFrame with normalized duration:")
print(action_movies_1990s)

# Exercise 4: Extract the top N movies with the highest rating
def top_n_movies(df, n=3):
    """Return the ``n`` rows of ``df`` with the largest "rating" values.

    Rows come back ordered from highest to lowest rating; ``n`` defaults to 3.
    """
    highest_rated = df.nlargest(n=n, columns="rating")
    return highest_rated
print("\nTop 3 highest-rated movies:")
top_rated = top_n_movies(action_movies_1990s)  # default n=3
print(top_rated)

# Exercise 5: Create a new feature: rating per minute
# Element-wise division of each movie's rating by its duration.
ratings = action_movies_1990s["rating"]
action_movies_1990s["rating_per_minute"] = ratings.div(action_movies_1990s["duration"])
print("\nDataFrame with rating per minute:")
print(action_movies_1990s)

# --- 9. Performance comparison summary ---
# Repeat the two timings measured earlier side by side; the lines are joined
# with newlines so the output matches one print call per line.
summary_lines = [
    "\nPerformance Summary:",
    f"Loop execution time: {loop_time:.6f} seconds",
    f"Vectorized execution time: {vectorized_time:.6f} seconds",
]
print("\n".join(summary_lines))
