In [6]:
import pandas as pd

# --- 1. Load Data ---
# Load the original CSV file
df = pd.read_csv('attendance.csv')

# --- 2. Get Original Size ---
original_rows = df.shape[0]
original_cols = df.shape[1]
print(f"Original size: {original_rows} rows, {original_cols} columns")

# --- 3. Apply Solutions ---

# Solution 1 & 3: Handle 'NA' values and convert type
# pd.to_numeric with errors='coerce' turns all 'NA' strings into NaN (Not a Number)
df['weekly_attendance'] = pd.to_numeric(df['weekly_attendance'], errors='coerce')

# Now, drop all rows where 'weekly_attendance' is NaN
df_cleaned = df.dropna(subset=['weekly_attendance'])

# Convert the column to a whole number (integer)
df_cleaned['weekly_attendance'] = df_cleaned['weekly_attendance'].astype(int)

# Solution 2: Remove redundant yearly columns
columns_to_drop = ['total', 'home', 'away']
df_cleaned = df_cleaned.drop(columns=columns_to_drop)

# --- 4. Get Final Size ---
final_rows = df_cleaned.shape[0]
final_cols = df_cleaned.shape[1]
print(f"Final dataset: {final_rows} rows, {final_cols} columns")

# --- 5. Save Cleaned Data ---
# Save the final, clean dataset to a new file
cleaned_file_name = 'attendance_cleaned.csv'
df_cleaned.to_csv(cleaned_file_name, index=False)

print(f"\nCleaned data saved to '{cleaned_file_name}'")

Original size: 10846 rows, 8 columns
Final dataset: 10208 rows, 5 columns

Cleaned data saved to 'attendance_cleaned.csv'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['weekly_attendance'] = df_cleaned['weekly_attendance'].astype(int)


In [7]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv('attendance_cleaned.csv')

# Select the column of interest
attendance_col = df['weekly_attendance']

# Calculate key statistics
mean_attendance = attendance_col.mean()
median_attendance = attendance_col.median()
min_attendance = attendance_col.min()
max_attendance = attendance_col.max()
std_dev_attendance = attendance_col.std()
count = attendance_col.count()

# Print the statistics
print("--- Descriptive Statistics for Weekly Attendance ---")
print(f"Total Observations (Count): {count}")
print(f"Mean: {mean_attendance:.2f}")
print(f"Median: {median_attendance:.0f}")
print(f"Standard Deviation: {std_dev_attendance:.2f}")
print(f"Range: {min_attendance} - {max_attendance}")

--- Descriptive Statistics for Weekly Attendance ---
Total Observations (Count): 10208
Mean: 67556.88
Median: 68334
Standard Deviation: 9022.02
Range: 23127 - 105121
