In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Load the raw survey export (CSV expected in the working directory).
raw_csv_path = 'BigML_Dataset_5f50a62795a9306aa200003e.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Data Cleaning

In [4]:
# 1. Count the missing entries in every column of the freshly loaded frame
df.isna().sum()

Age                                                      0
Sex                                                      0
Race                                                     0
Marital status?                                          0
Education                                               58
Employement                                              0
Incomes                                                  0
How many books did you read during last 12months?        0
Read any printed books during last 12months?           390
Read any audiobooks during last 12months?              390
Read any e-books during last 12months?                 390
Last book you read, you…                               390
Do you happen to read any daily news or newspapers?      0
Do you happen to read any magazines or journals?         0
dtype: int64

In [5]:
# 2. Remove rows that are exact duplicates, keeping the first occurrence of each
df = df.drop_duplicates(keep="first")

In [6]:
# 3. Clean categorical text values: trim leading/trailing whitespace from every
#    string cell. Non-string cells (numbers, NaN) pass through unchanged.
#    Note: this step only strips whitespace; it does not recode any answers.
def _strip_text(cell):
    """Return ``cell`` without surrounding whitespace if it is a str, else ``cell`` itself."""
    if isinstance(cell, str):
        return cell.strip()
    return cell


df = df.map(_strip_text)

In [7]:
# 4. Standardize column names: trim whitespace, drop "?", turn spaces into "_",
#    and lower-case everything.
#    regex=False is passed explicitly so "?" is always treated as a literal
#    character rather than a regex quantifier, whatever the pandas default is.
#    Other punctuation (",", "…", "-") is deliberately left in place because
#    later cells reference the resulting names verbatim.
df.columns = (
    df.columns.str.strip()
    .str.replace("?", "", regex=False)
    .str.replace(" ", "_", regex=False)
    .str.lower()
)

In [8]:
# 5. Handle missing values for reading-related questions
# No rows are removed here (presumably any exclusion is left to downstream
# filtering in Tableau — TODO confirm the original intent of this note).
#
# The four follow-up questions below are blank for 390 respondents in the raw
# data. The outputs recorded further down (0 nulls left in the last-book
# column, and "No" accounting for ~13.78% = 390/2831 of its answers) show that
# these blanks were filled with "No" when the notebook was last executed, but
# the code for that step was missing from this cell. It is restored here so a
# fresh Restart & Run All reproduces the recorded results.
reading_followup_cols = [
    "read_any_printed_books_during_last_12months",
    "read_any_audiobooks_during_last_12months",
    "read_any_e-books_during_last_12months",
    "last_book_you_read,_you…",
]
df[reading_followup_cols] = df[reading_followup_cols].fillna("No")

In [9]:
# 6. Encode the three book-format questions numerically: 'Yes' -> 1, 'No' -> 0.
#    Any answer that is not a key of value_map (e.g. "Don't know") is turned
#    into NaN by Series.map.
value_map = {'Yes': 1, 'No': 0}
cols_to_convert = [
    "read_any_printed_books_during_last_12months",
    "read_any_audiobooks_during_last_12months",
    "read_any_e-books_during_last_12months",
]
for format_col in cols_to_convert:
    df[format_col] = df[format_col].map(value_map)

In [10]:
# 7. Handle extreme book counts: every value above the 99th percentile is
#    replaced by that percentile; all other values are kept as they are.
books_col = "how_many_books_did_you_read_during_last_12months"
books_read = df[books_col]
q99 = books_read.quantile(0.99)
df[books_col] = np.where(books_read > q99, q99, books_read)

In [11]:
# Re-count missing entries per column now that the cleaning steps have run
df.isna().sum()

age                                                    0
sex                                                    0
race                                                   0
marital_status                                         0
education                                             58
employement                                            0
incomes                                                0
how_many_books_did_you_read_during_last_12months       0
read_any_printed_books_during_last_12months           10
read_any_audiobooks_during_last_12months               4
read_any_e-books_during_last_12months                 14
last_book_you_read,_you…                               0
do_you_happen_to_read_any_daily_news_or_newspapers     0
do_you_happen_to_read_any_magazines_or_journals        0
dtype: int64

In [12]:
# EDA

In [13]:
# Age distribution: count, mean, standard deviation, min/max and quartiles
respondent_age = df["age"]
respondent_age.describe()

count    2831.000000
mean       47.270223
std        18.565806
min        16.000000
25%        32.000000
50%        48.000000
75%        62.000000
max        93.000000
Name: age, dtype: float64

In [14]:
# Gender distribution: number of respondents per value of the "sex" column
gender_counts = df["sex"].value_counts()
gender_counts

sex
Female    1479
Male      1352
Name: count, dtype: int64

In [15]:
# Mean number of books read in the last 12 months, split by gender
books_by_gender = df.groupby("sex")["how_many_books_did_you_read_during_last_12months"]
books_by_gender.mean()

sex
Female    19.694388
Male      13.377959
Name: how_many_books_did_you_read_during_last_12months, dtype: float64

In [16]:
# Mean number of books read in the last 12 months for each education level
# (rows with a missing education value are left out by groupby).
books_by_education = df.groupby("education")["how_many_books_did_you_read_during_last_12months"]
books_by_education.mean()

education
College graduate                                            18.815436
Don’t know                                                   5.375000
High school graduate                                        12.215116
High school incomplete                                      12.460076
Post-graduate training/professional school after college    22.738523
Some college, no 4-year degree                              17.889401
Technical, trade or vocational school AFTER high school     14.378788
Name: how_many_books_did_you_read_during_last_12months, dtype: float64

In [17]:
# Income vs reading habits: mean number of books read per income bracket.
# Brackets are plain strings, so the result is ordered alphabetically rather
# than by income.
books_by_income = df.groupby("incomes")["how_many_books_did_you_read_during_last_12months"]
books_by_income.mean()

incomes
$10,000 to under $20,000       13.578704
$100,000 to under $150,000     19.896226
$20,000 to under $30,000       16.373950
$30,000 to under $40,000       15.203774
$40,000 to under $50,000       16.541063
$50,000 to under $75,000       18.010152
$75,000 to under $100,000      16.806349
9$100,000 to under $150,000    15.042453
Less than $10,000              12.846626
Refused                        16.199313
Name: how_many_books_did_you_read_during_last_12months, dtype: float64

In [18]:
# Print vs e-book vs audiobook popularity: the column mean of each 0/1-encoded
# question, i.e. the share of non-missing answers that were "Yes".
book_format_cols = [
    "read_any_printed_books_during_last_12months",
    "read_any_audiobooks_during_last_12months",
    "read_any_e-books_during_last_12months",
]
df[book_format_cols].mean()

read_any_printed_books_during_last_12months    0.802198
read_any_audiobooks_during_last_12months       0.145384
read_any_e-books_during_last_12months          0.279020
dtype: float64

In [19]:
# News vs magazines: cross-tabulate newspaper readership (rows) against
# magazine/journal readership (columns).
reads_news = df["do_you_happen_to_read_any_daily_news_or_newspapers"]
reads_magazines = df["do_you_happen_to_read_any_magazines_or_journals"]
pd.crosstab(index=reads_news, columns=reads_magazines)

do_you_happen_to_read_any_magazines_or_journals,Don’t know,No,Yes
do_you_happen_to_read_any_daily_news_or_newspapers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Don’t know,1,3,1
No,1,647,429
Yes,2,672,1075


In [20]:
# Borrow vs purchase: how the last book read was obtained, as a percentage of
# all non-missing answers.
last_book_source = df["last_book_you_read,_you…"]
last_book_source.value_counts(normalize=True).mul(100)

last_book_you_read,_you…
Purchased the book                                  44.683857
Borrowed the book from a friend or family member    17.696927
No                                                  13.776051
Borrowed the book from a library                    11.833274
Got the book some other way                         10.914871
8                                                    0.883080
9                                                    0.211939
Name: proportion, dtype: float64

In [21]:
# Persist the cleaned frame as CSV in the working directory, without the row index
cleaned_csv_path = "cleaned_reading_habits.csv"
df.to_csv(cleaned_csv_path, index=False)