### Data Preperation
### Week 7 and 8
### Submitter - Himanshu Singh
### Data Cleaning and Transforming

Chapter 7
* Filter out missing data
* Fill in missing data


In [102]:
import pandas as pd
import numpy as np


# Load the dataset, which initially loads as a single column
df = pd.read_csv('MetObjects.csv', sep=',', quotechar='"', na_values=['', ' '], keep_default_na=True, encoding='latin-1', header=None)


# Split the single column (column 0) into multiple columns
data = df[df.columns[0]].str.split(',', expand=True)
#print(data.head(1))

# Set the column names using the first row of the split data
data.columns = data.iloc[0]
#print (data.columns)

cleaned_header_list = [
    h for h in data.columns
    if h is not None and str(h).strip()
]

#print (cleaned_header_list)

# Remove the header row (now row 0) from the data and reset the index
data = data[1:].reset_index(drop=True)
#data.head().to_csv('MetObjects_filtered_head.csv', index=False)
# Replace empty strings ('' or ' ') which might still exist, with NaN
# This step ensures all true missing values are represented as NaN for filtering/filling
data = data.replace('', np.nan)

data = data.replace('"', '', regex=True)

# 1. Convert various "blank" values to NaN for effective cleaning
data.replace(['', ' ', 'None', np.nan, None], pd.NA, inplace=True)

# 2. Remove blank headers (as shown in section 1)
data.columns = [str(col).strip() if col is not None else '' for col in data.columns]
data = data.loc[:, data.columns != '']

# 3. Remove all-NaN columns
#data = data.dropna(axis=1, how='all')

#data.head().to_csv('Before_MetObjects_filled_head.csv', index=False)

# --- 1. Filter out missing data (Drop rows with any NaN) ---
# This creates a new DataFrame containing only rows with NO missing values.
df_filtered = data.dropna()

# --- 2. Fill in missing data (Fill all NaN with 'N/A') ---
# This creates a new DataFrame where all missing values (NaN) are replaced by 'N/A'.
df_filled = data.fillna('N/A')

# Code used to save the head of the resulting DataFrames to CSV:
#df_filtered.head().to_csv('MetObjects_filtered_head.csv', index=False)
#df_filled.head().to_csv('MetObjects_filled_head.csv', index=False)

#print (df_filtered.head())
print (df_filled.head())


  Object Number Is Highlight Is Public Domain Object ID         Department  \
0    1979.486.1        False            False         1  The American Wing   
1    1980.264.5        False            False         2  The American Wing   
2      67.265.9        False            False         3  The American Wing   
3     67.265.10        False            False         4  The American Wing   
4     67.265.11        False            False         5  The American Wing   

  Object Name                         Title Culture Period Dynasty  ...  \
0        Coin  One-dollar Liberty Head Coin     N/A    N/A     N/A  ...   
1        Coin  Ten-dollar Liberty Head Coin     N/A    N/A     N/A  ...   
2        Coin    Two-and-a-Half Dollar Coin     N/A    N/A     N/A  ...   
3        Coin    Two-and-a-Half Dollar Coin     N/A    N/A     N/A  ...   
4        Coin    Two-and-a-Half Dollar Coin     N/A    N/A     N/A  ...   

  Locale Locus Excavation River Classification Rights and Reproduction  \
0    N

Chapter 8

* Create hierarchical index
* Reshape

In [103]:
import pandas as pd

# --- Data Loading and Cleansing Setup ---
file_name = 'BOING-BOING-CANDY-HIERARCHY-2016-SURVEY-Responses.xlsx'
df = pd.read_excel(file_name)

# 1. Clean Column Names
df.columns = df.columns.str.strip()

def clean_candy_col(col):
    if col.startswith('['):
        col = col.replace('[', '').replace(']', '')
    if col.endswith('...'):
        col = col.replace('...', '')
    return col.strip()

df.columns = [clean_candy_col(col) for col in df.columns]

# 1. Convert Categorical Ratings to Numeric Scores (1: JOY, 0: MEH, -1: DESPAIR)
candy_rating_map = {'JOY': 1, 'MEH': 0, 'DESPAIR': -1}
non_candy_cols_first_group = ['Timestamp', 'Are you going actually going trick or treating yourself?', 'Your gender:', 'How old are you?', 'Which country do you live in?', 'Which state, province, county do you live in?']
non_candy_cols_second_group =['Please list any items not included above that give you JOY.', 'Please list any items not included above that give you DESPAIR.', 'Please leave any witty, snarky or thoughtful remarks or comments regarding your choices.', 'Guess the number of mints in my hand.', 'Betty or Veronica?', '"That dress* that went viral a few years back - when I first saw it, it was ________"', 'What is your favourite font?', 'Please estimate the degree(s) of separation you have from the following celebrities [JK Rowling]', 'Please estimate the degree(s) of separation you have from the following celebrities [JJ Abrams]', 'Please estimate the degree(s) of separation you have from the following celebrities [Beyoncé]', 'Please estimate the degree(s) of separation you have from the following celebrities [Bieber]', 'Please estimate the degree(s) of separation you have from the following celebrities [Kevin Bacon]', 'Please estimate the degree(s) of separation you have from the following celebrities [Francis Bacon (1561 - 1626)]', 'Which day do you prefer, Friday or Sunday?', 'Do you eat apples the correct way, East to West (side to side) or do you eat them like a freak of nature, South to North (bottom to top)?', 'When you see the above image of the 4 different websites, which one would you most likely check out (please be honest).', 'York Peppermint Patties Ignore']
candy_cols_start_index = df.columns.get_loc(non_candy_cols_first_group[-1]) + 1
print (candy_cols_start_index)
candy_cols_end_index = df.columns.get_loc(non_candy_cols_second_group[0]) -1
print (candy_cols_end_index)
candy_rating_cols = df.columns[candy_cols_start_index:candy_cols_end_index].tolist()

df[candy_rating_cols] = df[candy_rating_cols].replace(candy_rating_map)



# Create Hierarchical Index (MultiIndex)
# Organize the data using two index levels: Country, then Gender.

# Select relevant columns and create a new DataFrame with a MultiIndex
df_multiindex = df[['Which country do you live in?', 'Your gender:', 'Kit Kat', 'Snickers', 'Candy Corn']].set_index(['Which country do you live in?', 'Your gender:'])

print("DataFrame with Hierarchical Index")
print(df_multiindex.head(10).to_markdown(numalign='left', stralign='left'))



# Reshape (Melt)
# Convert the DataFrame from wide format (candies as columns) to long format
# (one row per observation of a candy rating).

# Use pd.melt to unpivot the candy rating columns
df_melted = df.melt(
    # Columns to keep as identifiers (will be repeated for each row)
    id_vars=['Timestamp', 'Which country do you live in?', 'Your gender:'],
    # Columns to unpivot (i.e., candy rating columns)
    value_vars=candy_rating_cols,
    # New column name for the unpivoted column headers
    var_name='Candy_Name',
    # New column name for the unpivoted values
    value_name='Rating_Score_Numeric'
)

print("\nReshaped DataFrame")
print(df_melted.head().to_markdown(index=False, numalign='left', stralign='left'))

6
105
DataFrame with Hierarchical Index
|                    | Kit Kat   | Snickers   | Candy Corn   |
|:-------------------|:----------|:-----------|:-------------|
| ('Canada', 'Male') | 1         | 1          | -1           |
| ('usa', 'Male')    | 1         | 1          | -1           |
| ('US', 'Female')   | 0         | 1          | 0            |
| ('usa', 'Male')    | 1         | 1          | 1            |
| ('USA', 'Male')    | 1         | 0          | -1           |
| ('USA', 'Male')    | 1         | 1          | 0            |
| ('usa', 'Male')    | 1         | 1          | -1           |
| ('Canada', 'Male') | 1         | 1          | -1           |
| ('USA', 'Male')    | 0         | 1          | -1           |
| ('UK', 'Female')   | 1         | 0          | -1           |

Reshaped DataFrame
| Timestamp                  | Which country do you live in?   | Your gender:   | Candy_Name    | Rating_Score_Numeric   |
|:---------------------------|:------------------------------

  df[candy_rating_cols] = df[candy_rating_cols].replace(candy_rating_map)


Chapter 10

* Grouping with Functions
* Cross Tabs

In [104]:

# Additional Cleansing for  Convert 'How old are you?' to numeric
df['How old are you?'] = pd.to_numeric(df['How old are you?'], errors='coerce')


# Grouping with Functions
# Define the function to categorize age
def age_group(age):
    if pd.isna(age):
        return 'Unknown'
    elif age <= 25:
        return 'Young (<= 25)'
    elif age <= 40:
        return 'Adult (26-40)'
    elif age <= 60:
        return 'Middle-Aged (41-60)'
    else:
        return 'Older (> 60)'

# Group by applying the function to the 'How old are you?' column and calculate the mean
df_age_grouped = df.groupby(df['How old are you?'].apply(age_group))[candy_rating_cols].mean()

# Select and sort results
df_age_grouped_sorted = df_age_grouped.sort_values(by='Kit Kat', ascending=False)[['Kit Kat', 'Snickers', 'Candy Corn']]



# Cross Tabs
# Create the Cross-Tabulation
df_crosstab = pd.crosstab(
    index=df['Your gender:'],
    columns=df['Candy Corn'],
    normalize=True # Show percentages of the total responses
)

# Clean up the output for display
df_crosstab.columns = ['DESPAIR (-1)', 'MEH (0)', 'JOY (1)']
df_crosstab.index.name = 'Gender'
df_crosstab = (df_crosstab * 100).round(2)

print("Grouping by Age Function (Average Kit Kat Rating by Age Group)")
print(df_age_grouped_sorted.to_markdown(numalign='left', stralign='left'))
print("\nCross Tabulation (Gender vs. Candy Corn Rating as % of Total)")
print(df_crosstab.to_markdown(numalign='left', stralign='left'))

Grouping by Age Function (Average Kit Kat Rating by Age Group)
| How old are you?    | Kit Kat   | Snickers   | Candy Corn   |
|:--------------------|:----------|:-----------|:-------------|
| Young (<= 25)       | 0.808824  | 0.492537   | -0.057971    |
| Unknown             | 0.777778  | 0.640625   | -0.0597015   |
| Adult (26-40)       | 0.75514   | 0.711864   | -0.0742115   |
| Middle-Aged (41-60) | 0.731061  | 0.680154   | -0.148571    |
| Older (> 60)        | 0.604651  | 0.642857   | -0.214286    |

Cross Tabulation (Gender vs. Candy Corn Rating as % of Total)
| Gender             | DESPAIR (-1)   | MEH (0)   | JOY (1)   |
|:-------------------|:---------------|:----------|:----------|
| Female             | 12.56          | 9.72      | 10.7      |
| I'd rather not say | 0.65           | 0.81      | 0.73      |
| Male               | 25.2           | 22.2      | 16.05     |
| Other              | 0.41           | 0.49      | 0.49      |


Chapter 11

* Convert between string and date time
* Period Frequency conversions

In [105]:

# Convert between string and date time

# Convert the 'Timestamp' column from string (object) to a datetime object.
# The 'errors='coerce' option is typically used to turn invalid dates into NaT (Not a Time),

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print(df.head())


# Frequencies and date offsets

# Calculate the frequency of survey responses by day.

# Set the converted 'Timestamp' column as the index
df_time = df.set_index('Timestamp')

# Resample the data to a daily frequency ('D') and count the number of responses (rows)
# This uses the 'D' (Day) date offset.
daily_response_count = df_time.resample('D').size()
print(daily_response_count)

                Timestamp  \
0 2016-10-24 05:09:23.033   
1 2016-10-24 05:09:54.798   
2 2016-10-24 05:13:06.734   
3 2016-10-24 05:14:17.192   
4 2016-10-24 05:14:24.625   

  Are you going actually going trick or treating yourself? Your gender:  \
0                                                 No               Male   
1                                                 No               Male   
2                                                 No             Female   
3                                                 No               Male   
4                                                Yes               Male   

   How old are you? Which country do you live in?  \
0              22.0                        Canada   
1              45.0                           usa   
2              48.0                            US   
3              57.0                           usa   
4              42.0                           USA   

  Which state, province, county do you live in?  100 Gr