In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Lift pandas' display limits so frames are rendered in full, with no
# truncation of rows or columns in cell output.
pd.set_option('display.max_columns', None)  # show every column
pd.set_option('display.max_rows', None)  # show every row

In [None]:
# Load the raw CSV into the working frame `df`.
# NOTE(review): absolute path (looks like a Colab runtime location) - confirm
# the file has been uploaded there before running this cell.
df = pd.read_csv(filepath_or_buffer="/content/pub.csv")

In [None]:
# Print a structural summary of the frame: column names, dtypes,
# non-null counts per column and memory usage.
df.info()

In [None]:
# Preview the first 10 rows to sanity-check the parsed values.
df.head(10)

In [None]:
# Count the missing (NaN) entries in each column.
print(df.isna().sum())

In [None]:
# Share of missing entries per column, as a percentage rounded to 2 decimals.
missing_ratio = df.isna().mean().mul(100).round(2)
print(missing_ratio)

In [None]:
# Count fully duplicated rows: rows whose every column matches an earlier row
# (the first occurrence itself is not counted).
df.duplicated().sum()

In [None]:
# Fix the two misspelled column headers so they match the names that the
# column lists in later cells refer to.
df = df.rename(
    columns={
        'ahrsmian': 'ahrsmain',
        'atothur': 'atothrs',
    }
)

In [None]:
# Convert every column that is not explicitly excluded to the pandas
# `category` dtype.

# Columns that are excluded from the categorical conversion but are not
# treated as numeric measures either; they keep the dtype they were loaded with.
columns_id = ['id', 'rec_num', 'survyear', 'survmnth']

# Numeric measures: kept numeric for describe() and the boxplots below.
columns_numeric = ['durunemp', 'wksaway', 'uhrsmain', 'ahrsmain', 'utothrs', 'atothrs', 'hrsaway', 'paidot', 'unpaidot', 'xtrahrs', 'tenure',
                   'prevten', 'hrlyearn', 'durjless', 'finalwt']

# Derived from the two lists above rather than typed out a second time, so the
# exclusion list cannot drift out of sync with `columns_numeric`.
columns_exclude = columns_id + columns_numeric

# Index.difference returns the remaining labels in sorted order; names in
# `columns_exclude` that are absent from `df` are simply ignored.
columns_category = df.columns.difference(columns_exclude)

# A single astype call with a per-column mapping converts all target columns
# at once and leaves every other column (and the column order) unchanged.
df = df.astype({col: 'category' for col in columns_category})

df.info()

In [None]:
# Summary statistics for the numeric measures: count, mean, std, min,
# quartiles and max for each column in `columns_numeric`.
df[columns_numeric].describe()

In [None]:
# Summary statistics for the categorical columns: count, number of unique
# levels, the most frequent level (top) and its frequency (freq).
df[columns_category].describe()

In [None]:
# Draw one boxplot per numeric column to inspect spread and outliers.
for col in columns_numeric:
    # Create the figure and its axes explicitly and draw onto those axes.
    fig, ax = plt.subplots(figsize=(6, 4))
    df[[col]].boxplot(ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_ylabel('Values')
    plt.show()

In [None]:
# Draw one bar chart of level frequencies per categorical column.
for col in columns_category:
    # Create the figure and its axes explicitly and draw onto those axes.
    fig, ax = plt.subplots(figsize=(8, 4))
    df[col].value_counts().plot.bar(ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel('Category')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', labelrotation=90)
    plt.show()

In [None]:
# Restrict the sample to currently employed workers who are public or private
# sector employees (self-employed excluded), work full-time, hold a permanent
# position and live in a major metropolitan area.
# Each condition is named so the combined filter can be read and audited.
is_employed = df['lfsstat'] == 1
is_employee = df['cowmain'].isin([1, 2])
is_full_time = df['ftptmain'] == 1
is_permanent = df['permtemp'] == 1
# NOTE(review): rows with a missing `cma` also satisfy `!= 0` and are kept -
# confirm that this is intended.
in_metro_area = df['cma'] != 0

# .copy() makes the subset an independent frame, so later assignments to it do
# not trigger SettingWithCopyWarning or depend on view/copy semantics.
df_fliterd = df[is_employed & is_employee & is_full_time & is_permanent & in_metro_area].copy()
df_fliterd.info()

In [None]:
# Count the missing (NaN) entries per column in the filtered frame.
print(df_fliterd.isna().sum())

In [None]:
# Columns chosen for the focused analysis (per the provided codebook).
# Categorical features:
selected_catetory = [
    'cma',
    'naics_21',
    'noc_10',
    'noc_43',
    'union',
]
# Numeric features:
selected_number = [
    'ahrsmain',
    'paidot',
    'unpaidot',
    'tenure',
    'hrlyearn',
]

In [None]:
# Draw one boxplot per selected numeric column of the filtered frame.
for col in selected_number:
    # Create the figure and its axes explicitly and draw onto those axes.
    fig, ax = plt.subplots(figsize=(6, 4))
    df_fliterd[[col]].boxplot(ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_ylabel('Values')
    plt.show()

In [None]:
# Draw one bar chart of level frequencies per selected categorical column of
# the filtered frame.
for col in selected_catetory:
    counts = df_fliterd[col].value_counts()
    # On a categorical column value_counts() lists every declared category,
    # including levels that the row filter removed (count 0). Drop those so the
    # chart only shows levels actually present in the filtered data.
    counts = counts[counts > 0]
    if counts.empty:
        # Nothing to draw; an empty bar plot would raise instead of rendering.
        print(f'No observations for {col}; skipping plot.')
        continue

    fig, ax = plt.subplots(figsize=(8, 4))
    counts.plot.bar(ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel('Category')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', labelrotation=90)
    plt.show()
    # Release the figure explicitly so figures do not accumulate across the loop.
    plt.close(fig)