In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO

# Two small month vectors; x2 deliberately contains the misspelling "Jam"
x1 = ["Dec", "Apr", "Jan", "Mar"]
x2 = ["Dec", "Apr", "Jam", "Mar"]

# Plain string sorting is alphabetical, not calendar order
x1_sorted = list(x1)
x1_sorted.sort()

# Valid month labels, in calendar order
month_levels = [
    "Jan", "Feb", "Mar",
    "Apr", "May", "Jun",
    "Jul", "Aug", "Sep",
    "Oct", "Nov", "Dec",
]

# One ordered categorical dtype shared by everything below
month_dtype = pd.CategoricalDtype(categories=month_levels, ordered=True)

# With explicit, ordered categories the values sort in calendar order
y1 = pd.Categorical(x1, dtype=month_dtype)
y1_sorted = y1.sort_values()

# The typo in x2 is NOT repaired here: "Jam" is not one of the categories,
# so it is silently converted to NaN
y2 = pd.Categorical(x2, dtype=month_dtype)

# Parse a small inline CSV and convert its 'month' column to the same
# ordered categorical dtype
csv_data = """
month,value
Jan,12
Feb,56
Mar,12
"""
df = pd.read_csv(StringIO(csv_data)).assign(
    month=lambda frame: frame['month'].astype(month_dtype)
)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the survey data into a pandas DataFrame
# NOTE(review): recent pandas versions appear to treat the literal string
# "None" as missing by default; if this file stores a 'None' level (e.g. in
# 'relig'), it would be read as NaN — confirm against the data and the
# installed pandas version.
gss_cat = pd.read_csv('data/gss_cat.csv')

# Equivalent to 'count(race)': one row per race with its frequency
race_count = (
    gss_cat['race']
    .value_counts()
    .rename_axis('race')
    .reset_index(name='count')
)

# Equivalent to 'group_by' and 'summarize': mean TV hours and row count per religion
relig_summary = gss_cat.groupby('relig').agg(
    # The built-in 'mean' reducer already skips NaN, so no lambda wrapper is needed
    tvhours=('tvhours', 'mean'),
    # 'size' counts NaN values as well, 'count' does not
    n=('tvhours', 'size'),
).reset_index()

# Plotting
sns.scatterplot(data=relig_summary, x='tvhours', y='relig')
plt.show()

# Reordered plot: rank the religions by mean TV hours and use that ranking
# as the (ordered) category order of a new column
relig_by_tvhours = relig_summary.sort_values('tvhours')['relig']
relig_summary['relig_reordered'] = (
    relig_summary['relig']
    .astype('category')
    .cat.reorder_categories(relig_by_tvhours, ordered=True)
)

sns.scatterplot(y='relig_reordered', x='tvhours', data=relig_summary)
plt.show()


In [None]:
# Build an ordered categorical dtype whose categories are the religions
# sorted by 'tvhours', then cast 'relig' to it
relig_tvhours_dtype = pd.CategoricalDtype(
    categories=relig_summary.sort_values('tvhours')['relig'],
    ordered=True,
)
relig_summary['relig_reordered'] = relig_summary['relig'].astype(relig_tvhours_dtype)

# Scatter plot using the reordered 'relig' on the y-axis
sns.scatterplot(y='relig_reordered', x='tvhours', data=relig_summary)
plt.show()

# Group by 'rincome' and summarize: mean age and number of rows per income bracket
rincome_summary = gss_cat.groupby('rincome').agg(
    # The built-in 'mean' reducer already skips NaN, so no lambda wrapper is needed
    age=('age', 'mean'),
    # 'size' counts every row in the group; 'count' would silently leave out
    # rows whose age is missing. This matches how 'n' is computed for
    # relig_summary.
    n=('age', 'size'),
).reset_index()

# Reorder 'rincome' based on 'age'
rincome_summary['rincome_reordered'] = pd.Categorical(
    rincome_summary['rincome'],
    categories=rincome_summary.sort_values('age')['rincome'],
    ordered=True
)

# Plot with 'rincome' reordered by 'age'
sns.scatterplot(data=rincome_summary, x='age', y='rincome_reordered')
plt.show()

# Relevel 'rincome': put 'Not applicable' first and keep every other level
# in its existing order, then plot against mean age
front_level = 'Not applicable'
other_levels = rincome_summary.loc[
    rincome_summary['rincome'] != front_level, 'rincome'
].tolist()

rincome_summary['rincome_relevel'] = pd.Categorical(
    rincome_summary['rincome'],
    categories=[front_level, *other_levels],
    ordered=True,
)

sns.scatterplot(y='rincome_relevel', x='age', data=rincome_summary)
plt.show()

# Drop rows with a missing 'age', then count rows per ('age', 'marital') pair
by_age = (
    gss_cat
    .dropna(subset=['age'])
    .groupby(['age', 'marital'])
    .size()
    .reset_index(name='n')
)

# Share of each marital status within its age: n divided by the age's total
age_totals = by_age.groupby('age')['n'].transform('sum')
by_age['prop'] = by_age['n'] / age_totals

# Line plot of the proportions over 'age', one coloured line per 'marital' status
plt.figure(figsize=(10, 6))
sns.lineplot(
    data=by_age,
    x='age',
    y='prop',
    hue='marital',
    palette="Set1",
    linewidth=1,
)
plt.show()

# TODO: the second line plot (with 'marital' reordered within each 'age' by
# 'prop') is not implemented here; it would likely need sorting by 'age' and
# 'prop' within each group before plotting.

# Bar plot of marital status with bars running from least to most frequent
marital_ordered = gss_cat['marital'].value_counts(ascending=True).index
marital_dtype = pd.CategoricalDtype(categories=marital_ordered, ordered=True)

# Stored as a new column on gss_cat so the plot can pick up the category order
gss_cat['marital_ordered'] = gss_cat['marital'].astype(marital_dtype)

sns.countplot(x='marital_ordered', data=gss_cat)
plt.xticks(rotation=90)
plt.show()


In [20]:
# Frequency table of the raw 'partyid' labels
partyid_count = (
    gss_cat['partyid']
    .value_counts()
    .rename_axis('partyid')
    .reset_index(name='count')
)

# Old label -> clearer, consistently worded label; labels not listed here
# are left unchanged by replace()
recode_dict = {
    "Strong republican": "Republican, strong",
    "Not str republican": "Republican, weak",
    "Ind,near rep": "Independent, near rep",
    "Ind,near dem": "Independent, near dem",
    "Not str democrat": "Democrat, weak",
    "Strong democrat": "Democrat, strong",
}

print(recode_dict)

# Apply the recoding and tabulate the new labels
gss_cat['partyid_recode'] = gss_cat['partyid'].replace(recode_dict)
recode_count = (
    gss_cat['partyid_recode']
    .value_counts()
    .rename_axis('partyid_recode')
    .reset_index(name='count')
)

# Extend the recoding so the three leftover labels all become "Other".
# Note: this adds the entries to recode_dict itself (in place).
for leftover_label in ("No answer", "Don't know", "Other party"):
    recode_dict[leftover_label] = "Other"

gss_cat['partyid_recode_collapse'] = gss_cat['partyid'].replace(recode_dict)

# Grouping used to collapse 'partyid': new label -> list of original labels
collapse_categories = {
    "other": ["No answer", "Don't know", "Other party"],
    "rep": ["Strong republican", "Not str republican"],
    "ind": ["Ind,near rep", "Independent", "Ind,near dem"],
    "dem": ["Not str democrat", "Strong democrat"],
}

# Map a single label onto its collapsed group
def collapse_partyid(row, collapse_dict):
    """Return the collapsed label for ``row``.

    Args:
        row: A single original label.
        collapse_dict: Mapping of new label -> collection of original labels
            that should be replaced by it.

    Returns:
        The key of the first entry (in the mapping's iteration order) whose
        collection contains ``row``; ``row`` itself when no entry contains it.
    """
    matching_groups = (
        group_label
        for group_label, members in collapse_dict.items()
        if row in members
    )
    # Fall back to the input label when no group lists it
    return next(matching_groups, row)

# Collapse every 'partyid' label via collapse_partyid, then tabulate the result
gss_cat['partyid_collapsed'] = gss_cat['partyid'].apply(
    collapse_partyid, args=(collapse_categories,)
)
collapsed_count = (
    gss_cat['partyid_collapsed']
    .value_counts()
    .rename_axis('partyid_collapsed')
    .reset_index(name='count')
)

# Tally each 'relig' level once and broadcast the tally back onto the rows.
# Calling value_counts() inside a per-row apply() re-scanned the whole column
# for every row, and indexing the tally with a missing value raised KeyError
# because value_counts() drops NaN.
relig_values = gss_cat['relig'].astype(object)
relig_counts = relig_values.value_counts()
relig_row_counts = relig_values.map(relig_counts)  # NaN where 'relig' is missing

# Lump low frequency 'relig' categories
# The threshold is the smallest level count, so only the level(s) tied for
# least frequent are relabelled 'Other'.
lump_thresh = relig_counts.min()  # You can adjust this threshold
# Comparisons against NaN are False, so missing values stay missing
gss_cat['relig_lump_lowfreq'] = relig_values.mask(relig_row_counts <= lump_thresh, 'Other')
lump_lowfreq_count = gss_cat['relig_lump_lowfreq'].value_counts().reset_index()
lump_lowfreq_count.columns = ['relig_lump_lowfreq', 'count']

# Lump 'relig' categories with n less than 10
lump_n_thresh = 10
gss_cat['relig_lump_n'] = relig_values.mask(relig_row_counts < lump_n_thresh, 'Other')
lump_n_count = gss_cat['relig_lump_n'].value_counts().sort_values(ascending=False).reset_index()
lump_n_count.columns = ['relig_lump_n', 'count']

# Ordered categorical whose categories are inferred from the values themselves
ordered_dtype = pd.CategoricalDtype(ordered=True)
categories_ordered = pd.Categorical(["a", "b", "c"], dtype=ordered_dtype)

print(categories_ordered)


{'Strong republican': 'Republican, strong', 'Not str republican': 'Republican, weak', 'Ind,near rep': 'Independent, near rep', 'Ind,near dem': 'Independent, near dem', 'Not str democrat': 'Democrat, weak', 'Strong democrat': 'Democrat, strong'}
['a', 'b', 'c']
Categories (3, object): ['a' < 'b' < 'c']
