In [35]:
import pandas as pd
import numpy as np
from datetime import datetime

# Function to create a DataFrame with random normal distributions
def create_df(n=10):
    return pd.DataFrame({
        'a': np.random.normal(size=n),
        'b': np.random.normal(size=n),
        'c': np.random.normal(size=n),
        'd': np.random.normal(size=n)
    })

# Summarize with medians
df = create_df()
summary = df.agg({
    'a': 'median',
    'b': 'median',
    'c': 'median',
    'd': 'median'
})
summary['n'] = len(df)

# Group by and summarize
df['grp'] = np.random.choice([1, 2], size=10)
group_summary = df.groupby('grp').agg('median')

# Function to create a DataFrame with random normal distributions and NAs
def rnorm_na(n, n_na, mean=0, sd=1):
    values = np.random.normal(loc=mean, scale=sd, size=n - n_na).tolist()
    values.extend([np.nan] * n_na)
    np.random.shuffle(values)
    return values

# Create a DataFrame with missing values
df_miss = pd.DataFrame({
    'a': rnorm_na(5, 1),
    'b': rnorm_na(5, 1),
    'c': rnorm_na(5, 2),
    'd': np.random.normal(size=5)
})

# Summarize with medians, handling missing values
miss_summary = df_miss.agg(lambda x: x.median(skipna=True))
miss_summary['n'] = len(df_miss)

# Summarize with medians and count missing values
def summarize_with_missing(df):
    summary_df = pd.DataFrame()
    for col in df.columns:
        summary_df[f"{col}_median"] = [df[col].median(skipna=True)]
        summary_df[f"{col}_n_miss"] = [df[col].isna().sum()]
    summary_df['n'] = len(df)
    return summary_df

miss_summary_with_count = summarize_with_missing(df_miss)

# Replace missing values with 0
df_miss.fillna(0, inplace=True)

# Create absolute value columns
for col in df_miss.columns:
    df_miss[f"{col}_abs"] = df_miss[col].abs()

# Filter rows with any and all NAs
df_miss_any_na = df_miss[df_miss.isna().any(axis=1)]
df_miss_all_na = df_miss[df_miss.isna().all(axis=1)]

# Expand dates to year, month, day
def expand_dates(df, date_col):
    df['year'] = df[date_col].dt.year
    df['month'] = df[date_col].dt.month
    df['day'] = df[date_col].dt.day
    return df

# Example DataFrame with dates
df_date = pd.DataFrame({
    'name': ['Amy', 'Bob'],
    'date': pd.to_datetime(['2009-08-03', '2010-01-16'])
})
print(df_date)

df_date_expanded = expand_dates(df_date, 'date')
df_date_expanded

  name       date
0  Amy 2009-08-03
1  Bob 2010-01-16


Unnamed: 0,name,date,year,month,day
0,Amy,2009-08-03,2009,8,3
1,Bob,2010-01-16,2010,1,16


In [36]:
import pandas as pd

# Load the CSV file into a DataFrame
df_diamonds = pd.read_csv('data/diamonds.csv')

# Define the function to summarize means for numeric columns
def summarize_means(df, group_by):
    # Group by the specified column and calculate mean for numeric columns, ignoring NA values
    summary = df.groupby(group_by).agg(lambda x: x.mean(skipna=True))
    summary['n'] = df.groupby(group_by).size()
    return summary

# Call the function and pass the 'cut' column as the group_by argument
summarize_means(df_diamonds, 'cut')


  summary = df.groupby(group_by).agg(lambda x: x.mean(skipna=True))


Unnamed: 0_level_0,carat,depth,table,price,x,y,z,n
cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Fair,1.046137,64.041677,59.053789,4358.757764,6.246894,6.182652,3.98277,1610
Good,0.849185,62.365879,58.694639,3928.864452,5.838785,5.850744,3.639507,4906
Ideal,0.702837,61.709401,55.951668,3457.54197,5.507451,5.52008,3.401448,21551
Premium,0.891955,61.264673,58.746095,4584.257704,5.973887,5.944879,3.647124,13791
Very Good,0.806381,61.818275,57.95615,3981.759891,5.740696,5.770026,3.559801,12082


In [37]:
# Define the function to summarize means for specified columns
def summarize_means_selected(df, group_by, columns):
    # Select only the specified columns for aggregation
    df_selected = df[group_by + columns]
    # Group by the specified 'group_by' column and calculate mean for the selected columns, ignoring NA values
    summary = df_selected.groupby(group_by).agg(lambda x: x.mean(skipna=True))
    summary['n'] = df_selected.groupby(group_by).size()
    return summary

# Call the function and pass the 'cut' column as the group_by argument and the specified columns
selected_columns = ['carat', 'x', 'y', 'z']
summarize_means_selected(df_diamonds, ['cut'], selected_columns)

Unnamed: 0_level_0,carat,x,y,z,n
cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fair,1.046137,6.246894,6.182652,3.98277,1610
Good,0.849185,5.838785,5.850744,3.639507,4906
Ideal,0.702837,5.507451,5.52008,3.401448,21551
Premium,0.891955,5.973887,5.944879,3.647124,13791
Very Good,0.806381,5.740696,5.770026,3.559801,12082


In [38]:
# Function to summarize both median and mean for specified columns
def summarize_median_mean(df, columns):
    summary = df[columns].agg(['median', 'mean'])
    return summary

# Applying the function to the dataframe with columns a to d
columns_to_summarize = ['a', 'b', 'c', 'd']
summarize_median_mean(df, columns_to_summarize)

Unnamed: 0,a,b,c,d
median,-0.35268,-0.167668,-0.243206,0.067583
mean,-0.433823,0.146498,-0.174421,-0.103702


In [39]:
# Define the function to summarize median for all columns
def summarize_median_all_columns(df, group_by):
    # Group by the specified 'group_by' column and calculate median for all columns, ignoring NA values
    summary = df.groupby(group_by).median()
    return summary

# Call the function and pass the 'grp' column as the group_by argument
summarize_median_all_columns(df_diamonds, 'cut')  # Assuming 'cut' is used as an example for 'grp'



  summary = df.groupby(group_by).median()


Unnamed: 0_level_0,carat,depth,table,price,x,y,z
cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Fair,1.0,65.0,58.0,3282.0,6.175,6.1,3.97
Good,0.82,63.4,58.0,3050.5,5.98,5.99,3.7
Ideal,0.54,61.8,56.0,1810.0,5.25,5.26,3.23
Premium,0.86,61.4,59.0,3185.0,6.11,6.06,3.72
Very Good,0.71,62.1,58.0,2648.0,5.74,5.77,3.56


In [40]:
import numpy as np

# Define the function again now that numpy is imported
def rnorm_na(n, n_na, mean=0, sd=1):
    values = np.random.normal(loc=mean, scale=sd, size=n - n_na).tolist()
    values.extend([np.nan] * n_na)
    np.random.shuffle(values)
    return values

# Create the DataFrame with missing values again
df_miss = pd.DataFrame({
    'a': rnorm_na(5, 1),
    'b': rnorm_na(5, 1),
    'c': rnorm_na(5, 2),
    'd': np.random.normal(size=5)
})

# Summarize with medians, handling missing values
miss_summary_medians = df_miss.agg(lambda x: x.median(skipna=True))
miss_summary_medians['n'] = len(df_miss)

miss_summary_medians


a   -0.434442
b    0.337332
c    0.672295
d    0.082710
n    5.000000
dtype: float64

In [41]:
import pandas as pd
import numpy as np

# Function to create random numbers with specified number of NaNs
# def rnorm_na(n, n_na, mean=0, sd=1):
#     values = np.random.normal(loc=mean, scale=sd, size=n - n_na).tolist()
#     values += [np.nan] * n_na
#     np.random.shuffle(values)
#     return values
# 
# # Create the DataFrame with missing values
# df_miss = pd.DataFrame({
#     'a': rnorm_na(5, 1),
#     'b': rnorm_na(5, 1),
#     'c': rnorm_na(5, 2),
#     'd': np.random.normal(size=5)
# })

# Summarize with medians, skipping NaN values
medians = df_miss.median(skipna=True)
medians['n'] = len(df_miss)

medians


a   -0.434442
b    0.337332
c    0.672295
d    0.082710
n    5.000000
dtype: float64

In [42]:
import pandas as pd
import numpy as np

# Re-defining the function as it might not be retained from the previous context
def rnorm_na(n, n_na, mean=0, sd=1):
    values = np.random.normal(loc=mean, scale=sd, size=n - n_na).tolist()
    values.extend([np.nan] * n_na)
    np.random.shuffle(values)
    return values

# Create a DataFrame with missing values
df_miss = pd.DataFrame({
    'a': rnorm_na(5, 1),
    'b': rnorm_na(5, 1),
    'c': rnorm_na(5, 2),
    'd': np.random.normal(size=5)
})

# First summary: median of each column, excluding NaN values, and total count
summary1 = df_miss.median(skipna=True).to_frame().T
summary1['n'] = len(df_miss)

# Second summary: median of each column, excluding NaN values, count of NaNs for each column, and total count
summary2 = df_miss.agg(['median', 'count'])
summary2.loc['n_miss'] = df_miss.isna().sum()
summary2.loc['n'] = [len(df_miss)] * len(df_miss.columns)

(summary1, summary2)


(          a         b         c         d  n
 0 -0.231731  1.540338 -0.461585  0.265786  5,
                a         b         c         d
 median -0.231731  1.540338 -0.461585  0.265786
 count   4.000000  4.000000  3.000000  5.000000
 n_miss  1.000000  1.000000  2.000000  0.000000
 n       5.000000  5.000000  5.000000  5.000000)

In [43]:
import pandas as pd

# Assuming df_miss is already defined as shown in the previous Python code

# Define the functions to calculate median and count missing values
def median_ignore_nan(col):
    return col.median(skipna=True)

def count_nan(col):
    return col.isna().sum()

# Calculate median and missing count for each column
medians = df_miss.apply(median_ignore_nan).rename(lambda x: f'median_{x}')
missing_counts = df_miss.apply(count_nan).rename(lambda x: f'n_miss_{x}')

# Combine the results into a single DataFrame
summary_df = pd.concat([medians, missing_counts])

# Add the total number of observations
summary_df['n'] = len(df_miss)

# Convert the Series to a DataFrame
summary_df = summary_df.to_frame().transpose()

summary_df


Unnamed: 0,median_a,median_b,median_c,median_d,n_miss_a,n_miss_b,n_miss_c,n_miss_d,n
0,-0.231731,1.540338,-0.461585,0.265786,1.0,1.0,2.0,0.0,5.0


In [44]:
# Replace NaN values with 0 across specified columns
df_filled = df_miss.fillna(0)
df_filled

Unnamed: 0,a,b,c,d
0,0.197643,1.197322,0.0,1.713343
1,0.4139,2.284333,-1.315907,1.041175
2,-0.661104,0.714224,-0.068242,-0.033236
3,0.0,0.0,-0.461585,0.065641
4,-1.180191,1.883354,0.0,0.265786


In [45]:
# Apply the absolute value function to each of the specified columns and create new columns
df_abs = df_miss.assign(**{f"{col}_abs": lambda x: x[col].abs() for col in ['a', 'b', 'c', 'd']})

df_abs

Unnamed: 0,a,b,c,d,a_abs,b_abs,c_abs,d_abs
0,0.197643,1.197322,,1.713343,1.713343,1.713343,1.713343,1.713343
1,0.4139,2.284333,-1.315907,1.041175,1.041175,1.041175,1.041175,1.041175
2,-0.661104,0.714224,-0.068242,-0.033236,0.033236,0.033236,0.033236,0.033236
3,,,-0.461585,0.065641,0.065641,0.065641,0.065641,0.065641
4,-1.180191,1.883354,,0.265786,0.265786,0.265786,0.265786,0.265786


In [46]:
# Filter rows where any of the specified columns 'a' to 'd' contain NaN values
df_filtered = df_miss.loc[df_miss[['a', 'b', 'c', 'd']].isna().any(axis=1)]

df_filtered

Unnamed: 0,a,b,c,d
0,0.197643,1.197322,,1.713343
3,,,-0.461585,0.065641
4,-1.180191,1.883354,,0.265786


In [47]:
# Filter rows where all of the specified columns 'a' to 'd' contain NaN values
df_filtered_all_na = df_miss.loc[df_miss[['a', 'b', 'c', 'd']].isna().all(axis=1)]

df_filtered_all_na

Unnamed: 0,a,b,c,d


In [48]:
import pandas as pd

def expand_dates(df):
    # Iterate over each column to check for datetime type
    for col in df.select_dtypes(include=['datetime64']):
        # Extract year, month, and day to new columns
        df[f'{col}_year'] = df[col].dt.year
        df[f'{col}_month'] = df[col].dt.month
        df[f'{col}_day'] = df[col].dt.day
    return df

# Create the DataFrame similar to the R tibble
df_date = pd.DataFrame({
    'name': ['Amy', 'Bob'],
    'date': pd.to_datetime(['2009-08-03', '2010-01-16'])  # converting string dates to datetime
})

# Apply the expand_dates function
df_date_expanded = expand_dates(df_date)

df_date_expanded

Unnamed: 0,name,date,date_year,date_month,date_day
0,Amy,2009-08-03,2009,8,3
1,Bob,2010-01-16,2010,1,16


In [49]:
import pandas as pd

# Load the diamonds dataset from the uploaded CSV file
diamonds = pd.read_csv('data/diamonds.csv')

# Define the summarize_means function to calculate means of numeric columns
def summarize_means(df):
    # Select numeric columns and calculate mean, ignoring NaN values
    summary = df.select_dtypes(include='number').apply(lambda x: x.mean(skipna=True))
    # Add the count of rows
    summary['n'] = len(df)
    return summary

# Group by 'cut' column and apply the summarize_means function
diamonds_grouped = diamonds.groupby('cut').apply(summarize_means)

diamonds_grouped.reset_index()  # Reset index to make 'cut' a column again

Unnamed: 0,cut,carat,depth,table,price,x,y,z,n
0,Fair,1.046137,64.041677,59.053789,4358.757764,6.246894,6.182652,3.98277,1610.0
1,Good,0.849185,62.365879,58.694639,3928.864452,5.838785,5.850744,3.639507,4906.0
2,Ideal,0.702837,61.709401,55.951668,3457.54197,5.507451,5.52008,3.401448,21551.0
3,Premium,0.891955,61.264673,58.746095,4584.257704,5.973887,5.944879,3.647124,13791.0
4,Very Good,0.806381,61.818275,57.95615,3981.759891,5.740696,5.770026,3.559801,12082.0


In [50]:
# Specify the columns for which to calculate means
summary_columns = ['carat', 'x', 'y', 'z']

# Group by 'cut' column and apply the summarize_means function with specified columns
diamonds_grouped_specific = diamonds.groupby('cut').apply(lambda df: summarize_means(df, summary_columns))

diamonds_grouped_specific.reset_index()  # Reset index to make 'cut' a column again


TypeError: summarize_means() takes 1 positional argument but 2 were given

In [51]:
# Now summarize the DataFrame across specified columns 'a' through 'd'
summary = df[['a', 'b', 'c', 'd']].agg(['median', 'mean']).transpose().reset_index()

summary.columns = ['variable', 'median', 'mean']  # Rename the columns for clarity
summary

Unnamed: 0,variable,median,mean
0,a,-0.35268,-0.433823
1,b,-0.167668,0.146498
2,c,-0.243206,-0.174421
3,d,0.067583,-0.103702


In [52]:
# Pivot the DataFrame from wide to long format
df_long = pd.melt(df, id_vars=['grp'], value_vars=['a', 'b', 'c', 'd'], var_name='name', value_name='value')

# Group by the 'name' column and calculate median and mean
df_long_summary = df_long.groupby('name')['value'].agg(['median', 'mean']).reset_index()

df_long_summary

Unnamed: 0,name,median,mean
0,a,-0.35268,-0.433823
1,b,-0.167668,0.146498
2,c,-0.243206,-0.174421
3,d,0.067583,-0.103702


In [53]:
# Pivot the DataFrame from wide to long format
df_long = pd.melt(df, id_vars=['grp'], value_vars=['a', 'b', 'c', 'd'], var_name='name', value_name='value')

# Group by the 'name' column and calculate median and mean
df_long_summary = df_long.groupby('name')['value'].agg(['median', 'mean']).reset_index()

df_long_summary

Unnamed: 0,name,median,mean
0,a,-0.35268,-0.433823
1,b,-0.167668,0.146498
2,c,-0.243206,-0.174421
3,d,0.067583,-0.103702


In [54]:
# Simulate the creation of a similar DataFrame in Python with paired values and weights
np.random.seed(0)  # For reproducibility

df_paired = pd.DataFrame({
    'a_val': np.random.randn(10),
    'a_wts': np.random.rand(10),
    'b_val': np.random.randn(10),
    'b_wts': np.random.rand(10),
    'c_val': np.random.randn(10),
    'c_wts': np.random.rand(10),
    'd_val': np.random.randn(10),
    'd_wts': np.random.rand(10)
})

# Pivot the DataFrame from wide to long format, handling paired values and weights
df_long = pd.wide_to_long(df_paired.reset_index(), 
                          stubnames=['a', 'b', 'c', 'd'], 
                          i='index', 
                          j='group', 
                          sep='_', 
                          suffix=r'\w+').reset_index()

# Rename columns to match the expected output from R
df_long.rename(columns={'group': 'group', 'a': 'val', 'b': 'wts'}, inplace=True)

df_long.head()  # Display the first few rows of the long DataFrame


Unnamed: 0,index,group,val,wts,c,d
0,0,val,1.764052,1.494079,1.230291,-0.028182
1,1,val,0.400157,-0.205158,1.20238,0.428332
2,2,val,0.978738,0.313068,-0.387327,0.066517
3,3,val,2.240893,-0.854096,-0.302303,0.302472
4,4,val,1.867558,-2.55299,-1.048553,-0.634322
