In [1]:
from datetime import datetime

import pandas as pd
import numpy as np

In [5]:
# Read the raw sales file and discard rows that are exact duplicates.
df = (
    pd.read_csv('sales.csv')
    .drop_duplicates()
)

In [10]:
# Per-team count of non-null values in every other column.
df.groupby(by='team').count()

Unnamed: 0_level_0,name,sales
team,Unnamed: 1_level_1,Unnamed: 2_level_1
blue,4,4
green,2,2
red,3,3


In [11]:
# Frequency of each team label, most frequent first.
df.loc[:, 'team'].value_counts()

blue     4
red      3
green    2
Name: team, dtype: int64

In [12]:
# Number of rows in each team, returned as a Series indexed by team.
df.groupby(by='team').size()

team
blue     4
green    2
red      3
dtype: int64

In [12]:
# Window-style ("named") aggregates: one value per row, computed within each group.
# Pattern: .groupby('col')['col'] to select a single column, then .transform('agg_operation').
#
# NOTE: this cell rebinds `df` (it sorts, then filters down to NAMES_TO_KEEP), so it is
# not idempotent. Re-run the cell that loads `df` before re-running this one; otherwise
# every aggregate below is computed on the already-filtered rows.
AFTER_TAX_RATE = 0.8                 # multiplier applied to total sales to get the after-tax figure
NAMES_TO_KEEP = ['abby', 'donny']    # names retained in the final frame

df['team_count_agg'] = df.groupby('team')['team'].transform('count')

# Same result as above: with the default dropna=True the grouping key is never
# null inside its own group, so len (all rows) and 'count' (non-null rows) agree.
df['team_count_agg2'] = df.groupby('team')['team'].transform(len)

df['team_sales_agg'] = df.groupby('team')['sales'].transform('sum')

# transform also accepts a callable that returns a like-indexed result.
df['team_sales_agg_plus1'] = df.groupby('team')['sales'].transform(lambda x: x + 1)

# Aggregate over the whole dataset: a scalar broadcasts to every row, so no
# throw-away grouping column is needed.
df['total_sales_agg'] = df['sales'].sum()

# Rows whose team total exceeds 200, largest team total first.
df_sales = (
    df.query('team_sales_agg > 200')
    .sort_values('team_sales_agg', ascending=False)
)

# Rows whose name contains the substring 'tom'; na=False treats a missing name
# as "no match" instead of producing a mask that cannot be used for indexing.
certain_name = df[df['name'].str.contains('tom', na=False)]
# Rows whose name is exactly 'tom'.
certain_name2 = df.query('name == "tom"')

df['total_sales_after_tax'] = df['total_sales_agg'] * AFTER_TAX_RATE

# Capture the clock once so scrape_date and scrape_ts always describe the same instant.
scraped_at = datetime.now()
df['scrape_date'] = scraped_at.date()
df['scrape_ts'] = scraped_at

# The cumulative / lag columns below depend on this row order.
df = df.sort_values(by=['team', 'name'])
df['cumulative_sales'] = df['sales'].cumsum()                       # running total over all rows
df['cumulative_team_sales'] = df.groupby('team')['sales'].cumsum()  # running total within each team
df['prev_sales'] = df.groupby('team')['sales'].shift(1)             # SQL LAG within each team
# Each row's sales as a fraction of the team's running total up to and including that row.
df['pct_sales'] = (df['sales'] / df['cumulative_team_sales']).round(3)

# Keep only the selected names. The explicit .copy() makes the filtered frame
# independent of the original, so adding a column to it does not trigger
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.loc[df['name'].isin(NAMES_TO_KEEP)].copy()

# Comma-separated list of every name left in the frame, repeated on each row.
final_names = ', '.join(df['name'].tolist())
df['final_names'] = final_names
df

Unnamed: 0,name,team,sales,team_count_agg,team_count_agg2,team_sales_agg,team_sales_agg_plus1,total_sales_agg,total_sales_after_tax,scrape_date,scrape_ts,cumulative_sales,cumulative_team_sales,prev_sales,pct_sales,final_names
1,abby,blue,200,2,2,350,201,350,280.0,2022-07-23,2022-07-23 14:27:37.602077,200,200,,1.0,"abby, donny"
8,donny,blue,150,2,2,350,151,350,280.0,2022-07-23,2022-07-23 14:27:37.602077,350,350,200.0,0.429,"abby, donny"
