In [1]:
# Operations on Groups

In [2]:
## Iteration

In [3]:
import pandas as pd
import os

In [4]:
# Load the pickled DataFrame from the notebook's working directory.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source.
df = pd.read_pickle('data_frame.pickle')

In [5]:
# Take an independent copy of a short run of rows so the per-group output
# below stays small enough to read.
small_df = df.iloc[49980:50019].copy()

# Grouping by artist returns a DataFrameGroupBy object, not a DataFrame.
grouped = small_df.groupby('artist')
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [7]:
# Iterating over the grouped object yields (group key, sub-DataFrame) pairs.
for name, group_df in grouped:
    print(name)
    print(group_df)
    # Stop after the first group: one example is enough to show the structure.
    break

Frost, Sir Terry
                artist            title               medium    year  \
id                                                                     
4704  Frost, Sir Terry        Blue Moon  Lithograph on paper  1952.0   
4705  Frost, Sir Terry      Boat Shapes     Linocut on paper  1952.0   
4706  Frost, Sir Terry      Boat Shapes     Linocut on paper  1954.0   
4707  Frost, Sir Terry      Boat Shapes     Linocut on paper  1954.0   
4708  Frost, Sir Terry            Leeds    Drypoint on paper  1956.0   
4709  Frost, Sir Terry  Camping, Anduze     Etching on paper  1979.0   
4710  Frost, Sir Terry     Umea, Sweden     Etching on paper  1979.0   
4711  Frost, Sir Terry    Self-Portrait     Etching on paper  1980.0   

      acquisitionYear width height units  
id                                        
4704           1983.0   355    273    mm  
4705           1983.0   132    143    mm  
4706           1983.0   131    155    mm  
4707           1983.0   193    267    mm  
4708

In [8]:
# Aggregate
# Mins

In [10]:
# Manual aggregation: visit each artist's rows and print the smallest
# acquisition year found in that group.
for name, group_df in small_df.groupby('artist'):
    min_year = group_df['acquisitionYear'].min()
    print(f"{name}: {min_year}")

Frost, Sir Terry: 1983.0
Phillips, Esq Tom: 1983.0
Wols: 1983.0


In [11]:
# Transform
# Optional setup, equivalent to editing the data by hand:
# create a case where there is no data to infer the missing value from
# small_df.loc[[11838, 16441], 'medium'] = np.nan

In [12]:
def fill_values(series):
    """Fill the missing entries of ``series`` with its most frequent value.

    Returns ``series`` itself, unchanged, when it holds no non-missing values
    to count; otherwise returns the new Series produced by ``fillna``.
    """
    counts = series.value_counts()
    if len(counts) == 0:
        # Nothing to infer a replacement from.
        return series
    # value_counts() orders labels by descending frequency, so the first
    # label is the most common value.
    return series.fillna(counts.index[0])

In [13]:
def transform_df(source_df):
    """Fill missing 'medium' values artist by artist and return a new frame.

    The frame is split on the 'artist' column, each group's 'medium' column
    is passed through ``fill_values``, and the patched groups are stitched
    back together. ``source_df`` itself is not modified.

    Parameters
    ----------
    source_df : pandas.DataFrame
        Must contain 'artist' and 'medium' columns.

    Returns
    -------
    pandas.DataFrame
        The patched groups concatenated in group-key order, so the row order
        may differ from ``source_df``. When there are no groups at all (for
        example an empty input), an empty frame with the same columns.
    """
    group_dfs = []
    for _, group_df in source_df.groupby('artist'):
        # Patch a copy so the caller's data is left untouched.
        filled_df = group_df.copy()
        filled_df.loc[:, 'medium'] = fill_values(group_df['medium'])
        group_dfs.append(filled_df)

    if not group_dfs:
        # pd.concat raises ValueError on an empty list; return an empty
        # frame with the original columns instead of crashing.
        return source_df.iloc[0:0].copy()
    return pd.concat(group_dfs)

In [15]:
# Run the manual, loop-based version. transform_df patches copies of each
# group, so the result is bound to a new name, `filled_df`.
filled_df = transform_df(small_df)

In [16]:
# BUILT-INS
# Transform

In [17]:
# Built-in counterpart of transform_df: transform() applies fill_values to
# each artist's 'medium' values.
grouped_medium = small_df.groupby('artist')['medium']
# NOTE: this assignment overwrites small_df's 'medium' column.
small_df.loc[:, 'medium'] = grouped_medium.transform(fill_values)

In [18]:
import numpy as np

In [19]:
# Finding the minimum using the built-in method

In [20]:
# Select the acquisition-year column within each artist group of the full
# DataFrame. This is still a grouped object - no minimum has been computed
# yet, so it is deliberately not bound to a name that suggests minima.
grouped_acq_year = df.groupby('artist')['acquisitionYear']

In [21]:
# Built-in aggregation: one minimum acquisition year per artist.
min_acquisition_years = grouped_acq_year.min()

In [22]:
# Min

In [32]:
# The same aggregation written as a single chained expression.
df.groupby('artist')['acquisitionYear'].min()

artist
?British School           1927.0
Abakanowicz, Magdalena    2009.0
Abbey, Edwin Austin       1924.0
Abbott, Berenice          2010.0
Abbott, Lemuel Francis    1885.0
                           ...  
Zuloaga, Ignacio          1923.0
Zyw, Aleksander           1962.0
di Suvero, Mark           2004.0
van Elk, Ger              1980.0
Štyrský, Jindrich         2007.0
Name: acquisitionYear, Length: 3336, dtype: float64

In [33]:
# agg() with the aggregation's name gives the same result as .min().
# The string alias is used instead of np.min because pandas 2.1+ emits a
# FutureWarning when a NumPy reducer is passed to a groupby agg().
df.groupby('artist')['acquisitionYear'].agg('min')

artist
?British School           1927.0
Abakanowicz, Magdalena    2009.0
Abbey, Edwin Austin       1924.0
Abbott, Berenice          2010.0
Abbott, Lemuel Francis    1885.0
                           ...  
Zuloaga, Ignacio          1923.0
Zyw, Aleksander           1962.0
di Suvero, Mark           2004.0
van Elk, Ger              1980.0
Štyrský, Jindrich         2007.0
Name: acquisitionYear, Length: 3336, dtype: float64

In [34]:
# Filter

In [35]:
# Group the full DataFrame by title.
grouped_titles = df.groupby('title')

# Count the rows in each title group, then order the counts from largest
# to smallest.
title_counts = grouped_titles.size()
title_counts = title_counts.sort_values(ascending=False)

In [37]:
def condition(x):
    """Return True when a title group contains more than one row."""
    return len(x.index) > 1


# Keep only the rows whose title occurs more than once, ordered by title.
# sort_values() returns a new frame, which avoids mutating the filter()
# result in place.
dup_titles_df = grouped_titles.filter(condition).sort_values('title')