# GroupBy - Pandas

In [72]:
import pandas as pd
import numpy as np

In [73]:
# Read googleplaystore.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="googleplaystore.csv")

In [74]:
# Preview the first five rows to check how the file was parsed.
df.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [75]:
# List the column labels; the grouping keys used below are taken from these.
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [76]:
# Distinct values of the first grouping key.
# NOTE(review): the result includes '1.9', which does not look like a category
# name -- presumably a malformed row; verify before trusting per-category results.
pd.unique(df['Category'])

array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION',
       '1.9'], dtype=object)

## Group by a Single Column

In [77]:
# `.groups` maps each group key to the index labels of the rows in that group.
print(df.groupby(by='Category').groups)

{'1.9': Int64Index([10472], dtype='int64'), 'ART_AND_DESIGN': Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
              11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
              22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
              33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,
              44,   45,   46,   47,   48, 3982, 4193, 4241, 4749, 4755, 4759,
            4764, 5179, 5254, 5258, 6930, 7174, 8679, 8712, 8871, 8888],
           dtype='int64'), 'AUTO_AND_VEHICLES': Int64Index([   49,    50,    51,    52,    53,    54,    55,    56,    57,
               58,    59,    60,    61,    62,    63,    64,    65,    66,
               67,    68,    69,    70,    71,    72,    73,    74,    75,
               76,    77,    78,    79,    80,    81,    82,    83,    84,
               85,    86,    87,    88,    89,    90,    91,    92,    93,
               94,    95,    96,    97,  4120,  4216

## Group by Multiple Columns

In [78]:
# Distinct values of the second grouping key.
# NOTE(review): besides 'Free' and 'Paid' the result contains nan and '0' --
# presumably dirty data; confirm how those rows should be treated.
pd.unique(df['Type'])

array(['Free', 'Paid', nan, '0'], dtype=object)

In [79]:
# Grouping by a list of columns keys each group by a tuple: (Category, Type).
print(df.groupby(by=['Category', 'Type']).groups)

{('ART_AND_DESIGN', 'Free'): Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
              11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
              22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
              33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,
              44,   45,   46,   47,   48, 3982, 4193, 4241, 4749, 5179, 5254,
            5258, 6930, 7174, 8679, 8712, 8871, 8888],
           dtype='int64'), ('AUTO_AND_VEHICLES', 'Free'): Int64Index([   49,    50,    51,    52,    53,    54,    55,    56,    57,
               58,    59,    60,    61,    62,    63,    64,    65,    66,
               67,    68,    69,    70,    71,    72,    73,    74,    75,
               76,    77,    78,    79,    80,    81,    82,    83,    84,
               85,    86,    87,    88,    89,    90,    91,    92,    93,
               94,    95,    96,    97,  4120,  4376,  4622,  5015,  5023,
             5101,

## Iterating through Groups

In [80]:
# Distinct values of the column used as the grouping key in the cells below.
pd.unique(df['Content Rating'])

array(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+',
       'Adults only 18+', 'Unrated', nan], dtype=object)

In [81]:
# Build the GroupBy object once; the following cells iterate over it, select
# from it and aggregate it.
df_grouped_by_content_rating = df.groupby(by='Content Rating')

In [82]:
# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs.
# Each key is printed on its own line, followed by the rows of that group.
for name, group in df_grouped_by_content_rating:
    print(name, group, sep="\n")

Adults only 18+
                                           App Category  Rating Reviews  \
298   Manga Master - Best manga & comic reader   COMICS     4.6   24005   
3043         DraftKings - Daily Fantasy Sports   SPORTS     4.5   50017   
6424                               Manga Books   COMICS     3.8    7326   

                    Size    Installs  Type Price   Content Rating  Genres  \
298                 4.9M    500,000+  Free     0  Adults only 18+  Comics   
3043                 41M  1,000,000+  Free     0  Adults only 18+  Sports   
6424  Varies with device    500,000+  Free     0  Adults only 18+  Comics   

        Last Updated         Current Ver         Android Ver  
298     July 4, 2018             1.1.7.0          4.1 and up  
3043   July 24, 2018            3.21.324          4.4 and up  
6424  August 3, 2018  Varies with device  Varies with device  
Everyone
                                                     App             Category  \
0         Photo Editor & Candy C

## Select a Group

Use the `get_group()` method to retrieve the rows of a single group by its key.

In [83]:
# Fetch the rows of one group by its key; the result is a regular DataFrame.
print(df_grouped_by_content_rating.get_group(name='Adults only 18+'))

                                           App Category  Rating Reviews  \
298   Manga Master - Best manga & comic reader   COMICS     4.6   24005   
3043         DraftKings - Daily Fantasy Sports   SPORTS     4.5   50017   
6424                               Manga Books   COMICS     3.8    7326   

                    Size    Installs  Type Price   Content Rating  Genres  \
298                 4.9M    500,000+  Free     0  Adults only 18+  Comics   
3043                 41M  1,000,000+  Free     0  Adults only 18+  Sports   
6424  Varies with device    500,000+  Free     0  Adults only 18+  Comics   

        Last Updated         Current Ver         Android Ver  
298     July 4, 2018             1.1.7.0          4.1 and up  
3043   July 24, 2018            3.21.324          4.4 and up  
6424  August 3, 2018  Varies with device  Varies with device  


## Aggregations

In [84]:
# Mean rating per content-rating group.
# The aggregation is named by string instead of passing np.mean: pandas already
# substitutes its own NaN-skipping groupby mean for that callable, and newer
# pandas releases emit a FutureWarning when the NumPy function is passed.
# The printed result is unchanged.
print(df_grouped_by_content_rating['Rating'].agg('mean'))

Content Rating
Adults only 18+    4.300000
Everyone           4.186375
Everyone 10+       4.257179
Mature 17+         4.123427
Teen               4.233487
Unrated            4.100000
Name: Rating, dtype: float64


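## Applying Multiple Aggregation Functions at Once

Pass a list of functions to `agg()` to get one result column per function. Note that `np.average` does not skip missing values, so its column is `NaN` for every group that contains at least one missing rating, whereas the pandas reductions (`mean`, `sum`, `median`, `std`) ignore missing values.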

In [85]:
# Several aggregations in one call: each entry of the list becomes a column.
# The pandas reductions are named by string rather than passed as np.mean /
# np.sum / np.median / np.std. pandas replaces those callables with its own
# NaN-skipping groupby methods (its std is the sample std, ddof=1), and newer
# releases warn that the NumPy functions will eventually be called directly,
# which would change the numbers. String names pin the current behaviour.
# np.average has no pandas counterpart and is applied as-is; it does not skip
# NaN, so its column is NaN for any group with a missing rating.
print(df_grouped_by_content_rating['Rating'].agg(['mean', np.average, 'sum', 'median', 'std']))

                     mean  average      sum  median       std
Content Rating                                               
Adults only 18+  4.300000      4.3     12.9     4.5  0.435890
Everyone         4.186375      NaN  31062.9     4.3  0.537377
Everyone 10+     4.257179      NaN   1690.1     4.3  0.367259
Mature 17+       4.123427      NaN   1900.9     4.2  0.505765
Teen             4.233487      NaN   4589.1     4.3  0.391595
Unrated          4.100000      NaN      4.1     4.1       NaN
