## GroupBy
SQL語法中的Group by 是欄位內的資料若有不只一筆名稱相同的資料的話，就會把它們作為群組 (Group)。然後通過使用GROUP BY,可以搭配聚合函數 (aggregation function) AVG()、COUNT()、MAX()、MIN()、SUM() 等這些內建函數對資料起作用，每一個群組都會傳回一個資料列。若沒有使用 GROUP BY，聚合函數針對一個 SELECT 查詢，只會返回一個彙總值。

groupby() function
Pandas DataFrame groupby() function is used to group rows that have the same values. It's mostly used with aggregate functions (count, sum, min, max, mean) to get the statistics based on one or more column values. Pandas gropuby() function is very similar to the SQL group by statement.

In [1]:
#pd.DataFrame.groupby?

In [2]:
import numpy as np
import pandas as pd
data = pd.DataFrame({'Name':['A','A','A','B','B','B'],
                    'F1':[11,12,13,14,15,16],
                    'F2':[21,22,23,24,25,26],})
data

Unnamed: 0,Name,F1,F2
0,A,11,21
1,A,12,22
2,A,13,23
3,B,14,24
4,B,15,25
5,B,16,26


In [3]:
data.groupby('Name')['F1'].aggregate([np.sum])  #aggregate #apply #filter #transform .....

Unnamed: 0_level_0,sum
Name,Unnamed: 1_level_1
A,36
B,45


In [4]:
marks = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a', 'a'],
                      'key2' : ['one', 'two', 'one', 'two', 'one', 'one'],
                     'data1' : np.arange(10, 16),
                     'data2' : np.arange(16, 22)})
marks

Unnamed: 0,key1,key2,data1,data2
0,a,one,10,16
1,a,two,11,17
2,b,one,12,18
3,b,two,13,19
4,a,one,14,20
5,a,one,15,21


In [5]:
grouped = marks['data1'].groupby(by=marks['key1']).mean()
grouped

key1
a    12.5
b    12.5
Name: data1, dtype: float64

In [6]:
print('a mean:', (10 + 11 + 14 + 15) / 4)
print('b mean:', (12 + 13)/2)

a mean: 12.5
b mean: 12.5


In [7]:
group_tk = marks['data1'].groupby(by=[marks['key1'], marks['key2']])
print(marks)
print()
print('result:', group_tk.mean())

  key1 key2  data1  data2
0    a  one     10     16
1    a  two     11     17
2    b  one     12     18
3    b  two     13     19
4    a  one     14     20
5    a  one     15     21

result: key1  key2
a     one     13.0
      two     11.0
b     one     12.0
      two     13.0
Name: data1, dtype: float64


In [8]:
group_tk.mean().unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,13.0,11.0
b,12.0,13.0


In [9]:
feedb = np.array(['good', 'avg', 'good', 'avg', 'good', 'avg'])
actual = np.array(['good', 'med', 'good', 'med', 'good', 'med'])
mean1 = marks['data1'].groupby(by=[feedb, actual]).mean()
mean1

avg   med     13.0
good  good    12.0
Name: data1, dtype: float64

In [10]:
mean_df = marks.groupby(by=['key1']).mean()
mean_df

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,12.5,18.5
b,12.5,18.5


In [11]:
print(marks)
print()
marks.groupby(by=['key1', 'key2']).size()

  key1 key2  data1  data2
0    a  one     10     16
1    a  two     11     17
2    b  one     12     18
3    b  two     13     19
4    a  one     14     20
5    a  one     15     21



key1  key2
a     one     3
      two     1
b     one     1
      two     1
dtype: int64

## Iterate

In [12]:
for key_name, group_name in marks.groupby(by='key1'):
    print(key_name)
    print(group_name)
    print(type(key_name))
    print(type(group_name))

a
  key1 key2  data1  data2
0    a  one     10     16
1    a  two     11     17
4    a  one     14     20
5    a  one     15     21
<class 'str'>
<class 'pandas.core.frame.DataFrame'>
b
  key1 key2  data1  data2
2    b  one     12     18
3    b  two     13     19
<class 'str'>
<class 'pandas.core.frame.DataFrame'>


In [13]:
grouped = marks.groupby(marks.dtypes, axis=1)
for datatype, group in grouped:
    print(datatype)
    print(group)

int64
   data1  data2
0     10     16
1     11     17
2     12     18
3     13     19
4     14     20
5     15     21
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
5    a  one


In [14]:
print(marks['data1'])
print(marks[['data1', 'data2']]) 

0    10
1    11
2    12
3    13
4    14
5    15
Name: data1, dtype: int64
   data1  data2
0     10     16
1     11     17
2     12     18
3     13     19
4     14     20
5     15     21


In [15]:
marks['data1'].groupby(by=marks['key1']).sum()

key1
a    50
b    25
Name: data1, dtype: int64

In [16]:
marks[['data1', 'data2']].groupby(by=marks['key2']).sum()

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
one,51,75
two,24,36


## Project - vgsales

In [17]:
import numpy as np
import pandas as pd
data = pd.read_excel("input/vgsales.xlsx")
data.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1.0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2.0,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3.0,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4.0,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5.0,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [18]:
data.groupby('Genre').size()                 #預設會忽略Nan
#data.groupby('Genre',dropna=False).size()   #加上 dropna=False 來強制不忽略
#data.groupby(['Year','Platform']).size()

Genre
Action          3316
Adventure       1286
Fighting         848
Misc            1739
Platform         886
Puzzle           582
Racing          1249
Role-Playing    1488
Shooter         1310
Simulation       867
Sports          2346
Strategy         681
dtype: int64

In [19]:
#data.groupby(['Year','Platform']).sum().head()
data.groupby(['Year','Platform']).sum().reset_index() #表頭索引

Unnamed: 0,Year,Platform,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1980.0,2600.0,29826.0,10.59,0.67,0.00,0.12,11.38
1,1981.0,2600.0,190488.0,33.40,1.96,0.00,0.32,35.77
2,1982.0,2600.0,149186.0,26.92,1.65,0.00,0.31,28.86
3,1983.0,2600.0,49355.0,5.44,0.34,0.00,0.06,5.83
4,1983.0,NES,7404.0,2.32,0.46,8.10,0.08,10.96
...,...,...,...,...,...,...,...,...
236,2016.0,X360,93657.0,0.36,0.40,0.00,0.07,0.83
237,2016.0,XOne,584548.0,6.69,4.63,0.01,1.05,12.37
238,2017.0,PS4,14393.0,0.00,0.00,0.03,0.00,0.03
239,2017.0,PSV,32685.0,0.00,0.00,0.02,0.00,0.02


In [20]:
lists = ['NA_Sales','EU_Sales','JP_Sales','Other_Sales']   #獨立設立 metrics
data.groupby(['Year','Platform'])[lists].sum().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,NA_Sales,EU_Sales,JP_Sales,Other_Sales
Year,Platform,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980.0,2600.0,10.59,0.67,0.0,0.12
1981.0,2600.0,33.4,1.96,0.0,0.32
1982.0,2600.0,26.92,1.65,0.0,0.31
1983.0,2600.0,5.44,0.34,0.0,0.06
1983.0,NES,2.32,0.46,8.1,0.08


In [21]:
data.groupby('Platform').get_group('Wii')
#data[data['Platform'] == 'Wii']

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1.0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
2,3.0,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4.0,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
7,8.0,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.20,2.93,2.85,29.02
8,9.0,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.70,2.26,28.62
...,...,...,...,...,...,...,...,...,...,...,...
16517,16520.0,Teenage Mutant Ninja Turtles,Wii,2007.0,Action,Konami Digital Entertainment,0.00,0.01,0.00,0.00,0.01
16552,16555.0,Mahou Sensei Negima!? Neo-Pactio Fight!!,Wii,2007.0,Fighting,Marvelous Interactive,0.00,0.00,0.01,0.00,0.01
16573,16576.0,Mini Desktop Racing,Wii,2007.0,Racing,Popcorn Arcade,0.01,0.00,0.00,0.00,0.01
16574,16577.0,Yattaman Wii: BikkuriDokkiri Machine de Mou Ra...,Wii,2008.0,Racing,Takara Tomy,0.00,0.00,0.01,0.00,0.01


## iloc

In [22]:
data.groupby('Year').size().head()   #1980年有 9個資料

Year
1980.0     9
1981.0    46
1982.0    36
1983.0    17
1984.0    14
dtype: int64

In [23]:
x = data.groupby('Year').groups  # 1980: [258, 544, 1766, 1969, 2669, 4025, 5366, 6317, 6896]
x

{1980.0: [258, 544, 1766, 1969, 2669, 4025, 5366, 6317, 6896], 1981.0: [239, 734, 766, 864, 1106, 1153, 1306, 1429, 1556, 1848, 1965, 2143, 2232, 2516, 2596, 2664, 2806, 2940, 3044, 3403, 3408, 4015, 4139, 4346, 4402, 4517, 4784, 4908, 4967, 5127, 5246, 5389, 5400, 5564, 5800, 6021, 6537, 6866, 6940, 7148, 7150, 7184, 7371, 7904, 8459, 9495], 1982.0: [89, 1115, 1163, 1705, 1817, 1851, 1905, 2406, 2629, 2660, 2843, 3079, 3215, 3248, 3266, 3447, 3643, 3716, 3748, 3966, 4013, 4016, 4801, 4894, 5064, 5381, 5395, 5658, 5773, 6145, 6265, 6728, 7149, 8458, 8758, 9106], 1983.0: [421, 700, 763, 1508, 1809, 1839, 1861, 2652, 2672, 3953, 4096, 4267, 4780, 5382, 5559, 6007, 8457], 1984.0: [9, 277, 298, 754, 938, 1233, 1234, 1324, 1560, 1638, 1857, 2513, 2955, 6296], 1985.0: [1, 374, 868, 1260, 1490, 1973, 2020, 2059, 2259, 3907, 4337, 4727, 14335, 15867], 1986.0: [127, 452, 548, 573, 611, 737, 867, 992, 1117, 1262, 1427, 1623, 1675, 1676, 1772, 1791, 1971, 2158, 3532, 5082, 5956], 1987.0: [251, 46

In [24]:
data.iloc[x[1980.0]]

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
258,259.0,Asteroids,2600.0,1980.0,Shooter,Atari,4.0,0.26,0.0,0.05,4.31
544,545.0,Missile Command,2600.0,1980.0,Shooter,Atari,2.56,0.17,0.0,0.03,2.76
1766,1768.0,Kaboom!,2600.0,1980.0,Misc,Activision,1.07,0.07,0.0,0.01,1.15
1969,1971.0,Defender,2600.0,1980.0,Misc,Atari,0.99,0.05,0.0,0.01,1.05
2669,2671.0,Boxing,2600.0,1980.0,Fighting,Activision,0.72,0.04,0.0,0.01,0.77
4025,4027.0,Ice Hockey,2600.0,1980.0,Sports,Activision,0.46,0.03,0.0,0.01,0.49
5366,5368.0,Freeway,2600.0,1980.0,Action,Activision,0.32,0.02,0.0,0.0,0.34
6317,6319.0,Bridge,2600.0,1980.0,Misc,Activision,0.25,0.02,0.0,0.0,0.27
6896,6898.0,Checkers,2600.0,1980.0,Misc,Atari,0.22,0.01,0.0,0.0,0.24


## Aggregation : agg()

聚合函數 (aggregation function) 也就是 AVG()、COUNT()、MAX()、MIN()、SUM() 等這些內建函數對資料起作用 Aggregating functions are the ones that reduce the dimension of the returned objects.Some common aggregating functions are tabulated below:
<table class="colwidths-given table">
<colgroup>
<col style="width: 20%">
<col style="width: 80%">
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Function</p></th>
<th class="head"><p>Description</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">mean()</span></code></p></td>
<td><p>Compute mean of groups</p></td>
</tr>
<tr class="row-odd"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">sum()</span></code></p></td>
<td><p>Compute sum of group values</p></td>
</tr>
<tr class="row-even"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">size()</span></code></p></td>
<td><p>Compute group sizes</p></td>
</tr>
<tr class="row-odd"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">count()</span></code></p></td>
<td><p>Compute count of group</p></td>
</tr>
<tr class="row-even"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">std()</span></code></p></td>
<td><p>Standard deviation of groups</p></td>
</tr>
<tr class="row-odd"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">var()</span></code></p></td>
<td><p>Compute variance of groups</p></td>
</tr>
<tr class="row-even"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">sem()</span></code></p></td>
<td><p>Standard error of the mean of groups</p></td>
</tr>
<tr class="row-odd"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">describe()</span></code></p></td>
<td><p>Generates descriptive statistics</p></td>
</tr>
<tr class="row-even"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">first()</span></code></p></td>
<td><p>Compute first of group values</p></td>
</tr>
<tr class="row-odd"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">last()</span></code></p></td>
<td><p>Compute last of group values</p></td>
</tr>
<tr class="row-even"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">nth()</span></code></p></td>
<td><p>Take nth value, or a subset if n is a list</p></td>
</tr>
<tr class="row-odd"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">min()</span></code></p></td>
<td><p>Compute min of group values</p></td>
</tr>
<tr class="row-even"><td><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">max()</span></code></p></td>
<td><p>Compute max of group values</p></td>
</tr>
</tbody>
</table>

In [25]:
data.groupby('Platform').sum().head()

Unnamed: 0_level_0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
Platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2600.0,585665.0,229928.0,90.6,5.47,0.0,0.91,97.08
3DO,43118.0,5984.0,0.0,0.0,0.1,0.0,0.1
3DS,4662644.0,1006531.0,78.87,58.52,97.35,12.63,247.46
DC,456114.0,103997.0,5.43,1.69,8.56,0.27,15.97
DS,20845831.0,4283493.0,390.71,194.65,175.57,60.53,822.49


In [26]:
## 寫法1
data.groupby('Platform')[['NA_Sales','Global_Sales']].aggregate([np.sum,
                                                                 np.max,
                                                                 np.min]).head()

Unnamed: 0_level_0,NA_Sales,NA_Sales,NA_Sales,Global_Sales,Global_Sales,Global_Sales
Unnamed: 0_level_1,sum,amax,amin,sum,amax,amin
Platform,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2600.0,90.6,7.28,0.07,97.08,7.81,0.07
3DO,0.0,0.0,0.0,0.1,0.06,0.02
3DS,78.87,5.17,0.0,247.46,14.35,0.01
DC,5.43,1.26,0.0,15.97,2.42,0.02
DS,390.71,11.38,0.0,822.49,30.01,0.01


In [27]:
## 寫法2
def sum1(x): return np.sum(x)
def max1(x): return np.max(x)   
def min1(x): return np.min(x)   
data.groupby('Platform')[['NA_Sales','Global_Sales']].aggregate([sum1,
                                                                 max1,
                                                                 min1]).head()
#return np.max(x) - np.min(x)  #return np.median(x) - np.mean(x)

Unnamed: 0_level_0,NA_Sales,NA_Sales,NA_Sales,Global_Sales,Global_Sales,Global_Sales
Unnamed: 0_level_1,sum1,max1,min1,sum1,max1,min1
Platform,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2600.0,90.6,7.28,0.07,97.08,7.81,0.07
3DO,0.0,0.0,0.0,0.1,0.06,0.02
3DS,78.87,5.17,0.0,247.46,14.35,0.01
DC,5.43,1.26,0.0,15.97,2.42,0.02
DS,390.71,11.38,0.0,822.49,30.01,0.01


In [28]:
# data.groupby('Platform')[['NA_Sales','Global_Sales']].aggregate(
#     [lambda x: x.max() - x.min(),
#     lambda x: x.median() - x.mean()]
# )

## Transform ＆ Apply & Filter

In [29]:
data.groupby('Platform')['Global_Sales'].transform(sum)

0         926.71
1         251.07
2         926.71
3         926.71
4         255.45
          ...   
16593     318.50
16594     199.36
16595    1255.64
16596     822.49
16597     318.50
Name: Global_Sales, Length: 16598, dtype: float64

In [30]:
data2 = data.copy()

In [31]:
#方法 1
global_sum = data2.groupby('Platform')['Global_Sales'].transform(np.sum)  # cal Global_Sales.sum
data2['銷售佔比'] = data2['Global_Sales'] / global_sum 
data2.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,銷售佔比
0,1.0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,0.089284
1,2.0,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,0.160274
2,3.0,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,0.038653
3,4.0,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,0.03561
4,5.0,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,0.122803


In [32]:
data2 = data.copy()

In [33]:
#方法 2 使用apply來達成等同上面的功能
def rate(x):
    x['銷售佔比'] = x['Global_Sales'] / x['Global_Sales'].sum()
    return x
data2.groupby('Platform').apply(rate).head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,銷售佔比
0,1.0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,0.089284
1,2.0,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,0.160274
2,3.0,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,0.038653
3,4.0,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,0.03561
4,5.0,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,0.122803


In [34]:
# filter : 列出該平台總銷售額大於 1000 百萬的遊戲
index = data.groupby('Platform')['Global_Sales'].transform(sum) > 1000
data[index]

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
17,18.0,Grand Theft Auto: San Andreas,PS2,2004.0,Action,Take-Two Interactive,9.43,0.40,0.41,10.57,20.81
24,25.0,Grand Theft Auto: Vice City,PS2,2002.0,Action,Take-Two Interactive,8.41,5.49,0.47,1.78,16.15
28,29.0,Gran Turismo 3: A-Spec,PS2,2001.0,Racing,Sony Computer Entertainment,6.85,5.09,1.87,1.16,14.98
38,39.0,Grand Theft Auto III,PS2,2001.0,Action,Take-Two Interactive,6.99,4.51,0.30,1.30,13.10
47,48.0,Gran Turismo 4,PS2,2004.0,Racing,Sony Computer Entertainment,3.01,0.01,1.10,7.53,11.66
...,...,...,...,...,...,...,...,...,...,...,...
16559,16562.0,Sugar + Spice! Anoko no Suteki na Nanimokamo,PS2,2008.0,Adventure,Alchemist,0.00,0.00,0.01,0.00,0.01
16561,16564.0,Kanokon: Esuii,PS2,2008.0,Adventure,5pb,0.00,0.00,0.01,0.00,0.01
16572,16575.0,Scarlett: Nichijou no Kyoukaisen,PS2,2008.0,Adventure,Kadokawa Shoten,0.00,0.00,0.01,0.00,0.01
16580,16583.0,Real Rode,PS2,2008.0,Adventure,Kadokawa Shoten,0.00,0.00,0.01,0.00,0.01


## Project - employee 

In [35]:
import pandas as pd
data = pd.read_csv('input/employee.csv')
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [36]:
# grouping the data by department
data.groupby(['Department']).agg({'Age':'mean',
                                  'DailyRate':'max'})

Unnamed: 0_level_0,Age,DailyRate
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,37.809524,1444
Research & Development,37.042664,1496
Sales,36.542601,1499


In [37]:
data[['Age','DailyRate','Department']].groupby([data['Department'],
                                                data['EducationField']]).agg(['mean',
                                                                              'median',
                                                                              'min',
                                                                              'max',
                                                                              'std',
                                                                              'var'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Age,Age,Age,Age,Age,DailyRate,DailyRate,DailyRate,DailyRate,DailyRate,DailyRate
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,min,max,std,var,mean,median,min,max,std,var
Department,EducationField,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Human Resources,Human Resources,37.037037,36.0,24,59,8.604478,74.037037,675.259259,600.0,106,1420,436.471657,190507.507123
Human Resources,Life Sciences,39.375,41.0,26,56,9.076527,82.383333,753.0625,709.0,176,1383,448.447465,201105.129167
Human Resources,Medical,39.846154,42.0,24,59,11.312575,127.974359,875.615385,898.0,179,1398,382.294289,146148.923077
Human Resources,Other,34.666667,37.0,29,38,4.932883,24.333333,1005.0,1239.0,332,1444,591.77952,350203.0
Human Resources,Technical Degree,32.5,34.5,19,42,9.678154,93.666667,667.0,536.0,489,1107,294.241397,86578.0
Research & Development,Life Sciences,36.997727,36.0,18,60,8.937775,79.883822,789.195455,786.5,102,1490,402.79626,162244.827314
Research & Development,Medical,37.242424,36.0,18,60,9.471776,89.714549,825.730028,878.0,109,1495,398.918343,159135.844041
Research & Development,Other,36.1875,35.5,21,53,7.869723,61.93254,763.359375,669.0,116,1474,398.990089,159193.091022
Research & Development,Technical Degree,37.06383,35.5,20,58,10.039501,100.791581,846.202128,945.5,128,1496,416.60857,173562.700641
Sales,Life Sciences,37.186667,36.0,20,59,8.683384,75.401163,854.58,880.0,111,1498,395.281096,156247.144564


In [38]:
#pd.set_option('max_rows',1470)

def above_thousand(x):
    color = 'red' if x>1000 else 'green'
    return 'color: %s' % color

data[['DailyRate']].head().style.applymap(above_thousand)

Unnamed: 0,DailyRate
0,1102
1,279
2,1373
3,1392
4,591


In [39]:
dt1 = data.groupby(['Department','EducationField'])[['Age','DailyRate','MonthlyIncome']].agg('mean')
dt1

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,DailyRate,MonthlyIncome
Department,EducationField,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Human Resources,Human Resources,37.037037,675.259259,7241.148148
Human Resources,Life Sciences,39.375,753.0625,6914.0625
Human Resources,Medical,39.846154,875.615385,6594.076923
Human Resources,Other,34.666667,1005.0,5016.666667
Human Resources,Technical Degree,32.5,667.0,3081.25
Research & Development,Life Sciences,36.997727,789.195455,6179.984091
Research & Development,Medical,37.242424,825.730028,6539.22314
Research & Development,Other,36.1875,763.359375,6278.6875
Research & Development,Technical Degree,37.06383,846.202128,5760.819149
Sales,Life Sciences,37.186667,854.58,7246.233333


In [40]:
dt1.style.highlight_max(color = 'red').highlight_min(color='lightgreen')

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,DailyRate,MonthlyIncome
Department,EducationField,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Human Resources,Human Resources,37.037037,675.259259,7241.148148
Human Resources,Life Sciences,39.375,753.0625,6914.0625
Human Resources,Medical,39.846154,875.615385,6594.076923
Human Resources,Other,34.666667,1005.0,5016.666667
Human Resources,Technical Degree,32.5,667.0,3081.25
Research & Development,Life Sciences,36.997727,789.195455,6179.984091
Research & Development,Medical,37.242424,825.730028,6539.22314
Research & Development,Other,36.1875,763.359375,6278.6875
Research & Development,Technical Degree,37.06383,846.202128,5760.819149
Sales,Life Sciences,37.186667,854.58,7246.233333


In [41]:
# grouped the data by education field
dt2 = data.groupby(['EducationField'])[['Age','DailyRate','HourlyRate','MonthlyIncome']].agg('mean')
dt2

Unnamed: 0_level_0,Age,DailyRate,HourlyRate,MonthlyIncome
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Human Resources,37.037037,675.259259,60.888889,7241.148148
Life Sciences,37.107261,804.425743,66.831683,6463.288779
Marketing,37.924528,727.836478,66.150943,7348.584906
Medical,36.838362,822.799569,65.280172,6510.036638
Other,35.365854,796.02439,62.365854,6071.54878
Technical Degree,36.121212,842.128788,66.621212,5758.30303


In [42]:
dt2.style.highlight_max(color = 'darkgreen').highlight_min(color = 'yellow')

Unnamed: 0_level_0,Age,DailyRate,HourlyRate,MonthlyIncome
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Human Resources,37.037037,675.259259,60.888889,7241.148148
Life Sciences,37.107261,804.425743,66.831683,6463.288779
Marketing,37.924528,727.836478,66.150943,7348.584906
Medical,36.838362,822.799569,65.280172,6510.036638
Other,35.365854,796.02439,62.365854,6071.54878
Technical Degree,36.121212,842.128788,66.621212,5758.30303


In [43]:
# highlighting the range of values using background_gradient
dt2.style.background_gradient(cmap = 'Greens')

Unnamed: 0_level_0,Age,DailyRate,HourlyRate,MonthlyIncome
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Human Resources,37.037037,675.259259,60.888889,7241.148148
Life Sciences,37.107261,804.425743,66.831683,6463.288779
Marketing,37.924528,727.836478,66.150943,7348.584906
Medical,36.838362,822.799569,65.280172,6510.036638
Other,35.365854,796.02439,62.365854,6071.54878
Technical Degree,36.121212,842.128788,66.621212,5758.30303


In [44]:
# method chains to hide the index as well
grp_data.style.background_gradient(cmap = 'Greens').hide_index()

NameError: name 'grp_data' is not defined