In [1]:
import numpy as np


In [6]:
def squares(values):
    result = []
    for item in values:
        result.append(item**2)
    
    return result

print(squares([1,2,3,4]))
        
to_square = range(100000)
%timeit squares(to_square)

[1, 4, 9, 16]
10 loops, best of 3: 36.2 ms per loop


In [8]:
array_to_square = np.arange(0,100000)

%timeit array_to_square**2

The slowest run took 5.59 times longer than the fastest. This could mean that an intermediate result is being cached 
10000 loops, best of 3: 51.1 µs per loop


In [10]:
a1 = np.array([1,2,3,4])

In [12]:
a1

array([1, 2, 3, 4])

In [13]:
type(a1)

numpy.ndarray

In [14]:
a2 = np.array([1,2,3.0,4])

In [15]:
a2

array([ 1.,  2.,  3.,  4.])

In [17]:
a2.dtype

dtype('float64')

In [18]:
a3 = np.array([0]*10)
a3

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [19]:
# convert a python range to numpy array
np.array(range(10))

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [20]:
# create a numpy array of 10 0.0's
np.zeros(10)

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [21]:

# force it to be of int instead of float64
np.zeros(10, dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [22]:
# make "a range" starting at 0 and with 10 values
np.arange(0, 10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [23]:
# 0 <= x < 10 increment by two
np.arange(0, 10, 2)

array([0, 2, 4, 6, 8])

In [24]:
# 10 >= x > 0, counting down
np.arange(10, 0, -1)

array([10,  9,  8,  7,  6,  5,  4,  3,  2,  1])

In [25]:
# evenly spaced #'s between two intervals
np.linspace(0, 10, 11)

array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.])

In [26]:
# multiply numpy array by 2
a1 = np.arange(0, 10) 
a1 * 2

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [27]:
# add two numpy arrays
a2 = np.arange(10, 20) 
a1 + a2

array([10, 12, 14, 16, 18, 20, 22, 24, 26, 28])

In [28]:
# create a 2- array (2x2)
np.array([[1,2], [3,4]])

array([[1, 2],
       [3, 4]])

In [29]:
# create a 1x20 array, and reshape to a 5x4 2d-array
m = np.arange(0, 20).reshape(5, 4)
m 

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [30]:

# size of any dimensional array is the # of elements
np.size(m)

20

In [32]:

# can ask the size along a given axis (0 is rows)
np.size(m, 0)

5

In [33]:

# can ask the size along a given axis (0 is rows)
np.size(m, 1)

4

In [34]:
# select 0-based elements 0 and 2
a1[0], a1[2]

(0, 2)

In [35]:
# select an element in 2d array at row 1 column 2
m[1, 2]

6

In [36]:
# all items in row 1
m[1,]

array([4, 5, 6, 7])

In [37]:

# all items in column 2
m[:,2]

array([ 2,  6, 10, 14, 18])

In [38]:
# which items are less than 2?
a = np.arange(5)
a < 2

array([ True,  True, False, False, False], dtype=bool)

In [39]:
# less than 2 or greater than 3?
(a<2) | (a>3)

array([ True,  True, False, False,  True], dtype=bool)

In [40]:

# create a function that is applied to all array elements
def exp (x):
    return x<3 or x>3
# np.vectorize applies the method to all items in an array
np.vectorize(exp)(a)

array([ True,  True,  True, False,  True], dtype=bool)

In [41]:
# boolean select items < 3
r = a<3
# applying the result of the expression to the [] operate
# selects just the array elements where there is a matching True
a[r]

array([0, 1, 2])

In [42]:
# np.sum treats True as 1 and False as 0
# so this is how many items are less than 3
np.sum(a < 3)

3

In [43]:

# This can be applied across two arrays
a1 = np.arange(0, 5)
a2 = np.arange(5, 0, -1)
a1 < a2

array([ True,  True,  True, False, False], dtype=bool)

In [44]:
# and even multi dimensional arrays
a1 = np.arange(9).reshape(3, 3)
a2 = np.arange(9, 0 , -1).reshape(3, 3)
a1 < a2

array([[ True,  True,  True],
       [ True,  True, False],
       [False, False, False]], dtype=bool)

In [45]:

# get all items in the array from position 3 
# up to position 8 (but not inclusive)
a1 = np.arange(1, 10)
a1[3:8]

array([4, 5, 6, 7, 8])

In [46]:
# every other item
a1[::2]

array([1, 3, 5, 7, 9])

In [47]:
# in reverse order
a1[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1])

In [48]:
# note that when in reverse, this does not include
# the element specified in the second component of the slice
# ie: there is no 1 printed in this
a1[9:0:-1]

array([9, 8, 7, 6, 5, 4, 3, 2])

In [49]:
# all items from position 5 onwards
a1[5:]

array([6, 7, 8, 9])

In [50]:
# the items in the first 5 positions
a1[:5]

array([1, 2, 3, 4, 5])

In [51]:
# we saw this earlier
# : in rows specifier means all rows
# so this gets items in column position 1, all rows
m[:,1]

array([ 1,  5,  9, 13, 17])

In [52]:
# in all rows, but for all columns in positions 
# 1 up to but not including 3
m[:,1:3]

array([[ 1,  2],
       [ 5,  6],
       [ 9, 10],
       [13, 14],
       [17, 18]])

In [53]:
# in row positions 3 up to but not including 5, all columns
m[3:5,:]

array([[12, 13, 14, 15],
       [16, 17, 18, 19]])

In [54]:

# combined to pull out a sub matrix of the matrix
m[3:5,1:3]

array([[13, 14],
       [17, 18]])

In [55]:
# using a python array, we can select 
# non-contiguous rows or columns
m[[1,3,4],:]

array([[ 4,  5,  6,  7],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [56]:
# create a 9 element array (1x9)
a = np.arange(0, 9)
# and reshape to a 3x3 2-d array
m = a.reshape(3, 3)
m

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [57]:
# and we can reshape downward in dimensions too
reshaped = m.reshape(9)
reshaped

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [58]:
# .ravel will array representing a flattened 2-d array
raveled = m.ravel()
raveled

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [59]:

# it does not alter the shape of the source
m

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [60]:
# but it will be a view into the source
# so items changed in the result of the ravel
# are changed in the original object
# reshape m to an array
reshaped = m.reshape(np.size(m))
# ravel into an array
raveled = m.ravel()
# change values in either
reshaped[2] = 1000
raveled[5] = 2000
# and they show as changed in the original
m

array([[   0,    1, 1000],
       [   3,    4, 2000],
       [   6,    7,    8]])

In [61]:
# flattened is like ravel, but a copy of the data, 
# not a view into the source
m2 = np.arange(0, 9).reshape(3,3)
flattened = m2.flatten()
# change in the flattened object
flattened[0] = 1000
flattened

array([1000,    1,    2,    3,    4,    5,    6,    7,    8])

In [62]:
# but not in the original
m2

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [63]:

# we can reshape by assigning  a tuple to the .shape property
# we start with this, which has one dimension
flattened.shape

(9,)

In [64]:

# and make it 3x3
flattened.shape = (3, 3)
# it is no longer flattened
flattened

array([[1000,    1,    2],
       [   3,    4,    5],
       [   6,    7,    8]])

In [65]:

# transpose a matrix
flattened.transpose()

array([[1000,    3,    6],
       [   1,    4,    7],
       [   2,    5,    8]])

In [66]:
# can also use .T property to transpose
flattened.T

array([[1000,    3,    6],
       [   1,    4,    7],
       [   2,    5,    8]])

In [67]:
# we can also use .resize, which changes shape of
# and object in-place
m = np.arange(0, 9).reshape(3,3)
m.resize(1, 9)
m # my shape has changed

array([[0, 1, 2, 3, 4, 5, 6, 7, 8]])

In [68]:
# creating two arrays for examples
a = np.arange(9).reshape(3, 3)
b = (a + 1) * 10
a

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [69]:
b

array([[10, 20, 30],
       [40, 50, 60],
       [70, 80, 90]])

In [70]:
# horizontally stack the two arrays
# b becomes columns of a to the right of a's columns
np.hstack((a, b))

array([[ 0,  1,  2, 10, 20, 30],
       [ 3,  4,  5, 40, 50, 60],
       [ 6,  7,  8, 70, 80, 90]])

In [71]:
# identical to concatenate along axis = 1
np.concatenate((a, b), axis = 1)

array([[ 0,  1,  2, 10, 20, 30],
       [ 3,  4,  5, 40, 50, 60],
       [ 6,  7,  8, 70, 80, 90]])

In [72]:
# vertical stack, adding b as rows after a's rows
np.vstack((a, b))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 20, 30],
       [40, 50, 60],
       [70, 80, 90]])

In [73]:
# concatenate along axis=0 is the same as vstack
np.concatenate((a, b), axis = 0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 20, 30],
       [40, 50, 60],
       [70, 80, 90]])

In [74]:
# dstack stacks each independent column of a and b 
np.dstack((a, b))

array([[[ 0, 10],
        [ 1, 20],
        [ 2, 30]],

       [[ 3, 40],
        [ 4, 50],
        [ 5, 60]],

       [[ 6, 70],
        [ 7, 80],
        [ 8, 90]]])

In [75]:
# set up 1-d array 
one_d_a = np.arange(5)
one_d_a

array([0, 1, 2, 3, 4])

In [76]:

# another 1-d array
one_d_b = (one_d_a + 1) * 10
one_d_b

array([10, 20, 30, 40, 50])

In [77]:
# stack the two columns
np.column_stack((one_d_a, one_d_b))

array([[ 0, 10],
       [ 1, 20],
       [ 2, 30],
       [ 3, 40],
       [ 4, 50]])

In [78]:
# stack along rows
np.row_stack((one_d_a, one_d_b))

array([[ 0,  1,  2,  3,  4],
       [10, 20, 30, 40, 50]])

In [79]:
# sample array
a = np.arange(12).reshape(3, 4)
a

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [80]:
# horiz split the 2-d array into 4 array columns
np.hsplit(a, 4)

[array([[0],
        [4],
        [8]]), array([[1],
        [5],
        [9]]), array([[ 2],
        [ 6],
        [10]]), array([[ 3],
        [ 7],
        [11]])]

In [81]:
# horiz split into two array columns
np.hsplit(a, 2)

[array([[0, 1],
        [4, 5],
        [8, 9]]), array([[ 2,  3],
        [ 6,  7],
        [10, 11]])]

In [82]:
# split at columns 1 and 3
np.hsplit(a, [1, 3])

[array([[0],
        [4],
        [8]]), array([[ 1,  2],
        [ 5,  6],
        [ 9, 10]]), array([[ 3],
        [ 7],
        [11]])]

In [83]:
# along the rows
np.split(a, 2, axis = 1)

[array([[0, 1],
        [4, 5],
        [8, 9]]), array([[ 2,  3],
        [ 6,  7],
        [10, 11]])]

In [84]:
# new array for examples
a = np.arange(12).reshape(4, 3)
a

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [85]:
# split into four rows of arrays
np.vsplit(a, 4)

[array([[0, 1, 2]]),
 array([[3, 4, 5]]),
 array([[6, 7, 8]]),
 array([[ 9, 10, 11]])]

In [86]:
# into two rows of arrays
np.vsplit(a, 2)

[array([[0, 1, 2],
        [3, 4, 5]]), array([[ 6,  7,  8],
        [ 9, 10, 11]])]

In [87]:
# split along axis=0
# row 0 of original is row 0 of new array
# rows 1 and 2 of original are row 1
np.vsplit(a, [1, 3])

[array([[0, 1, 2]]), array([[3, 4, 5],
        [6, 7, 8]]), array([[ 9, 10, 11]])]

In [88]:
# split can specify axis
np.split(a, 2, axis = 0)

[array([[0, 1, 2],
        [3, 4, 5]]), array([[ 6,  7,  8],
        [ 9, 10, 11]])]

In [89]:

# 3-d array
c = np.arange(27).reshape(3, 3, 3)
c

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

In [90]:

# split into 3 
np.dsplit(c, 3)

[array([[[ 0],
         [ 3],
         [ 6]],
 
        [[ 9],
         [12],
         [15]],
 
        [[18],
         [21],
         [24]]]), array([[[ 1],
         [ 4],
         [ 7]],
 
        [[10],
         [13],
         [16]],
 
        [[19],
         [22],
         [25]]]), array([[[ 2],
         [ 5],
         [ 8]],
 
        [[11],
         [14],
         [17]],
 
        [[20],
         [23],
         [26]]])]

In [91]:
# demonstrate some of the properties of NumPy arrays
m = np.arange(10, 19).reshape(3, 3)
print (a)
print ("{0} min of the entire matrix".format(m.min()))
print ("{0} max of entire matrix".format(m.max()))
print ("{0} position of the min value".format(m.argmin()))
print ("{0} position of the max value".format(m.argmax()))
print ("{0} mins down each column".format(m.min(axis = 0)))
print ("{0} mins across each row".format(m.min(axis = 1)))
print ("{0} maxs down each column".format(m.max(axis = 0)))
print ("{0} maxs across each row".format(m.max(axis = 1)))

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
10 min of the entire matrix
18 max of entire matrix
0 position of the min value
8 position of the max value
[10 11 12] mins down each column
[10 13 16] mins across each row
[16 17 18] maxs down each column
[12 15 18] maxs across each row


In [92]:
# demonstrate included statistical methods
a = np.arange(10)
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [93]:
a.mean(), a.std(), a.var()

(4.5, 2.8722813232690143, 8.25)

In [94]:

# demonstrate sum and prod
a = np.arange(1, 6)
a

array([1, 2, 3, 4, 5])

In [95]:
a.sum(), a.prod()

(15, 120)

In [96]:
# and cumulative sum and prod
a.cumsum(), a.cumprod()

(array([ 1,  3,  6, 10, 15], dtype=int32),
 array([  1,   2,   6,  24, 120], dtype=int32))

In [97]:

# applying logical operators
a = np.arange(10)
(a < 5).any() # any < 5?

True

In [98]:
(a < 5).all() # all < 5?

False

In [99]:
# size is always the total number of elements
np.arange(10).reshape(2, 5).size

10

In [100]:
# .ndim will with you the total # of dimensions
np.arange(10).reshape(2,5).ndim

2

In [177]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [178]:
df = pd.read_csv('data/Most_Popular_Baby_Names_by_Sex_and_Mother_s_Ethnic_Group__New_York_City.csv', parse_dates='BRTH_YR')
df.head()

Unnamed: 0,BRTH_YR,GNDR,ETHCTY,NM,CNT,RNK
0,2011,FEMALE,ASIAN AND PACIFIC ISLANDER,ABIGAIL,24,24
1,2011,FEMALE,ASIAN AND PACIFIC ISLANDER,ABIGAIL,24,24
2,2011,FEMALE,ASIAN AND PACIFIC ISLANDER,ABIGAIL,24,24
3,2011,FEMALE,ASIAN AND PACIFIC ISLANDER,ADA,13,35
4,2011,FEMALE,ASIAN AND PACIFIC ISLANDER,ADA,13,35


In [179]:
df['BRTH_YR'] = pd.to_datetime(df['BRTH_YR'], format='%Y')

In [180]:
df.head()

Unnamed: 0,BRTH_YR,GNDR,ETHCTY,NM,CNT,RNK
0,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,ABIGAIL,24,24
1,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,ABIGAIL,24,24
2,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,ABIGAIL,24,24
3,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,ADA,13,35
4,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,ADA,13,35


In [181]:
df.dtypes

BRTH_YR    datetime64[ns]
GNDR               object
ETHCTY             object
NM                 object
CNT                 int64
RNK                 int64
dtype: object

In [182]:
no_dupes = df.drop_duplicates()

In [183]:
no_dupes.head()

Unnamed: 0,BRTH_YR,GNDR,ETHCTY,NM,CNT,RNK
0,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,ABIGAIL,24,24
3,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,ADA,13,35
6,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,AISHA,13,35
9,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,AIZA,10,38
12,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,ALEENA,12,36


In [184]:
no_dupes.pivot_table(index = ['NM'],columns='ETHCTY',aggfunc=sum,fill_value=0)

Unnamed: 0_level_0,CNT,CNT,CNT,CNT,RNK,RNK,RNK,RNK
ETHCTY,ASIAN AND PACIFIC ISLANDER,BLACK NON HISPANIC,HISPANIC,WHITE NON HISPANIC,ASIAN AND PACIFIC ISLANDER,BLACK NON HISPANIC,HISPANIC,WHITE NON HISPANIC
NM,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
AALIYAH,0,69,63,0,0,5,30,0
AARAV,15,0,0,0,51,0,0,0
AARON,51,53,102,67,19,28,33,49
ABBY,0,0,10,0,0,0,78,0
ABDIEL,0,0,12,0,0,0,92,0
ABDOUL,0,16,0,0,0,58,0,0
ABDOULAYE,0,13,0,0,0,61,0,0
ABDUL,20,0,0,0,46,0,0,0
ABDULLAH,30,0,0,0,36,0,0,0
ABEL,0,0,14,0,0,0,90,0


In [185]:
G2 = no_dupes.groupby(['ETHCTY']).get_group('HISPANIC')

In [186]:
G2.sort('CNT', ascending=False)

Unnamed: 0,BRTH_YR,GNDR,ETHCTY,NM,CNT,RNK
4512,2011-01-01,MALE,HISPANIC,JAYDEN,426,1
1440,2011-01-01,FEMALE,HISPANIC,ISABELLA,331,1
4620,2011-01-01,MALE,HISPANIC,JUSTIN,310,2
4467,2011-01-01,MALE,HISPANIC,JACOB,303,3
4722,2011-01-01,MALE,HISPANIC,MATTHEW,276,4
4152,2011-01-01,MALE,HISPANIC,ANGEL,253,5
4371,2011-01-01,MALE,HISPANIC,ETHAN,251,6
4248,2011-01-01,MALE,HISPANIC,CHRISTOPHER,239,7
1740,2011-01-01,FEMALE,HISPANIC,MIA,229,2
1920,2011-01-01,FEMALE,HISPANIC,SOPHIA,223,3


In [187]:
no_dupes.pivot_table(index=['ETHCTY','RNK'],values=['NM'],aggfunc = max)

Unnamed: 0_level_0,Unnamed: 1_level_0,NM
ETHCTY,RNK,Unnamed: 2_level_1
ASIAN AND PACIFIC ISLANDER,1,SOPHIA
ASIAN AND PACIFIC ISLANDER,2,JAYDEN
ASIAN AND PACIFIC ISLANDER,3,RYAN
ASIAN AND PACIFIC ISLANDER,4,OLIVIA
ASIAN AND PACIFIC ISLANDER,5,LUCAS
ASIAN AND PACIFIC ISLANDER,6,JASON
ASIAN AND PACIFIC ISLANDER,7,TIFFANY
ASIAN AND PACIFIC ISLANDER,8,KEVIN
ASIAN AND PACIFIC ISLANDER,9,FIONA
ASIAN AND PACIFIC ISLANDER,10,DANIEL


In [188]:
df3 = no_dupes.groupby(['NM']).agg({'CNT':sum})

In [189]:
df3.head()

Unnamed: 0_level_0,CNT
NM,Unnamed: 1_level_1
AALIYAH,132
AARAV,15
AARON,273
ABBY,10
ABDIEL,12


In [190]:
df3.sort('CNT', ascending=False)

Unnamed: 0_level_0,CNT
NM,Unnamed: 1_level_1
JAYDEN,822
JACOB,699
ETHAN,670
MATTHEW,598
MICHAEL,597
DANIEL,596
ISABELLA,593
SOPHIA,552
JUSTIN,551
DAVID,548


In [191]:
df4 = no_dupes.groupby(['GNDR','RNK']).max()

In [192]:
df4.head(200)

Unnamed: 0_level_0,Unnamed: 1_level_0,BRTH_YR,ETHCTY,NM,CNT
GNDR,RNK,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
FEMALE,1,2011-01-01,WHITE NON HISPANIC,SOPHIA,331
FEMALE,2,2011-01-01,WHITE NON HISPANIC,OLIVIA,229
FEMALE,3,2011-01-01,WHITE NON HISPANIC,SOPHIA,223
FEMALE,4,2011-01-01,WHITE NON HISPANIC,SOPHIA,190
FEMALE,5,2011-01-01,WHITE NON HISPANIC,EMMA,188
FEMALE,6,2011-01-01,WHITE NON HISPANIC,SOFIA,177
FEMALE,7,2011-01-01,WHITE NON HISPANIC,TIFFANY,171
FEMALE,8,2011-01-01,WHITE NON HISPANIC,HAILEY,162
FEMALE,9,2011-01-01,WHITE NON HISPANIC,TAYLOR,160
FEMALE,10,2011-01-01,WHITE NON HISPANIC,SERENITY,132


In [193]:
no_dupes.ix[no_dupes.groupby(['GNDR','ETHCTY']).CNT.idxmax()]

Unnamed: 0,BRTH_YR,GNDR,ETHCTY,NM,CNT,RNK
405,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,SOPHIA,119,1
786,2011-01-01,FEMALE,BLACK NON HISPANIC,MADISON,176,1
1440,2011-01-01,FEMALE,HISPANIC,ISABELLA,331,1
2340,2011-01-01,FEMALE,WHITE NON HISPANIC,ESTHER,224,1
3168,2011-01-01,MALE,ASIAN AND PACIFIC ISLANDER,ETHAN,177,1
3756,2011-01-01,MALE,BLACK NON HISPANIC,JAYDEN,184,1
4512,2011-01-01,MALE,HISPANIC,JAYDEN,426,1
5547,2011-01-01,MALE,WHITE NON HISPANIC,MICHAEL,292,1


In [194]:
no_dupes[no_dupes['NM'] == 'ISABELLA']

Unnamed: 0,BRTH_YR,GNDR,ETHCTY,NM,CNT,RNK
216,2011-01-01,FEMALE,ASIAN AND PACIFIC ISLANDER,ISABELLA,67,6
678,2011-01-01,FEMALE,BLACK NON HISPANIC,ISABELLA,35,20
1440,2011-01-01,FEMALE,HISPANIC,ISABELLA,331,1
2469,2011-01-01,FEMALE,WHITE NON HISPANIC,ISABELLA,160,9


In [195]:
no_dupes.ix[no_dupes.groupby(['GNDR']).CNT.idxmax()]

Unnamed: 0,BRTH_YR,GNDR,ETHCTY,NM,CNT,RNK
1440,2011-01-01,FEMALE,HISPANIC,ISABELLA,331,1
4512,2011-01-01,MALE,HISPANIC,JAYDEN,426,1
