# Ch3.3 Handling Missing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
np.__version__,pd.__version__

('1.17.4', '0.25.3')

### Detecting null values

In [8]:
mb = pd.read_csv("./data/microbiome.csv")
mb

Unnamed: 0,Taxon,Patient,Group,Tissue,Stool
0,Firmicutes,1,0,136,4182
1,Firmicutes,2,1,1174,703
2,Firmicutes,3,0,408,3946
3,Firmicutes,4,1,831,8605
4,Firmicutes,5,0,693,50
...,...,...,...,...,...
65,Other,10,1,203,6
66,Other,11,0,392,6
67,Other,12,1,28,25
68,Other,13,0,12,22


In [21]:
# The Taxon column has dtype object (it holds text labels)
mb.dtypes

Taxon      object
Patient     int64
Group       int64
Tissue      int64
Stool       int64
dtype: object

In [19]:
# Numeric data is more efficient to process, so keep only the numeric columns.
# Selecting by dtype states that intent directly instead of relying on the
# text column (Taxon) sitting at position 0, as a positional slice would.
mb2 = mb.select_dtypes(include='number')
mb2

Unnamed: 0,Patient,Group,Tissue,Stool
0,1,0,136,4182
1,2,1,1174,703
2,3,0,408,3946
3,4,1,831,8605
4,5,0,693,50
...,...,...,...,...
65,10,1,203,6
66,11,0,392,6
67,12,1,28,25
68,13,0,12,22


In [20]:
# Every remaining column is int64. The trailing "dtype: object" in the output
# is the dtype of the Series that .dtypes returns, not of the data itself.
mb2.dtypes

Patient    int64
Group      int64
Tissue     int64
Stool      int64
dtype: object

In [6]:
# np.nan is a float.
# None is only converted to NaN (float) in a numeric Series; here the string
# 'hello' forces dtype=object, so None is stored unchanged. Both np.nan and
# None are still reported as missing by isnull().
data = pd.Series([1, np.nan, 'hello', None])

In [7]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [13]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [14]:
data.isnull().values.any()

True

In [15]:
%timeit data.isnull().values.any()
%timeit data.isnull().any()

47.1 µs ± 669 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
71.9 µs ± 2.08 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [111]:
# 1000 x 1000 frame of random normal samples.
# No seed is set, so the values differ on every run.
df=pd.DataFrame(np.random.randn(1000,1000))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,-0.028664,1.172649,0.172513,0.635249,-0.164268,0.040557,-0.812318,0.904784,-0.178885,-0.384927,...,-1.823759,0.638244,-1.329999,-0.479661,1.108459,2.163746,0.282000,1.233100,0.614654,-0.481612
1,-1.873721,-0.898434,0.637057,1.554858,1.128185,0.161243,-0.712055,0.005755,-0.036252,-0.673538,...,1.052431,-0.493583,0.632878,-0.884109,-0.230216,-0.124270,0.391101,0.740440,-0.061118,-1.855378
2,-1.892134,-0.201745,-2.114829,0.068569,0.060576,-0.899655,-0.612831,-0.079743,0.016851,-0.499004,...,-0.499926,-1.114181,0.038820,0.791892,-1.536368,0.619618,0.925119,0.866258,1.348236,0.149638
3,-0.656863,0.924419,-0.170081,1.261404,-0.638521,-0.353891,-0.452254,-1.172909,-0.217179,0.117301,...,-0.775435,-0.205307,0.430036,0.818365,0.928905,-0.722579,-1.012280,-1.537296,0.123837,-0.221495
4,0.212333,0.081586,0.443724,1.401025,1.753772,1.404769,1.724018,-0.392093,-1.441270,-0.749622,...,0.056504,0.124866,-1.408422,-0.505197,0.330677,-0.450929,0.912951,-0.948809,0.368930,-1.317609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.282367,1.104180,-1.001765,-1.332602,-1.830299,-1.102079,-0.605661,2.328372,0.130005,0.403215,...,0.290463,-2.320608,-1.234639,0.701167,0.737433,-1.042211,0.668594,0.213131,1.071397,-1.182863
996,-0.151028,1.399679,0.683739,1.191808,0.168717,0.245438,-0.784825,-0.265554,-1.205659,0.328985,...,1.051847,0.632802,-0.316357,-1.488970,0.715256,0.182303,0.570066,-0.269767,1.194194,1.337218
997,-0.212872,-0.953993,-0.346528,-0.429546,1.311569,0.923810,-0.747463,0.881316,-0.552002,-2.564069,...,-1.118502,-1.264713,-0.226555,-0.826479,-2.110901,-1.401610,1.209838,-0.627412,0.175097,0.495797
998,-0.724949,0.531473,2.581144,1.072308,0.982171,-0.422190,0.995932,0.116595,-0.599141,0.062171,...,0.258315,-1.344678,1.255532,1.150856,0.666031,-0.483417,0.009360,-0.700871,-0.220377,0.491768


In [112]:
# Replace every value above 0.9 with NaN so the frame has missing data to detect.
# `pd.np` was a deprecated alias that pandas 2.0 removed; use the `np` module
# imported at the top of the notebook directly.
df[df > 0.9] = np.nan
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,-0.028664,,0.172513,0.635249,-0.164268,0.040557,-0.812318,,-0.178885,-0.384927,...,-1.823759,0.638244,-1.329999,-0.479661,,,0.282000,,0.614654,-0.481612
1,-1.873721,-0.898434,0.637057,,,0.161243,-0.712055,0.005755,-0.036252,-0.673538,...,,-0.493583,0.632878,-0.884109,-0.230216,-0.124270,0.391101,0.740440,-0.061118,-1.855378
2,-1.892134,-0.201745,-2.114829,0.068569,0.060576,-0.899655,-0.612831,-0.079743,0.016851,-0.499004,...,-0.499926,-1.114181,0.038820,0.791892,-1.536368,0.619618,,0.866258,,0.149638
3,-0.656863,,-0.170081,,-0.638521,-0.353891,-0.452254,-1.172909,-0.217179,0.117301,...,-0.775435,-0.205307,0.430036,0.818365,,-0.722579,-1.012280,-1.537296,0.123837,-0.221495
4,0.212333,0.081586,0.443724,,,,,-0.392093,-1.441270,-0.749622,...,0.056504,0.124866,-1.408422,-0.505197,0.330677,-0.450929,,-0.948809,0.368930,-1.317609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.282367,,-1.001765,-1.332602,-1.830299,-1.102079,-0.605661,,0.130005,0.403215,...,0.290463,-2.320608,-1.234639,0.701167,0.737433,-1.042211,0.668594,0.213131,,-1.182863
996,-0.151028,,0.683739,,0.168717,0.245438,-0.784825,-0.265554,-1.205659,0.328985,...,,0.632802,-0.316357,-1.488970,0.715256,0.182303,0.570066,-0.269767,,
997,-0.212872,-0.953993,-0.346528,-0.429546,,,-0.747463,0.881316,-0.552002,-2.564069,...,-1.118502,-1.264713,-0.226555,-0.826479,-2.110901,-1.401610,,-0.627412,0.175097,0.495797
998,-0.724949,0.531473,,,,-0.422190,,0.116595,-0.599141,0.062171,...,0.258315,-1.344678,,,0.666031,-0.483417,0.009360,-0.700871,-0.220377,0.491768


In [119]:
# Find which columns contain a null value (axis=0 is the default and may be omitted)
df.isnull().any(axis=0)

0      True
1      True
2      True
3      True
4      True
       ... 
995    True
996    True
997    True
998    True
999    True
Length: 1000, dtype: bool

In [120]:
# On the underlying NumPy array axis=0 must be given explicitly:
# without it, ndarray.any() reduces over the whole array to a single bool
df.isnull().values.any(axis=0)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [114]:
df.isnull().any().any()

True

In [115]:
df.isnull().values.any()

True

In [28]:
%timeit df.isnull().any().any()
%timeit df.isnull().values.any()

511 µs ± 14.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
350 µs ± 5.36 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [24]:
data[data.notnull()]

0        1
2    hello
dtype: object

### Dropping null values

For a ``Series``

In [25]:
data.dropna()

0        1
2    hello
dtype: object

For a ``DataFrame``

In [71]:
# Small 3x3 frame built column by column: 'a' and 'b' each contain one NaN,
# 'c' is complete.
df = pd.DataFrame({
    'a': [1, 2, np.nan],
    'b': [np.nan, 3, 4],
    'c': [2, 5, 6],
})
df

Unnamed: 0,a,b,c
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


By default, ``dropna()`` will drop all rows in which *any* null value is present:

In [45]:
# Default is axis=0 ('index'):
# drop every row (index label) that contains a null
df.dropna()

Unnamed: 0,a,b,c
1,2.0,3.0,5


In [42]:
df.dropna(axis='index')

Unnamed: 0,a,b,c
1,2.0,3.0,5


In [46]:
# Drop every column that contains a null
df.dropna(axis='columns')

Unnamed: 0,c
0,2
1,5
2,6


In [73]:
# Add a column 'd' that is entirely NaN (this modifies df in place)
df['d'] = np.nan
df

Unnamed: 0,a,b,c,d
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [51]:
df.dropna?

In [52]:
# how='all': drop a column (axis='columns') only when all of its values are null
df.dropna(axis='columns', how='all')

Unnamed: 0,a,b,c
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [54]:
# how='any': drop a column (axis='columns') when any of its values is null
df.dropna(axis='columns', how='any')

Unnamed: 0,c
0,2
1,5
2,6


In [55]:
# thresh=3: keep only rows that have at least 3 non-NA values
df.dropna(axis='rows', thresh=3)

Unnamed: 0,a,b,c,d
1,2.0,3.0,5,


### Filling null values

For ``Series``

In [56]:
# Numeric Series with two gaps; both np.nan and None end up as NaN (float64).
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

We can fill NA entries with a single value, such as zero:

In [57]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

We can specify a forward-fill to propagate the previous value forward:

In [58]:
# Forward-fill: each missing entry takes the last valid value before it.
# Series.ffill() is the dedicated method for this; the equivalent
# fillna(method='ffill') keyword form is deprecated in newer pandas releases.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

Or we can specify a back-fill to propagate the next values backward:

In [59]:
# Back-fill: each missing entry takes the next valid value after it.
# Series.bfill() is the dedicated method for this; the equivalent
# fillna(method='bfill') keyword form is deprecated in newer pandas releases.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

For ``DataFrame``

In [74]:
df

Unnamed: 0,a,b,c,d
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [61]:
type(df.iloc[1])

pandas.core.series.Series

In [63]:
# Forward-fill down each column (default axis=0 / index): a NaN takes the value
# from the row above, e.g. row 2 of column 'a' becomes 2.0.
# A leading NaN (row 0 of 'b') and the all-NaN column 'd' have nothing above
# them to copy, so they stay NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated in newer pandas releases.
df.ffill()

Unnamed: 0,a,b,c,d
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,


In [75]:
# Forward-fill across each row (axis=1): a NaN takes the value from the column
# to its left. A NaN in the first column (row 2 of 'a') has no left neighbour
# and stays NaN.
# DataFrame.ffill(axis=1) replaces fillna(method='ffill', axis=1), whose
# `method` keyword is deprecated in newer pandas releases.
df.ffill(axis=1)

Unnamed: 0,a,b,c,d
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


### Operating on Duplicated data

In [91]:
df = pd.read_csv('./data/calls_for_service_2015_small.csv')
df.head()

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
0,A0002815,17J,JUVENILE ATTACHMENT,1F,17J,JUVENILE ATTACHMENT,1F,37369000,3513814,01/01/2015 12:09:56 AM,...,01/01/2015 12:09:56 AM,01/01/2015 03:06:28 AM,NAT,Necessary Action Taken,Y,8A03,Convention Center Blvd & Canal St,70130.0,8,"(0.0, 0.0)"
1,A0003715,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,01/01/2015 12:12:26 AM,...,01/01/2015 12:17:31 AM,01/01/2015 12:27:28 AM,NAT,Necessary Action Taken,N,1G02,S Broad Ave & Tulane Ave,70119.0,1,"(0.0, 0.0)"
2,A0007115,103M,MENTAL PATIENT,1A,21,COMPLAINT OTHER,1A,37369000,3513814,01/01/2015 12:31:43 AM,...,01/01/2015 01:21:31 AM,01/01/2015 02:24:28 AM,NAT,Necessary Action Taken,N,5M04,033XX Florida Ave,70117.0,5,"(0.0, 0.0)"
3,A0007415,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,01/01/2015 12:33:56 AM,...,01/01/2015 12:33:56 AM,01/01/2015 12:41:16 AM,NAT,Necessary Action Taken,Y,8E01,Royal St & Orleans Ave,70116.0,8,"(0.0, 0.0)"
4,A0012615,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,01/01/2015 01:00:28 AM,...,01/01/2015 01:00:28 AM,01/01/2015 01:22:41 AM,NAT,Necessary Action Taken,Y,8D01,Bourbon St & Canal St,70112.0,8,"(0.0, 0.0)"


In [66]:
df.shape

(3000, 21)

In [67]:
# Check the data type of each column
df.dtypes

NOPD_Item           object
Type_               object
TypeText            object
Priority            object
InitialType         object
InitialTypeText     object
InitialPriority     object
MapX                 int64
MapY                 int64
TimeCreate          object
TimeDispatch        object
TimeArrive          object
TimeClosed          object
Disposition         object
DispositionText     object
SelfInitiated       object
Beat                object
BLOCK_ADDRESS       object
Zip                float64
PoliceDistrict       int64
Location            object
dtype: object

In [89]:
pd.read_csv?

In [95]:
# Author's note: a common standard datetime layout is '%Y/%m/%d %H:%M:%S' (MySQL, MSSQL, HiveSQL);
# the raw file instead stores timestamps such as '01/01/2015 12:09:56 AM'.
# parse_dates: the columns to convert to datetime, given here by position
#   (9-12 = TimeCreate, TimeDispatch, TimeArrive, TimeClosed).
# infer_datetime_format: let pandas infer the datetime format of those columns (default False).
# NOTE(review): infer_datetime_format is reported as deprecated in pandas 2.x — confirm before upgrading.
df = pd.read_csv('./data/calls_for_service_2015_small.csv', parse_dates=[9, 10, 11, 12], infer_datetime_format=True)
df

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
0,A0002815,17J,JUVENILE ATTACHMENT,1F,17J,JUVENILE ATTACHMENT,1F,37369000,3513814,2015-01-01 00:09:56,...,2015-01-01 00:09:56,2015-01-01 03:06:28,NAT,Necessary Action Taken,Y,8A03,Convention Center Blvd & Canal St,70130.0,8,"(0.0, 0.0)"
1,A0003715,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,2015-01-01 00:12:26,...,2015-01-01 00:17:31,2015-01-01 00:27:28,NAT,Necessary Action Taken,N,1G02,S Broad Ave & Tulane Ave,70119.0,1,"(0.0, 0.0)"
2,A0007115,103M,MENTAL PATIENT,1A,21,COMPLAINT OTHER,1A,37369000,3513814,2015-01-01 00:31:43,...,2015-01-01 01:21:31,2015-01-01 02:24:28,NAT,Necessary Action Taken,N,5M04,033XX Florida Ave,70117.0,5,"(0.0, 0.0)"
3,A0007415,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,2015-01-01 00:33:56,...,2015-01-01 00:33:56,2015-01-01 00:41:16,NAT,Necessary Action Taken,Y,8E01,Royal St & Orleans Ave,70116.0,8,"(0.0, 0.0)"
4,A0012615,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,2015-01-01 01:00:28,...,2015-01-01 01:00:28,2015-01-01 01:22:41,NAT,Necessary Action Taken,Y,8D01,Bourbon St & Canal St,70112.0,8,"(0.0, 0.0)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,C3280215,103M,MENTAL PATIENT,1A,103M,MENTAL PATIENT,1A,37369000,3513814,2015-03-27 01:53:32,...,2015-03-27 01:53:32,2015-03-27 02:19:32,NAT,Necessary Action Taken,Y,1E04,Bienville St & N Rampart St,70112.0,1,"(0.0, 0.0)"
2996,C3283415,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,2015-03-27 02:57:40,...,2015-03-27 02:57:40,2015-03-27 03:02:29,NAT,Necessary Action Taken,Y,1G02,024XX Tulane Ave,70119.0,1,"(0.0, 0.0)"
2997,C3283515,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,2015-03-27 02:58:05,...,2015-03-27 02:58:05,2015-03-27 03:16:22,NAT,Necessary Action Taken,Y,8A03,X Canal St,70130.0,8,"(0.0, 0.0)"
2998,C3286415,43B,SEXUAL BATTERY,2B,43B,SEXUAL BATTERY,2B,37369000,3513814,2015-03-27 04:07:53,...,2015-03-27 04:07:53,2015-03-27 04:08:10,RTF,REPORT TO FOLLOW,Y,8D05,005XX Bourbon St,70112.0,8,"(0.0, 0.0)"


In [96]:
df.dtypes

NOPD_Item                  object
Type_                      object
TypeText                   object
Priority                   object
InitialType                object
InitialTypeText            object
InitialPriority            object
MapX                        int64
MapY                        int64
TimeCreate         datetime64[ns]
TimeDispatch       datetime64[ns]
TimeArrive         datetime64[ns]
TimeClosed         datetime64[ns]
Disposition                object
DispositionText            object
SelfInitiated              object
Beat                       object
BLOCK_ADDRESS              object
Zip                       float64
PoliceDistrict              int64
Location                   object
dtype: object

In [78]:
# check if any records are duplicates
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
2995    False
2996    False
2997    False
2998    False
2999    False
Length: 3000, dtype: bool

In [80]:
df.duplicated().any()

False

In [81]:
# check duplicates in a particular column
df.duplicated('TypeText')

0       False
1       False
2       False
3        True
4        True
        ...  
2995     True
2996     True
2997     True
2998     True
2999     True
Length: 3000, dtype: bool

In [82]:
df.duplicated('TypeText').any()

True

In [83]:
# count how many duplicates
df.duplicated('TypeText').sum()

2969

In [84]:
# remove duplicated rows
df.drop_duplicates()

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
0,A0002815,17J,JUVENILE ATTACHMENT,1F,17J,JUVENILE ATTACHMENT,1F,37369000,3513814,01/01/2015 12:09:56 AM,...,01/01/2015 12:09:56 AM,01/01/2015 03:06:28 AM,NAT,Necessary Action Taken,Y,8A03,Convention Center Blvd & Canal St,70130.0,8,"(0.0, 0.0)"
1,A0003715,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,01/01/2015 12:12:26 AM,...,01/01/2015 12:17:31 AM,01/01/2015 12:27:28 AM,NAT,Necessary Action Taken,N,1G02,S Broad Ave & Tulane Ave,70119.0,1,"(0.0, 0.0)"
2,A0007115,103M,MENTAL PATIENT,1A,21,COMPLAINT OTHER,1A,37369000,3513814,01/01/2015 12:31:43 AM,...,01/01/2015 01:21:31 AM,01/01/2015 02:24:28 AM,NAT,Necessary Action Taken,N,5M04,033XX Florida Ave,70117.0,5,"(0.0, 0.0)"
3,A0007415,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,01/01/2015 12:33:56 AM,...,01/01/2015 12:33:56 AM,01/01/2015 12:41:16 AM,NAT,Necessary Action Taken,Y,8E01,Royal St & Orleans Ave,70116.0,8,"(0.0, 0.0)"
4,A0012615,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,01/01/2015 01:00:28 AM,...,01/01/2015 01:00:28 AM,01/01/2015 01:22:41 AM,NAT,Necessary Action Taken,Y,8D01,Bourbon St & Canal St,70112.0,8,"(0.0, 0.0)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,C3280215,103M,MENTAL PATIENT,1A,103M,MENTAL PATIENT,1A,37369000,3513814,03/27/2015 01:53:32 AM,...,03/27/2015 01:53:32 AM,03/27/2015 02:19:32 AM,NAT,Necessary Action Taken,Y,1E04,Bienville St & N Rampart St,70112.0,1,"(0.0, 0.0)"
2996,C3283415,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,03/27/2015 02:57:40 AM,...,03/27/2015 02:57:40 AM,03/27/2015 03:02:29 AM,NAT,Necessary Action Taken,Y,1G02,024XX Tulane Ave,70119.0,1,"(0.0, 0.0)"
2997,C3283515,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,03/27/2015 02:58:05 AM,...,03/27/2015 02:58:05 AM,03/27/2015 03:16:22 AM,NAT,Necessary Action Taken,Y,8A03,X Canal St,70130.0,8,"(0.0, 0.0)"
2998,C3286415,43B,SEXUAL BATTERY,2B,43B,SEXUAL BATTERY,2B,37369000,3513814,03/27/2015 04:07:53 AM,...,03/27/2015 04:07:53 AM,03/27/2015 04:08:10 AM,RTF,REPORT TO FOLLOW,Y,8D05,005XX Bourbon St,70112.0,8,"(0.0, 0.0)"


In [85]:
# Drop rows whose TypeText value has already appeared
# By default the first row of each duplicate group is kept (keep='first')
df.drop_duplicates(['TypeText'])

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
0,A0002815,17J,JUVENILE ATTACHMENT,1F,17J,JUVENILE ATTACHMENT,1F,37369000,3513814,01/01/2015 12:09:56 AM,...,01/01/2015 12:09:56 AM,01/01/2015 03:06:28 AM,NAT,Necessary Action Taken,Y,8A03,Convention Center Blvd & Canal St,70130.0,8,"(0.0, 0.0)"
1,A0003715,24,MEDICAL,2B,24,MEDICAL,2B,37369000,3513814,01/01/2015 12:12:26 AM,...,01/01/2015 12:17:31 AM,01/01/2015 12:27:28 AM,NAT,Necessary Action Taken,N,1G02,S Broad Ave & Tulane Ave,70119.0,1,"(0.0, 0.0)"
2,A0007115,103M,MENTAL PATIENT,1A,21,COMPLAINT OTHER,1A,37369000,3513814,01/01/2015 12:31:43 AM,...,01/01/2015 01:21:31 AM,01/01/2015 02:24:28 AM,NAT,Necessary Action Taken,N,5M04,033XX Florida Ave,70117.0,5,"(0.0, 0.0)"
14,A0038015,29S,SUICIDE,2A,29S,SUICIDE,2A,37369000,3513814,01/01/2015 04:11:08 AM,...,01/01/2015 04:28:56 AM,01/01/2015 04:47:05 AM,RTF,REPORT TO FOLLOW,N,8I02,002XX Loyola Ave,70112.0,8,"(0.0, 0.0)"
30,A0062915,102,CRUELTY TO ANIMALS,1A,102,CRUELTY TO ANIMALS,1A,3694902,559833,01/01/2015 10:56:42 AM,...,,01/01/2015 08:15:40 PM,UNF,UNFOUNDED,N,7D02,Approx Loc:6027 Kuebel Dr,70126.0,7,"(30.032934, -90.02473)"
46,A0097115,21J,MISSING JUVENILE,2A,21J,MISSING JUVENILE,2A,37369000,3513814,01/01/2015 05:25:31 PM,...,,01/01/2015 05:26:33 PM,VOI,VOID,N,3T02,044XX Eastern St,70122.0,3,"(0.0, 0.0)"
50,A0113015,42,AGGRAVATED RAPE,2A,21,COMPLAINT OTHER,2A,37369000,3513814,01/01/2015 07:50:02 PM,...,01/01/2015 08:07:49 PM,01/01/2015 11:06:18 PM,RTF,REPORT TO FOLLOW,N,3V01,040XX Athis Ct,70126.0,3,"(0.0, 0.0)"
69,A0221015,21,COMPLAINT OTHER,1H,21,COMPLAINT OTHER,1H,37369000,3513814,01/02/2015 06:16:26 PM,...,01/02/2015 06:16:26 PM,01/02/2015 06:50:10 PM,NAT,Necessary Action Taken,Y,3D02,042XX D'Hemecourt St,70119.0,0,"(0.0, 0.0)"
108,A0327415,18,TRAFFIC INCIDENT,1H,18,TRAFFIC INCIDENT,1H,37369000,3513814,01/03/2015 06:20:14 PM,...,01/03/2015 06:20:14 PM,01/03/2015 07:06:11 PM,NAT,Necessary Action Taken,Y,,Tole & S Lope,,0,"(0.0, 0.0)"
168,A0600215,43,SIMPLE RAPE,2D,21,COMPLAINT OTHER,1H,37369000,3513814,01/06/2015 09:16:32 AM,...,01/06/2015 09:16:32 AM,01/06/2015 10:25:25 AM,RTF,REPORT TO FOLLOW,Y,6Q01,022XX Marengo St,70115.0,6,"(0.0, 0.0)"


Alternatively, you can add ‘keep’ and indicate
whether you’d like to keep:
* keep='first', keep the first occurrence (default)
* keep='last', keep the last occurrence
* keep=False, drop all the duplicates

In [86]:
df.drop_duplicates(['TypeText'], keep='last')

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
30,A0062915,102,CRUELTY TO ANIMALS,1A,102,CRUELTY TO ANIMALS,1A,3694902,559833,01/01/2015 10:56:42 AM,...,,01/01/2015 08:15:40 PM,UNF,UNFOUNDED,N,7D02,Approx Loc:6027 Kuebel Dr,70126.0,7,"(30.032934, -90.02473)"
175,A0612115,20,AUTO ACCIDENT,2A,20,AUTO ACCIDENT,2A,37369000,3513814,01/06/2015 11:03:38 AM,...,,01/06/2015 02:26:29 PM,RTF,REPORT TO FOLLOW,N,,US90B E,,0,"(0.0, 0.0)"
463,A1781715,35,SIMPLE BATTERY,1D,21,COMPLAINT OTHER,1H,37369000,3513814,01/15/2015 05:28:12 PM,...,01/15/2015 05:28:12 PM,01/15/2015 07:46:30 PM,RTF,REPORT TO FOLLOW,Y,,042XX D'Hemecourt St,,0,"(0.0, 0.0)"
639,A2459615,62A,"BURGLAR ALARM, SILENT",2C,62A,"BURGLAR ALARM, SILENT",2C,37369000,3513814,01/21/2015 09:58:55 AM,...,01/21/2015 10:07:33 AM,01/21/2015 10:26:52 AM,NAT,Necessary Action Taken,N,,015XX Orleans Ave,,0,"(0.0, 0.0)"
868,A3306115,17R,WARR STOP WITH RELEASE,0G,17R,WARR STOP WITH RELEASE,0G,37369000,3513814,01/27/2015 10:39:44 PM,...,01/27/2015 10:39:44 PM,01/27/2015 11:02:05 PM,NAT,Necessary Action Taken,Y,6I04,028XX S Claiborne Ave,70115.0,0,"(0.0, 0.0)"
1082,B0233615,107,SUSPICIOUS PERSON,2A,107,SUSPICIOUS PERSON,2A,37369000,3513814,02/02/2015 08:30:54 PM,...,02/02/2015 08:30:54 PM,02/02/2015 08:38:20 PM,NAT,Necessary Action Taken,Y,8D06,006XX Bourbon St,70112.0,0,"(0.0, 0.0)"
1127,G0082715,21R,RECOVERY OF VEHICLE,1G,21,COMPLAINT OTHER,1H,0,0,07/01/2015 04:34:33 PM,...,07/01/2015 04:34:33 PM,07/01/2015 06:05:08 PM,RTF,REPORT TO FOLLOW,Y,7I01,130XX Granville,,0,
1424,B1393515,67S,SHOPLIFTING,2A,67S,SHOPLIFTING,2A,3688025,539289,02/12/2015 11:54:30 AM,...,02/12/2015 11:56:27 AM,02/12/2015 01:15:44 PM,RTF,REPORT TO FOLLOW,N,5G03,018XX Almonaster Ave,70117.0,5,"(29.97666, -90.047191)"
2015,B3366815,42U,AGGRAVATED RAPE UNFOUNDED BY SPECIAL VICTIMS O...,1A,42,AGGRAVATED RAPE,2A,37369000,3513814,02/28/2015 08:40:38 AM,...,02/28/2015 08:45:24 AM,02/28/2015 09:29:14 AM,RTF,REPORT TO FOLLOW,N,6Q05,026XX Danneel St,70113.0,6,"(0.0, 0.0)"
2088,C0125015,20I,AUTO ACCIDENT WITH INJURY,2B,20,AUTO ACCIDENT,2A,3698831,557938,03/02/2015 05:17:04 AM,...,,03/02/2015 05:22:16 AM,DUP,DUPLICATE,N,7D05,Highrise & I-10 W,,7,"(30.027598, -90.012381)"


In [87]:
df.drop_duplicates(['TypeText'], keep=False)

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
30,A0062915,102,CRUELTY TO ANIMALS,1A,102,CRUELTY TO ANIMALS,1A,3694902,559833,01/01/2015 10:56:42 AM,...,,01/01/2015 08:15:40 PM,UNF,UNFOUNDED,N,7D02,Approx Loc:6027 Kuebel Dr,70126.0,7,"(30.032934, -90.02473)"
175,A0612115,20,AUTO ACCIDENT,2A,20,AUTO ACCIDENT,2A,37369000,3513814,01/06/2015 11:03:38 AM,...,,01/06/2015 02:26:29 PM,RTF,REPORT TO FOLLOW,N,,US90B E,,0,"(0.0, 0.0)"
463,A1781715,35,SIMPLE BATTERY,1D,21,COMPLAINT OTHER,1H,37369000,3513814,01/15/2015 05:28:12 PM,...,01/15/2015 05:28:12 PM,01/15/2015 07:46:30 PM,RTF,REPORT TO FOLLOW,Y,,042XX D'Hemecourt St,,0,"(0.0, 0.0)"
639,A2459615,62A,"BURGLAR ALARM, SILENT",2C,62A,"BURGLAR ALARM, SILENT",2C,37369000,3513814,01/21/2015 09:58:55 AM,...,01/21/2015 10:07:33 AM,01/21/2015 10:26:52 AM,NAT,Necessary Action Taken,N,,015XX Orleans Ave,,0,"(0.0, 0.0)"
868,A3306115,17R,WARR STOP WITH RELEASE,0G,17R,WARR STOP WITH RELEASE,0G,37369000,3513814,01/27/2015 10:39:44 PM,...,01/27/2015 10:39:44 PM,01/27/2015 11:02:05 PM,NAT,Necessary Action Taken,Y,6I04,028XX S Claiborne Ave,70115.0,0,"(0.0, 0.0)"
1082,B0233615,107,SUSPICIOUS PERSON,2A,107,SUSPICIOUS PERSON,2A,37369000,3513814,02/02/2015 08:30:54 PM,...,02/02/2015 08:30:54 PM,02/02/2015 08:38:20 PM,NAT,Necessary Action Taken,Y,8D06,006XX Bourbon St,70112.0,0,"(0.0, 0.0)"
1127,G0082715,21R,RECOVERY OF VEHICLE,1G,21,COMPLAINT OTHER,1H,0,0,07/01/2015 04:34:33 PM,...,07/01/2015 04:34:33 PM,07/01/2015 06:05:08 PM,RTF,REPORT TO FOLLOW,Y,7I01,130XX Granville,,0,
1424,B1393515,67S,SHOPLIFTING,2A,67S,SHOPLIFTING,2A,3688025,539289,02/12/2015 11:54:30 AM,...,02/12/2015 11:56:27 AM,02/12/2015 01:15:44 PM,RTF,REPORT TO FOLLOW,N,5G03,018XX Almonaster Ave,70117.0,5,"(29.97666, -90.047191)"
2088,C0125015,20I,AUTO ACCIDENT WITH INJURY,2B,20,AUTO ACCIDENT,2A,3698831,557938,03/02/2015 05:17:04 AM,...,,03/02/2015 05:22:16 AM,DUP,DUPLICATE,N,7D05,Highrise & I-10 W,,7,"(30.027598, -90.012381)"
2199,C0512715,21N,NOISE COMPLAINT,1E,21N,NOISE COMPLAINT,1E,3664337,534925,03/05/2015 01:28:04 AM,...,03/05/2015 02:54:41 AM,03/05/2015 02:54:59 AM,UNF,UNFOUNDED,N,2S02,088XX Blk Fig St,70118.0,2,"(29.96537, -90.122149)"
