In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small example frame with a categorical 'Sex' column and a numeric 'Age'
# column; each tuple below is one row.
rows = [
    ('M', 29),
    ('F', 30),
    ('F', 24),
    ('D', 290),
    ('?', 25),
]
df = pd.DataFrame(rows, columns=['Sex', 'Age'])

In [3]:
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [5]:
# For categorical data such as sex, begin by checking the unique values.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [6]:
# Count how often each distinct value occurs in the 'Sex' column.
df['Sex'].value_counts()

F    2
M    1
?    1
D    1
Name: Sex, dtype: int64

In [7]:
# Unexpected values can be replaced; here every 'D' becomes 'M'.
# The result is only displayed, it is not assigned back to df.
df['Sex'].replace('D','M')

0    M
1    F
2    F
3    M
4    ?
Name: Sex, dtype: object

In [8]:
# replace() can also receive a dictionary mapping old values to new ones.
# ('N' does not occur in this column, so that entry changes nothing here.)
df['Sex'].replace({'D':'F','N':'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [14]:
# To fix several columns at once, call replace() on the DataFrame with a
# nested mapping of the form {column: {old_value: new_value}}.
replacements_by_column = {
    'Sex': {'D': 'F', 'N': 'M', '?': 'M'},
    'Age': {290: 29},
}
df.replace(replacements_by_column)


Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,M,25


In [15]:
# For numeric data, look for implausible values such as ages above 100
# (290 looks like 29 with an extra zero).
df[df['Age']>100]

Unnamed: 0,Sex,Age
3,D,290


In [17]:
# Select the rows whose age exceeds the threshold and, by naming the column in
# .loc, overwrite only those cells with a tenth of their value.
# NOTE(review): '/' is true division, so the replacement values are floats;
# whether the whole 'Age' column ends up float may depend on the pandas version.
age_too_high = df['Age'] > 100
df.loc[age_too_high, 'Age'] = df.loc[age_too_high, 'Age'] / 10
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,D,29.0
4,?,25.0


# Duplicate values

In [18]:
# Only one ambassador per country is needed, so repeated countries are the
# duplicates of interest. The values are countries and the index holds the
# ambassadors' names.
ambassadors = pd.Series(
    [
        'France',
        'United Kingdom',
        'United Kingdom',
        'Italy',
        'Germany',
        'Germany',
        'Germany',
    ],
    index=[
        'Gérard Araud',
        'Kim Darroch',
        'Peter Westmacott',
        'Armando Varricchio',
        'Peter Wittig',
        'Peter Ammon',
        # No trailing space in the label, so a lookup by name matches exactly.
        'Klaus Scharioth',
    ],
)

In [19]:
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [20]:
# duplicated() flags each value that has already appeared earlier in the Series.
ambassadors.duplicated()

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [21]:
# By default the first occurrence is not treated as a duplicate but later ones are.
# The keep parameter changes this: keep='last' leaves only the last occurrence unflagged.
ambassadors.duplicated(keep='last')

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [22]:
# keep=False marks every occurrence of a repeated value as a duplicate.
ambassadors.duplicated(keep=False)

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [24]:
# drop_duplicates() also accepts the keep parameter; by default the first occurrence is kept.
ambassadors.drop_duplicates()

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [25]:
# keep='last' keeps the last occurrence of each repeated value instead of the first.
ambassadors.drop_duplicates(keep='last')

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [26]:
# keep=False drops every value that occurs more than once.
ambassadors.drop_duplicates(keep=False)

Gérard Araud          France
Armando Varricchio     Italy
dtype: object

# Duplicates in DataFrames

In [28]:
# Player names with their positions; some names (and some whole rows) repeat.
names = ['Kobe Bryant', 'LeBron James', 'Kobe Bryant', 'Carmelo Anthony', 'Kobe Bryant']
positions = ['SG', 'SF', 'SG', 'SF', 'SF']
players = pd.DataFrame(dict(Names=names, Pos=positions))

In [29]:
players

Unnamed: 0,Names,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [30]:
# Without a subset, a row is flagged only if all of its columns match an earlier row.
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [33]:
# Use subset to choose which column(s) are compared when looking for duplicates.
players.duplicated(subset=["Names"])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [35]:
# The same keep rules still apply when a subset is given.
players.duplicated(subset=['Names'],keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [36]:
# Drop rows that repeat an earlier row in every column.
players.drop_duplicates()

Unnamed: 0,Names,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [38]:
# Drop rows whose 'Names' value has already appeared, keeping the first occurrence.
players.drop_duplicates(subset=['Names'])

Unnamed: 0,Names,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [39]:
# Drop rows with a repeated 'Names' value, keeping the last occurrence of each name.
players.drop_duplicates(subset=["Names"],keep='last')

Unnamed: 0,Names,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


# Text Handling

In [40]:
# Raw records packed into single '_'-separated strings; some contain stray
# '?' characters and spaces that the following cells clean up.
raw_records = [
    '1987_M_US _1',
    '1990?_M_UK_1',
    '1992_F_US_2',
    '1970?_M_   IT_1',
    '1985_F_I  T_2',
]
df = pd.Series(raw_records, name='Data').to_frame()

In [41]:
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [42]:
# Split each string on '_' into a list of its parts.
df['Data'].str.split('_')

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object

In [43]:
# expand=True spreads the split pieces across separate columns.
# This rebinds df to the split result, so the 'Data' column is no longer
# available afterwards.
df = df['Data'].str.split(pat='_', expand=True)

In [44]:
df

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [45]:
# Give the positional columns produced by the split meaningful names.
column_names = ['Year', 'Sex', 'Country', 'No of children']
df.columns = column_names

In [46]:
df

Unnamed: 0,Year,Sex,Country,No of children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [48]:
# Check whether a column contains a given value with the str.contains() method.
# contains() takes a regex pattern as its first argument, so the literal '?'
# has to be escaped. A raw string is used so the backslash reaches the regex
# engine as written instead of forming an invalid Python escape sequence.
df['Year'].str.contains(r'\?')

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [49]:
# Flag the country values that contain a space.
df['Country'].str.contains(' ')

0     True
1    False
2    False
3     True
4     True
Name: Country, dtype: bool

In [50]:
# Leading and trailing spaces can be removed with strip() (lstrip() and rstrip() also exist);
# spaces inside the value, as in 'I  T', are left untouched.
df['Country'].str.strip()

0      US
1      UK
2      US
3      IT
4    I  T
Name: Country, dtype: object

In [51]:
# Replacing ' ' with '' removes every space, including those inside the value.
df['Country'].str.replace(" ","")

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

In [54]:
# Drop the trailing '?' from years such as '1990?': the named group captures
# the four digits and the callable returns only that group.
# A callable replacement only works in regex mode, and recent pandas releases
# no longer enable regex mode by default, so it is requested explicitly.
df['Year'].str.replace(
    r'(?P<year>\d{4})\?',
    lambda match: match.group('year'),
    regex=True,
)

0    1987
1    1990
2    1992
3    1970
4    1985
Name: Year, dtype: object