In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame({
    'Sex': ['M', 'F', 'F', 'D', '?'],
    'Age': [29, 30, 24, 290, 25],
})
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [3]:
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [4]:
df['Sex'].value_counts()

Unnamed: 0_level_0,count
Sex,Unnamed: 1_level_1
F,2
M,1
D,1
?,1


In [5]:
df['Sex'].replace('D', 'F')

Unnamed: 0,Sex
0,M
1,F
2,F
3,F
4,?


In [6]:
df['Sex'].replace({'D': 'F', 'N': 'M'})

Unnamed: 0,Sex
0,M
1,F
2,F
3,F
4,?


In [7]:
df.replace({
    'Sex': {
        'D': 'F',
        'N': 'M'
    },
    'Age': {
        290: 29
    }
})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [8]:
df[df['Age'] > 100]

Unnamed: 0,Sex,Age
3,D,290


In [9]:
df.loc[df['Age'] > 100, 'Age'] = df.loc[df['Age'] > 100, 'Age'] / 10

In [10]:
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


In [12]:
ambassadors = pd.Series([
    'France',
    'United Kingdom',
    'United Kingdom',
    'Italy',
    'Germany',
    'Germany',
    'Germany',
], index=[
    'Gérard Araud',
    'Kim Darroch',
    'Peter Westmacott',
    'Armando Varricchio',
    'Peter Wittig',
    'Peter Ammon',
    'Klaus Scharioth '
])

In [13]:
ambassadors

Unnamed: 0,0
Gérard Araud,France
Kim Darroch,United Kingdom
Peter Westmacott,United Kingdom
Armando Varricchio,Italy
Peter Wittig,Germany
Peter Ammon,Germany
Klaus Scharioth,Germany


In [14]:
ambassadors.duplicated()

Unnamed: 0,0
Gérard Araud,False
Kim Darroch,False
Peter Westmacott,True
Armando Varricchio,False
Peter Wittig,False
Peter Ammon,True
Klaus Scharioth,True


In [15]:
ambassadors.duplicated(keep='last')

Unnamed: 0,0
Gérard Araud,False
Kim Darroch,True
Peter Westmacott,False
Armando Varricchio,False
Peter Wittig,True
Peter Ammon,True
Klaus Scharioth,False


In [16]:
ambassadors.duplicated(keep=False)

Unnamed: 0,0
Gérard Araud,False
Kim Darroch,True
Peter Westmacott,True
Armando Varricchio,False
Peter Wittig,True
Peter Ammon,True
Klaus Scharioth,True


In [17]:
ambassadors.drop_duplicates()

Unnamed: 0,0
Gérard Araud,France
Kim Darroch,United Kingdom
Armando Varricchio,Italy
Peter Wittig,Germany


In [18]:
ambassadors.drop_duplicates(keep='last')

Unnamed: 0,0
Gérard Araud,France
Peter Westmacott,United Kingdom
Armando Varricchio,Italy
Klaus Scharioth,Germany


In [19]:
ambassadors.drop_duplicates(keep=False)

Unnamed: 0,0
Gérard Araud,France
Armando Varricchio,Italy


In [20]:
players = pd.DataFrame({
    'Name': [
        'Kobe Bryant',
        'LeBron James',
        'Kobe Bryant',
        'Carmelo Anthony',
        'Kobe Bryant',
    ],
    'Pos': [
        'SG',
        'SF',
        'SG',
        'SF',
        'SF'
    ]
})

In [21]:
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [22]:
players.duplicated()

Unnamed: 0,0
0,False
1,False
2,True
3,False
4,False


In [23]:
players.duplicated(subset=['Name'])

Unnamed: 0,0
0,False
1,False
2,True
3,False
4,True


In [24]:
players.duplicated(subset=['Name'], keep='last')

Unnamed: 0,0
0,True
1,False
2,True
3,False
4,False


In [25]:
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [26]:
players.drop_duplicates(subset=['Name'])

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [27]:
players.drop_duplicates(subset=['Name'], keep='last')

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [28]:
df = pd.DataFrame({
    'Data': [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2'
]})

In [29]:
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [30]:
df['Data'].str.split('_')

Unnamed: 0,Data
0,"[1987, M, US , 1]"
1,"[1990?, M, UK, 1]"
2,"[1992, F, US, 2]"
3,"[1970?, M, IT, 1]"
4,"[1985, F, I T, 2]"


In [31]:
df['Data'].str.split('_', expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [32]:
df = df['Data'].str.split('_', expand=True)

In [33]:
df.columns = ['Year', 'Sex', 'Country', 'No Children']

In [34]:
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [35]:
df['Year'].str.contains('\?')

Unnamed: 0,Year
0,False
1,True
2,False
3,True
4,False


In [36]:
df['Country'].str.contains('U')

Unnamed: 0,Country
0,True
1,True
2,True
3,False
4,False


In [37]:
df['Country'].str.strip()

Unnamed: 0,Country
0,US
1,UK
2,US
3,IT
4,I T


In [38]:
df['Country'].str.replace(' ', '')

Unnamed: 0,Country
0,US
1,UK
2,US
3,IT
4,IT


In [40]:
df['Year'].str.replace(r'(?P<year>\d{4})\?', lambda m: m.group('year'))


ValueError: Cannot use a callable replacement when regex=False