In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({
    'Sex' : ['M','F','F','D','?'],
    'Age' : [29, 30, 24, 290, 25]
})

In [3]:
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [4]:
#We have invalid data. Not Missing, but values don't make sense

In [6]:
#Find Unique -> Notice, and Identify then we can handle by remove/replace. So for sex, is a categorical M/F, we look at variety
df['Sex'].unique()


array(['M', 'F', 'D', '?'], dtype=object)

In [7]:
df['Sex'].value_counts()

F    2
M    1
D    1
?    1
Name: Sex, dtype: int64

In [9]:
#So, if we are sure, that D for eg, was a typo, we can reolace
df['Sex'].replace('D', 'F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [10]:
#Replace also accepts dict
df['Sex'].replace({'D':'F', 'N':'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [13]:
#If many cols to replace, do it at DF Level

df.replace({
    'Sex': {
        'D' : 'F',
        'N' : 'M'
        },
    
    'Age' : {
            290 :29
        }
    

    

    })

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [14]:
#Age cannot always be explicityly replaced, but we can remove extra zero from age col
#First, set not possible age. So 100 not valid

df[df['Age']>100]

Unnamed: 0,Sex,Age
3,D,290


In [15]:
#Here we divide by 10
df.loc[df['Age']> 100, 'Age'] = df.loc[df['Age']>100, 'Age']/10

In [16]:
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


In [17]:
#Handling Duplicates
#Series - Only one Ambassador per country
ambassadors = pd.Series([
    'France',
    'United Kingdom',
    'United Kingdom',
    'Italy',
    'Germany',
    'Germany',
    'Germany',
], index=[
    'Gérard Araud',
    'Kim Darroch',
    'Peter Westmacott',
    'Armando Varricchio',
    'Peter Wittig',
    'Peter Ammon',
    'Klaus Scharioth '
])

In [18]:
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [23]:
#Methods: duplicated(), drop_duplicates()
ambassadors.duplicated(keep=False)

#Note, first entry of what is considered duplicate, is not regarded as duplicate. So, first is non-duplicate. can change this with keep
#Keep is 'last' considers last occurence as non-duplicate

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [25]:
ambassadors[ambassadors.duplicated(keep = 'last')]

Kim Darroch     United Kingdom
Peter Wittig           Germany
Peter Ammon            Germany
dtype: object

In [27]:
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [26]:
ambassadors.drop_duplicates() # keeps first occurence

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [32]:
ambassadors.drop_duplicates(keep = False) # Removes all dupls

Gérard Araud          France
Armando Varricchio     Italy
dtype: object

In [33]:
ambassadors.drop_duplicates(keep = 'last')

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [34]:
#DataFrames
players = pd.DataFrame({
    'Name': [
        'Kobe Bryant',
        'LeBron James',
        'Kobe Bryant',
        'Carmelo Anthony',
        'Kobe Bryant',
    ],
    'Pos': [
        'SG',
        'SF',
        'SG',
        'SF',
        'SF'
    ]
})

In [35]:
players
#Kobe is duplicateds
#Also, all col values should be dups
#Can customise with subset

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [36]:
players.duplicated()


0    False
1    False
2     True
3    False
4    False
dtype: bool

In [37]:
players.duplicated(subset = ['Name'])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [38]:
players.duplicated(subset=['Name'], keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [39]:
#Drop duplicates
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [40]:
players.drop_duplicates(subset =['Name'])

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [41]:
players.drop_duplicates(subset =['Name'], keep ='last')

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [42]:
#Text Handling

df = pd.DataFrame({
    'Data': [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2'
]})

In [43]:
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [44]:
#Columns are Year, Sex, County and num Childen
df['Data'].str.split('_')

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object

In [48]:
df = df['Data'].str.split('_', expand = True)

In [50]:
df

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [51]:
df.columns = ['Year', 'Sex', 'Country', 'Num Children']

In [52]:
df

Unnamed: 0,Year,Sex,Country,Num Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [53]:
df['Year'].str.contains('\?')
#Regex arg, escape ? as it has special meaning , leters don't need escape

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [54]:
df['Country'].str.contains('U')

0     True
1     True
2     True
3    False
4    False
Name: Country, dtype: bool

In [55]:
#Remove blank space
df['Country'].str.strip()

0      US
1      UK
2      US
3      IT
4    I  T
Name: Country, dtype: object

In [56]:
#Or with replace
df['Country'].str.replace(' ', '')

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

In [57]:
#replace and contains take regex - can replace in bulk
df['Year'].str.replace(r'(?P\d{4})\?', lambda m: m.group('year'))

#As you can see, all these string/text-related operations are applied over the str attribute of the series. 
#That's because they have a special place in Series handling and you can read more about it here.

  df['Year'].str.replace(r'(?P\d{4})\?', lambda m: m.group('year'))


error: unknown extension ?P\d at position 1