## 1. Removing Duplicates

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seven-row demo frame. The last two rows hold the same ('two', 4) pair,
# which gives the de-duplication calls something to find.
dataframe = pd.DataFrame(
    {
        'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)

In [3]:
# Display the frame as constructed.
dataframe

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [4]:
# duplicated() flags every row whose values exactly repeat an EARLIER row;
# the first occurrence stays False (row 5 is False, its repeat in row 6 is True).
dataframe.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [5]:
# drop_duplicates() returns a new frame without the rows that duplicated() flags;
# `dataframe` itself still holds all seven rows afterwards.
dataframe.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [6]:
# With no arguments drop_duplicates() compares every column of a row.
# Pass `subset` to decide duplicates from the listed column(s) only;
# the first row seen for each 'k2' value is the one kept.

dataframe.drop_duplicates(subset=['k2'])

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [7]:
# keep='last' retains the final row of each group of 'k2' duplicates
# instead of the first one.
dataframe.drop_duplicates(subset=['k2'], keep='last')

Unnamed: 0,k1,k2
1,two,1
2,one,2
4,one,3
6,two,4


## 2. Transforming Data Using a Function or Mapping

In [8]:
# Food log used for the mapping examples. Capitalisation of the names is
# inconsistent ('Pastrami', 'Bacon'); 'ounces' is a float column.
data = pd.DataFrame(
    {
        'food': [
            'bacon', 'pulled pork', 'bacon',
            'Pastrami', 'corned beef', 'Bacon',
            'pastrami', 'honey ham', 'nova lox',
        ],
        'ounces': [4.0, 3.0, 12.0, 6.0, 7.5, 8.0, 3.0, 5.0, 6.0],
    }
)

In [9]:
# Display the frame as constructed.
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [10]:
# Food name -> animal it comes from. Every key is lowercase, so food names
# have to be lowercased before they are looked up here.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [11]:
# Lowercase the names first: 'Pastrami' and 'Bacon' are capitalised in the data,
# while the meat_to_animal keys are all lowercase.
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

### 2.1 Mapping

In [12]:
# Series.map with a dict looks each value up as a key; the result is stored
# on `data` as a new 'animal' column.
data['animal'] = lowercased.map(meat_to_animal)

In [13]:
# The original 'food' spelling is kept; only the lookup used the lowercased copy.
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


## 3. Replacing Values

In [14]:
# NOTE: `data` is rebound here. From this cell on it is an integer Series,
# not the food frame from the previous section.
sample_values = [1, 2, 99, 100, 99, 5]
data = pd.Series(sample_values)
data

0      1
1      2
2     99
3    100
4     99
5      5
dtype: int64

In [15]:
# Swap every 99 for 33; replace() hands back a new Series and leaves `data` alone.
data.replace(to_replace=99, value=33)

0      1
1      2
2     33
3    100
4     33
5      5
dtype: int64

In [16]:
# A list of targets with a single value: 99 and 100 both become 18.
data.replace(to_replace=[99, 100], value=18)

0     1
1     2
2    18
3    18
4    18
5     5
dtype: int64

In [17]:
# Two lists are paired by position: 99 -> 10 and 100 -> 20.
data.replace(to_replace=[99, 100], value=[10, 20])

0     1
1     2
2    10
3    20
4    10
5     5
dtype: int64

In [18]:
# Same replacement written as a dict of {old: new} pairs.
data.replace({99: 10, 100: 20})

0     1
1     2
2    10
3    20
4    10
5     5
dtype: int64

## 4. Renaming Axis Indexes

In [19]:
# 3x4 grid of the integers 0..11 with place names as row labels and number
# words as column names. NOTE: this rebinds `data` once more.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [20]:
def transform(x):
    """Return the first four characters of ``x``, upper-cased.

    Used to shorten the row labels, e.g. 'Colorado' -> 'COLO'. A string
    shorter than four characters is upper-cased whole, and a space inside
    the first four characters is kept ('New York' -> 'NEW ').
    """
    return x[:4].upper()
# Index.map applies the function to every row label and returns a new Index;
# `data` keeps its old labels until the result is assigned back.
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [21]:
# In place: assign the mapped labels back to data.index.
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### 4.1 rename()

In [22]:
# rename() takes a function per axis and returns a relabelled copy:
# title-case for the row labels, upper-case for the column names.
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [23]:
# Dicts rename only the labels they list; every other label is left as it is.
data.rename(index={'COLO': 'California'},
            columns={'three': 'hundered'})

Unnamed: 0,one,two,hundered,four
OHIO,0,1,2,3
California,4,5,6,7
NEW,8,9,10,11


In [24]:
# Still the previous labels: the rename() calls above returned new frames.
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [25]:
# inplace=True relabels `data` itself, so the cell displays nothing.
# The same result without mutation is: data = data.rename(index=..., columns=...)
data.rename(index={'COLO': 'LA'},
            columns={'three': 'hundered'},
            inplace=True)

In [26]:
# The labels set by the in-place rename are now part of `data`.
data

Unnamed: 0,one,two,hundered,four
OHIO,0,1,2,3
LA,4,5,6,7
NEW,8,9,10,11
