In [2]:
import pandas as pd
import numpy as np

## Data Loading

### Loading JSON Data

In [3]:
obj = """
{
    "name": "Steve",
    "places_lived": ["India", "USA", "Spain", "Germany"],
    "pet": "None",
    "sibling": [
        {"name": "Henry", "age": 30, "pets": ["Alpha", "Beta"]},
        {"name": "Katie", "age": 38, "pets": ["Sixes", "Bru", "Cisco"]}
    ]
}
"""

In [4]:
obj

'\n{\n    "name": "Steve",\n    "places_lived": ["India", "USA", "Spain", "Germany"],\n    "pet": "None",\n    "sibling": [\n        {"name": "Henry", "age": 30, "pets": ["Alpha", "Beta"]},\n        {"name": "Katie", "age": 38, "pets": ["Sixes", "Bru", "Cisco"]}\n    ]\n}\n'

In [5]:
print(obj)


{
    "name": "Steve",
    "places_lived": ["India", "USA", "Spain", "Germany"],
    "pet": "None",
    "sibling": [
        {"name": "Henry", "age": 30, "pets": ["Alpha", "Beta"]},
        {"name": "Katie", "age": 38, "pets": ["Sixes", "Bru", "Cisco"]}
    ]
}



In [6]:
import json

In [7]:
result = json.loads(obj)

In [8]:
print(result)

{'name': 'Steve', 'places_lived': ['India', 'USA', 'Spain', 'Germany'], 'pet': 'None', 'sibling': [{'name': 'Henry', 'age': 30, 'pets': ['Alpha', 'Beta']}, {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Bru', 'Cisco']}]}


In [9]:
print(type(result))

<class 'dict'>


In [10]:
print(type(obj))  # original data

<class 'str'>


### Back To Our Original Data

In [11]:
as_json = json.dumps(result)

In [12]:
print(as_json)

{"name": "Steve", "places_lived": ["India", "USA", "Spain", "Germany"], "pet": "None", "sibling": [{"name": "Henry", "age": 30, "pets": ["Alpha", "Beta"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Bru", "Cisco"]}]}


In [13]:
print(type(as_json))

<class 'str'>


### Accessing Loading Data

In [14]:
result['sibling']

[{'name': 'Henry', 'age': 30, 'pets': ['Alpha', 'Beta']},
 {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Bru', 'Cisco']}]

In [15]:
result['sibling'][0]

{'name': 'Henry', 'age': 30, 'pets': ['Alpha', 'Beta']}

In [16]:
result['sibling'][0]['pets']

['Alpha', 'Beta']

### Loading JSON Data Into DataFrame

In [18]:
pd.DataFrame(result['sibling'])

Unnamed: 0,name,age,pets
0,Henry,30,"[Alpha, Beta]"
1,Katie,38,"[Sixes, Bru, Cisco]"


In [19]:
pd.DataFrame(result['sibling'], columns=['name','age'])

Unnamed: 0,name,age
0,Henry,30
1,Katie,38


### Loading JSON File

In [20]:
js_df = pd.read_json('data.json')

In [21]:
js_df

Unnamed: 0,A,B,message
0,1,12,hello
1,5,3,world
2,9,7,foo
3,13,15,test
4,5,6,world
5,13,4,test
6,5,0,world


### Loading Excel File

In [22]:
exl_df = pd.read_excel('mybooks.xlsx')

In [23]:
exl_df

Unnamed: 0,sr.,name,class
0,1,abc,fy
1,2,def,ty
2,3,grt,ty
3,4,ader,sy
4,5,hola,sy


### Loading CSV File

In [24]:
csv_df = pd.read_csv('example.csv')

In [25]:
csv_df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo
3,13,14,15,16,test
4,5,6,7,8,world
5,13,14,15,16,test
6,5,6,7,8,world


## Data Cleansing & Data Preparation

### Handling Missing Data

In [26]:
str_data = pd.Series(['anaconda','spyder',np.nan,'python'])

In [27]:
str_data

0    anaconda
1      spyder
2         NaN
3      python
dtype: object

In [28]:
str_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [29]:
str_data.notna()

0     True
1     True
2    False
3     True
dtype: bool

In [30]:
str_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [31]:
str_data.notnull()

0     True
1     True
2    False
3     True
dtype: bool

### Filtering Missing Data

In [32]:
ser1 = pd.Series([1, np.nan, 3.5, np.nan, 5.4])

In [33]:
ser1

0    1.0
1    NaN
2    3.5
3    NaN
4    5.4
dtype: float64

In [34]:
ser1.dropna()

0    1.0
2    3.5
4    5.4
dtype: float64

In [35]:
ser1

0    1.0
1    NaN
2    3.5
3    NaN
4    5.4
dtype: float64

In [36]:
ser1[ser1.isna()]

1   NaN
3   NaN
dtype: float64

In [37]:
ser1[ser1.notna()]

0    1.0
2    3.5
4    5.4
dtype: float64

In [38]:
data = pd.DataFrame([
    [1, 6, 5, 3, 2],
    [3, np.nan, np.nan, np.nan, 7],
    [5, np.nan, np.nan, np.nan, np.nan],
    [np.nan, np.nan, np.nan, np.nan, np.nan],
    [np.nan, 6.5,3]
])

In [39]:
data

Unnamed: 0,0,1,2,3,4
0,1.0,6.0,5.0,3.0,2.0
1,3.0,,,,7.0
2,5.0,,,,
3,,,,,
4,,6.5,3.0,,


In [40]:
data.dropna()

Unnamed: 0,0,1,2,3,4
0,1.0,6.0,5.0,3.0,2.0


In [41]:
data.dropna(how='all')

Unnamed: 0,0,1,2,3,4
0,1.0,6.0,5.0,3.0,2.0
1,3.0,,,,7.0
2,5.0,,,,
4,,6.5,3.0,,


In [42]:
data.dropna(axis=1)

0
1
2
3
4


In [43]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2,3,4
0,1.0,6.0,5.0,3.0,2.0
1,3.0,,,,7.0
2,5.0,,,,
3,,,,,
4,,6.5,3.0,,


In [44]:
df = pd.DataFrame(np.random.randn(7,3))

In [45]:
df

Unnamed: 0,0,1,2
0,0.285425,0.505802,1.588509
1,-0.872976,1.329415,0.541907
2,-0.218799,-0.962346,1.531166
3,0.298225,-0.589677,0.808805
4,1.574851,0.616873,-1.272907
5,2.670291,-1.192176,-0.042352
6,0.803892,0.585326,1.247014


In [46]:
df.iloc[0:3,1] = np.nan

In [47]:
df

Unnamed: 0,0,1,2
0,0.285425,,1.588509
1,-0.872976,,0.541907
2,-0.218799,,1.531166
3,0.298225,-0.589677,0.808805
4,1.574851,0.616873,-1.272907
5,2.670291,-1.192176,-0.042352
6,0.803892,0.585326,1.247014


In [48]:
df.iloc[:2,2] = np.nan

In [49]:
df

Unnamed: 0,0,1,2
0,0.285425,,
1,-0.872976,,
2,-0.218799,,1.531166
3,0.298225,-0.589677,0.808805
4,1.574851,0.616873,-1.272907
5,2.670291,-1.192176,-0.042352
6,0.803892,0.585326,1.247014


In [50]:
df.fillna(0)  # throwing copy

Unnamed: 0,0,1,2
0,0.285425,0.0,0.0
1,-0.872976,0.0,0.0
2,-0.218799,0.0,1.531166
3,0.298225,-0.589677,0.808805
4,1.574851,0.616873,-1.272907
5,2.670291,-1.192176,-0.042352
6,0.803892,0.585326,1.247014


In [51]:
df

Unnamed: 0,0,1,2
0,0.285425,,
1,-0.872976,,
2,-0.218799,,1.531166
3,0.298225,-0.589677,0.808805
4,1.574851,0.616873,-1.272907
5,2.670291,-1.192176,-0.042352
6,0.803892,0.585326,1.247014


In [52]:
df.fillna({1:0,2:0.5})

Unnamed: 0,0,1,2
0,0.285425,0.0,0.5
1,-0.872976,0.0,0.5
2,-0.218799,0.0,1.531166
3,0.298225,-0.589677,0.808805
4,1.574851,0.616873,-1.272907
5,2.670291,-1.192176,-0.042352
6,0.803892,0.585326,1.247014


In [53]:
df.fillna(0.4, inplace=True)

In [54]:
df

Unnamed: 0,0,1,2
0,0.285425,0.4,0.4
1,-0.872976,0.4,0.4
2,-0.218799,0.4,1.531166
3,0.298225,-0.589677,0.808805
4,1.574851,0.616873,-1.272907
5,2.670291,-1.192176,-0.042352
6,0.803892,0.585326,1.247014


In [55]:
df = pd.DataFrame(np.random.randn(6,3))

In [56]:
df

Unnamed: 0,0,1,2
0,0.401158,0.72094,-0.197662
1,1.032774,0.810916,1.900233
2,0.327821,-0.134398,-0.261791
3,0.402401,1.133308,1.321393
4,-1.254992,-0.542844,0.427283
5,0.03557,-0.327401,0.291572


In [57]:
df.iloc[2:,1] = np.nan
df.iloc[4:,2] = np.nan

In [58]:
df

Unnamed: 0,0,1,2
0,0.401158,0.72094,-0.197662
1,1.032774,0.810916,1.900233
2,0.327821,,-0.261791
3,0.402401,,1.321393
4,-1.254992,,
5,0.03557,,


In [59]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.401158,0.72094,-0.197662
1,1.032774,0.810916,1.900233
2,0.327821,0.810916,-0.261791
3,0.402401,0.810916,1.321393
4,-1.254992,0.810916,1.321393
5,0.03557,0.810916,1.321393


In [60]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.401158,0.72094,-0.197662
1,1.032774,0.810916,1.900233
2,0.327821,0.810916,-0.261791
3,0.402401,0.810916,1.321393
4,-1.254992,,1.321393
5,0.03557,,1.321393


In [61]:
ser2 = pd.Series([2, np.nan, 4.3, np.nan])

In [62]:
ser2

0    2.0
1    NaN
2    4.3
3    NaN
dtype: float64

In [63]:
ser2.fillna(data.mean())

0    2.00
1    6.25
2    4.30
3    3.00
dtype: float64

In [64]:
ser2

0    2.0
1    NaN
2    4.3
3    NaN
dtype: float64

#### Removing Duplicates

In [65]:
data = pd.DataFrame(
    {
        'k1':['one','two']*3 + ['two'],
        'k2':[1,3,4,3,1,4,4]
    }
)

In [66]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,3
2,one,4
3,two,3
4,one,1
5,two,4
6,two,4


In [67]:
data.duplicated()

0    False
1    False
2    False
3     True
4     True
5    False
6     True
dtype: bool

In [68]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,3
2,one,4
5,two,4


In [69]:
data['v1'] = range(7)

In [70]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,3,1
2,one,4,2
3,two,3,3
4,one,1,4
5,two,4,5
6,two,4,6


In [71]:
data.drop_duplicates(['k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,3,1
2,one,4,2


In [72]:
data.drop_duplicates(['k1','k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,3,1
2,one,4,2
5,two,4,5


In [73]:
data['k1'][3] = 'TEN'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['k1'][3] = 'TEN'


In [74]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,3,1
2,one,4,2
3,TEN,3,3
4,one,1,4
5,two,4,5
6,two,4,6


### Transforming Data Using Function Or Mapping

In [75]:
data = pd.DataFrame(
    {
        "food":['apple','egg','wheat','bread','rice','dal','grapes','pineapple','honey'],
        'calories':[4,3,12,6,7.5,8,3,5,6]
    }
)

In [76]:
data

Unnamed: 0,food,calories
0,apple,4.0
1,egg,3.0
2,wheat,12.0
3,bread,6.0
4,rice,7.5
5,dal,8.0
6,grapes,3.0
7,pineapple,5.0
8,honey,6.0


In [77]:
food_categories = {
    'apple':'fruit',
    'egg':'non veg food',
    'wheat':'grain',
    'bread':'veg food',
    'rice':'grain',
    'dal':'grain',
    'grapes':'fruit',
    'pineapple':'fruit',
    'honey':'veg food'
}

In [78]:
food_categories

{'apple': 'fruit',
 'egg': 'non veg food',
 'wheat': 'grain',
 'bread': 'veg food',
 'rice': 'grain',
 'dal': 'grain',
 'grapes': 'fruit',
 'pineapple': 'fruit',
 'honey': 'veg food'}

In [79]:
data['food'] = data['food'].str.upper()

In [80]:
data

Unnamed: 0,food,calories
0,APPLE,4.0
1,EGG,3.0
2,WHEAT,12.0
3,BREAD,6.0
4,RICE,7.5
5,DAL,8.0
6,GRAPES,3.0
7,PINEAPPLE,5.0
8,HONEY,6.0


In [81]:
data['food'] = data['food'].str.lower()

In [82]:
data

Unnamed: 0,food,calories
0,apple,4.0
1,egg,3.0
2,wheat,12.0
3,bread,6.0
4,rice,7.5
5,dal,8.0
6,grapes,3.0
7,pineapple,5.0
8,honey,6.0


In [83]:
data['category'] = data['food'].map(food_categories)

In [84]:
data

Unnamed: 0,food,calories,category
0,apple,4.0,fruit
1,egg,3.0,non veg food
2,wheat,12.0,grain
3,bread,6.0,veg food
4,rice,7.5,grain
5,dal,8.0,grain
6,grapes,3.0,fruit
7,pineapple,5.0,fruit
8,honey,6.0,veg food


In [85]:
ser3 = pd.Series([1,-99,2,-99,-1000,3])

In [86]:
ser3

0       1
1     -99
2       2
3     -99
4   -1000
5       3
dtype: int64

In [87]:
ser3.replace(-99, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [88]:
ser3.replace([-99,3], np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       NaN
dtype: float64

In [89]:
ser3

0       1
1     -99
2       2
3     -99
4   -1000
5       3
dtype: int64

In [90]:
ser3.replace([-99,3],[0,np.nan])

0       1.0
1       0.0
2       2.0
3       0.0
4   -1000.0
5       NaN
dtype: float64

In [91]:
ser3.replace({-99:np.nan,3:0})

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       0.0
dtype: float64

### Renaming Axis Indexes

In [92]:
data = pd.DataFrame(np.arange(12).reshape(3,4),
                   index=['Mum','Pune','Chennai'],
                   columns=['one','two','three','four'])

In [93]:
data

Unnamed: 0,one,two,three,four
Mum,0,1,2,3
Pune,4,5,6,7
Chennai,8,9,10,11


In [94]:
to_upper = lambda x : x.upper()

In [95]:
data.index

Index(['Mum', 'Pune', 'Chennai'], dtype='object')

In [96]:
data.index.map(to_upper)

Index(['MUM', 'PUNE', 'CHENNAI'], dtype='object')

In [97]:
data.index

Index(['Mum', 'Pune', 'Chennai'], dtype='object')

In [98]:
data.columns

Index(['one', 'two', 'three', 'four'], dtype='object')

In [99]:
data.columns.map(to_upper)

Index(['ONE', 'TWO', 'THREE', 'FOUR'], dtype='object')

In [100]:
data.columns

Index(['one', 'two', 'three', 'four'], dtype='object')

In [101]:
data

Unnamed: 0,one,two,three,four
Mum,0,1,2,3
Pune,4,5,6,7
Chennai,8,9,10,11


In [102]:
data.rename(index=str.upper)

Unnamed: 0,one,two,three,four
MUM,0,1,2,3
PUNE,4,5,6,7
CHENNAI,8,9,10,11


In [103]:
data.rename(columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Mum,0,1,2,3
Pune,4,5,6,7
Chennai,8,9,10,11


In [104]:
data.rename(index=str.upper, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
MUM,0,1,2,3
PUNE,4,5,6,7
CHENNAI,8,9,10,11


### Binning

In [105]:
ages = [20,22,25,24,27,19,21,27,31,28,61,45,41,32]

In [106]:
bins = [15,25,35,45,55,65]

In [107]:
category = pd.cut(ages, bins)

In [108]:
category

[(15, 25], (15, 25], (15, 25], (15, 25], (25, 35], ..., (25, 35], (55, 65], (35, 45], (35, 45], (25, 35]]
Length: 14
Categories (5, interval[int64]): [(15, 25] < (25, 35] < (35, 45] < (45, 55] < (55, 65]]

In [109]:
pd.value_counts(category)

(15, 25]    6
(25, 35]    5
(35, 45]    2
(55, 65]    1
(45, 55]    0
dtype: int64