## Categorical Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small Series of fruit labels, indexed explicitly by the integers 0-3.
values = pd.Series(
    data=['apple', 'mango', 'apple', 'apple'],
    index=[0, 1, 2, 3],
)

In [3]:
values

0    apple
1    mango
2    apple
3    apple
dtype: object

In [4]:
pd.unique(values)

array(['apple', 'mango'], dtype=object)

In [5]:
values.unique()

array(['apple', 'mango'], dtype=object)

In [6]:
# Function form of the frequency count (same result as the Series method).
# NOTE(review): the top-level pd.value_counts helper is deprecated in recent
# pandas releases; Series.value_counts() is the supported spelling — confirm
# against the pandas version this notebook targets.
pd.value_counts(values)

apple    3
mango    1
dtype: int64

In [7]:
values.value_counts()

apple    3
mango    1
dtype: int64

In [10]:
# Repeat the four-item basket twice to get eight fruit labels.
fruits = 2 * ['apple', 'mango', 'apple', 'apple']

In [11]:
fruits

['apple', 'mango', 'apple', 'apple', 'apple', 'mango', 'apple', 'apple']

In [12]:
n = len(fruits)

In [13]:
n

8

In [14]:
# Seeded generator so the random 'count' and 'weight' columns come out the
# same on every Restart & Run All instead of changing with each execution.
rng = np.random.default_rng(42)

# One row per basket: sequential id, fruit label, an integer count drawn from
# [3, 15) and a float weight drawn from [0, 4). The `columns` argument fixes
# the column order independently of the dict's key order.
df = pd.DataFrame({
    'fruit': fruits,
    'basket_id': np.arange(n),
    'count': rng.integers(3, 15, size=n),
    'weight': rng.uniform(0, 4, size=n),
}, columns=['basket_id', 'fruit', 'count', 'weight'])

In [15]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,11,0.026521
1,1,mango,5,3.820193
2,2,apple,3,1.175296
3,3,apple,5,2.305198
4,4,apple,4,2.937309
5,5,mango,14,0.038317
6,6,apple,7,0.059601
7,7,apple,4,2.111384


In [16]:
df['fruit']

0    apple
1    mango
2    apple
3    apple
4    apple
5    mango
6    apple
7    apple
Name: fruit, dtype: object

In [17]:
# Categorical copy of the fruit column; astype returns a new Series, so the
# column stored in df is not modified by this cell.
fruit_cat = df.fruit.astype('category')

In [18]:
fruit_cat

0    apple
1    mango
2    apple
3    apple
4    apple
5    mango
6    apple
7    apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'mango']

In [19]:
res = fruit_cat.values

In [20]:
res

['apple', 'mango', 'apple', 'apple', 'apple', 'mango', 'apple', 'apple']
Categories (2, object): ['apple', 'mango']

In [21]:
res.categories

Index(['apple', 'mango'], dtype='object')

In [23]:
res.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [24]:
# Overwrite the 'fruit' column of df with its categorical version; from here
# on df['fruit'] has dtype category.
df['fruit'] = df['fruit'].astype('category')

In [25]:
df['fruit']

0    apple
1    mango
2    apple
3    apple
4    apple
5    mango
6    apple
7    apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'mango']

In [28]:
# Eight single-letter labels: the sequence a, b, c, d repeated twice.
ser1 = pd.Series(list('abcd') + list('abcd'))

In [29]:
ser1

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: object

In [30]:
res = ser1.astype('category')

In [31]:
res

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [34]:
res.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [35]:
res.value_counts()

a    2
b    2
c    2
d    2
dtype: int64

In [36]:
# Build the categorical Series directly by requesting the dtype at
# construction time rather than converting afterwards.
cat_s = pd.Series(data=list('abcd' * 2), dtype='category')

In [37]:
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [38]:
# One indicator column per category of cat_s (a, b, c, d), one row per element.
# NOTE(review): the indicator dtype depends on the pandas version (integers in
# the recorded output; newer releases are understood to default to booleans) —
# confirm if 0/1 values are required downstream.
dummy = pd.get_dummies(cat_s)

In [39]:
dummy

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1


### GroupBy

In [41]:
# Twelve rows: keys cycle a, b, c four times; values count up from 0 to 11.
df = pd.DataFrame(
    {
        'key': list('abc') * 4,
        'value': np.arange(12),
    }
)

In [42]:
df

Unnamed: 0,key,value
0,a,0
1,b,1
2,c,2
3,a,3
4,b,4
5,c,5
6,a,6
7,b,7
8,c,8
9,a,9


In [43]:
# Group the rows by 'key' and keep only the 'value' column, ready for the
# per-group reductions that follow.
res = df.groupby('key')['value']

In [45]:
res.max()

key
a     9
b    10
c    11
Name: value, dtype: int32

In [46]:
res.min()

key
a    0
b    1
c    2
Name: value, dtype: int32

In [47]:
res.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [48]:
res.sum()

key
a    18
b    22
c    26
Name: value, dtype: int32

In [49]:
res.median()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [50]:
res.count()

key
a    4
b    4
c    4
Name: value, dtype: int64

In [51]:
url = 'https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/drinks.csv'

In [52]:
# Read the CSV found at `url` into a DataFrame. This rebinds `df`, so the
# small key/value frame built earlier in the GroupBy section is no longer
# reachable under that name.
df = pd.read_csv(url)

In [53]:
df

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa
...,...,...,...,...,...,...
188,Venezuela,333,100,3,7.7,South America
189,Vietnam,111,2,1,2.0,Asia
190,Yemen,6,0,0,0.1,Asia
191,Zambia,32,19,4,2.5,Africa


In [54]:
df.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [55]:
df.tail()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
188,Venezuela,333,100,3,7.7,South America
189,Vietnam,111,2,1,2.0,Asia
190,Yemen,6,0,0,0.1,Asia
191,Zambia,32,19,4,2.5,Africa
192,Zimbabwe,64,18,4,4.7,Africa


In [56]:
df['beer_servings'].mean()

106.16062176165804

In [57]:
df['wine_servings'].mean()

49.45077720207254

In [58]:
# Average beer servings per continent.
df.groupby('continent')['beer_servings'].mean()

continent
Africa            61.471698
Asia              37.045455
Europe           193.777778
North America    145.434783
Oceania           89.687500
South America    175.083333
Name: beer_servings, dtype: float64

In [59]:
# Average wine servings per continent.
df.groupby('continent')['wine_servings'].mean()

continent
Africa            16.264151
Asia               9.068182
Europe           142.222222
North America     24.521739
Oceania           35.625000
South America     62.416667
Name: wine_servings, dtype: float64

In [61]:
# Average litres of pure alcohol per continent.
df.groupby('continent')['total_litres_of_pure_alcohol'].mean()

continent
Africa           3.007547
Asia             2.170455
Europe           8.617778
North America    5.995652
Oceania          3.381250
South America    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

In [62]:
# Rows whose continent is Africa, selected with an explicit boolean mask.
is_africa = df['continent'] == 'Africa'
df.loc[is_africa]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
2,Algeria,25,0,14,0.7,Africa
4,Angola,217,57,45,5.9,Africa
18,Benin,34,4,13,1.1,Africa
22,Botswana,173,35,35,5.4,Africa
26,Burkina Faso,25,7,7,4.3,Africa
27,Burundi,88,0,0,6.3,Africa
28,Cote d'Ivoire,37,1,7,4.0,Africa
29,Cabo Verde,144,56,16,4.0,Africa
31,Cameroon,147,1,4,5.8,Africa
33,Central African Republic,17,2,1,1.8,Africa


In [63]:
# Rows whose continent is Asia, selected with an explicit boolean mask.
is_asia = df['continent'] == 'Asia'
df.loc[is_asia]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
12,Bahrain,42,63,7,2.0,Asia
13,Bangladesh,0,0,0,0.0,Asia
19,Bhutan,23,0,0,0.4,Asia
24,Brunei,31,2,1,0.6,Asia
30,Cambodia,57,65,1,2.2,Asia
36,China,79,192,8,5.0,Asia
46,North Korea,0,0,0,0.0,Asia
77,India,9,114,0,2.2,Asia
78,Indonesia,5,1,0,0.1,Asia


In [65]:
# Column means for the African rows only.
# numeric_only=True restricts the reduction to the numeric columns. The frame
# also holds text columns ('country', 'continent'); recent pandas releases
# raise on those instead of silently dropping them, so the flag keeps this
# cell runnable while producing the same four means.
df[df['continent'] == 'Africa'].mean(numeric_only=True)

beer_servings                   61.471698
spirit_servings                 16.339623
wine_servings                   16.264151
total_litres_of_pure_alcohol     3.007547
dtype: float64

In [66]:
# Column means for the Asian rows only.
# numeric_only=True restricts the reduction to the numeric columns. The frame
# also holds text columns ('country', 'continent'); recent pandas releases
# raise on those instead of silently dropping them, so the flag keeps this
# cell runnable while producing the same four means.
df[df['continent'] == 'Asia'].mean(numeric_only=True)

beer_servings                   37.045455
spirit_servings                 60.840909
wine_servings                    9.068182
total_litres_of_pure_alcohol     2.170455
dtype: float64

In [67]:
# Largest beer_servings value observed within each continent.
df.groupby('continent')['beer_servings'].max()

continent
Africa           376
Asia             247
Europe           361
North America    285
Oceania          306
South America    333
Name: beer_servings, dtype: int64

In [68]:
# Largest wine_servings value observed within each continent.
df.groupby('continent')['wine_servings'].max()

continent
Africa           233
Asia             123
Europe           370
North America    100
Oceania          212
South America    221
Name: wine_servings, dtype: int64

In [69]:
# Several summary statistics of beer_servings per continent in one call.
# NOTE(review): 'count' is listed twice, so the result carries two identical
# 'count' columns — probably unintended; confirm and drop the duplicate.
df.groupby('continent').beer_servings.agg(['count','min','max','mean','count'])

Unnamed: 0_level_0,count,min,max,mean,count
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,53,0,376,61.471698,53
Asia,44,0,247,37.045455,44
Europe,45,0,361,193.777778,45
North America,23,1,285,145.434783,23
Oceania,16,0,306,89.6875,16
South America,12,93,333,175.083333,12


In [70]:
# Per-continent mean of every numeric column.
# numeric_only=True is required because the frame also contains the text
# column 'country'; recent pandas releases raise on non-numeric columns in a
# grouped mean instead of silently excluding them.
df.groupby('continent').mean(numeric_only=True)

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,61.471698,16.339623,16.264151,3.007547
Asia,37.045455,60.840909,9.068182,2.170455
Europe,193.777778,132.555556,142.222222,8.617778
North America,145.434783,165.73913,24.521739,5.995652
Oceania,89.6875,58.4375,35.625,3.38125
South America,175.083333,114.75,62.416667,6.308333


### String Handling

In [71]:
val = 'a,b, steve'

In [72]:
val

'a,b, steve'

In [73]:
res = val.split(',')

In [74]:
res

['a', 'b', ' steve']

In [76]:
'::'.join(res)

'a::b:: steve'

In [77]:
# Sample text holding four e-mail addresses, one per line, with a leading and
# a trailing newline (adjacent literals are concatenated by the parser).
txt = (
    "\n"
    "dave@gmail.com\n"
    "Steve@google.com\n"
    "rob@gmail.com\n"
    "ryan@yahoo.com\n"
)

In [78]:
# One or more letters, '@', one or more letters, then a literal ".com".
# The dot is escaped and "com" is spelled out: a character class such as
# [.com]+ accepts any run of '.', 'c', 'o', 'm' (so "bob@gmailcom" or "a@b."
# would match), and a space inside the first class would let a match swallow
# the words preceding the address.
pattern = r'[a-zA-Z]+@[a-zA-Z]+\.com'

In [79]:
import re

In [83]:
re.findall(pattern,txt)

['dave@gmail.com', 'Steve@google.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [84]:
regex = re.compile(pattern)

In [85]:
regex.findall(txt)

['dave@gmail.com', 'Steve@google.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [91]:
# Name -> e-mail address mapping; Steve has no address and is recorded as NaN.
data = dict(
    Punit='punit@yahoo.com',
    Dharmesh='dharmesh@gmail.com',
    Rob='rob@gmail.com',
    Steve=np.nan,
)

In [92]:
data_ser = pd.Series(data)

In [93]:
data_ser

Punit          punit@yahoo.com
Dharmesh    dharmesh@gmail.com
Rob              rob@gmail.com
Steve                      NaN
dtype: object

In [94]:
data_ser.isnull()

Punit       False
Dharmesh    False
Rob         False
Steve        True
dtype: bool

In [95]:
data_ser.isna()

Punit       False
Dharmesh    False
Rob         False
Steve        True
dtype: bool

In [96]:
data_ser.str.contains('gmail')

Punit       False
Dharmesh     True
Rob          True
Steve         NaN
dtype: object

In [98]:
data_ser.str.contains('h')

Punit        True
Dharmesh     True
Rob         False
Steve         NaN
dtype: object

In [99]:
data_ser.str.upper()

Punit          PUNIT@YAHOO.COM
Dharmesh    DHARMESH@GMAIL.COM
Rob              ROB@GMAIL.COM
Steve                      NaN
dtype: object

In [100]:
data_ser.str.endswith('com')

Punit       True
Dharmesh    True
Rob         True
Steve        NaN
dtype: object

In [101]:
data_ser.str.len()

Punit       15.0
Dharmesh    18.0
Rob         13.0
Steve        NaN
dtype: float64

In [102]:
data_ser.str.split('@')

Punit          [punit, yahoo.com]
Dharmesh    [dharmesh, gmail.com]
Rob              [rob, gmail.com]
Steve                         NaN
dtype: object

In [106]:
# Count occurrences of 'h' in the second entry (Dharmesh's address).
# data_ser is labelled by name, so an integer key only works through the
# deprecated positional fallback of Series.__getitem__; .iloc makes the
# positional lookup explicit and returns the same element.
data_ser.iloc[1].count('h')

2

In [107]:
# Count occurrences of the substring 'dharmesh' in the second entry.
# data_ser is labelled by name, so an integer key only works through the
# deprecated positional fallback of Series.__getitem__; .iloc makes the
# positional lookup explicit and returns the same element.
data_ser.iloc[1].count('dharmesh')

1