# Chapter 7: Data Cleaning and Preparation

## 7.1: Handling Missing Data

In [2]:
import pandas as pd
import numpy as np

In [3]:
float_data=pd.Series([1.2,-3.5,np.nan,0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [4]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
string_data=pd.Series(["aardvark",np.nan,None,"avocado"])
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [6]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [7]:
float_data=pd.Series([1,2,None],dtype="float64")
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [8]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

### Filtering Out Missing Data

In [9]:
data=pd.Series([1,np.nan,3.5,np.nan,7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
# One row per missing-data pattern: complete, partly missing,
# entirely missing, and missing only in the first column.
data = pd.DataFrame(
    [
        [1, 6.5, 3],
        [1, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [13]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [14]:
data[4]=np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [15]:
data.dropna(axis="columns",how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
# 7x3 frame of standard-normal draws; the leading rows of columns 1 and 2
# are then blanked out so the dropna/fillna examples have gaps to work on.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
df.iloc[0:4, 1] = np.nan  # first four rows of column 1
df.iloc[0:2, 2] = np.nan  # first two rows of column 2
df

Unnamed: 0,0,1,2
0,-0.640416,,
1,1.90069,,
2,0.52712,,0.213824
3,1.684393,,-0.397866
4,-1.255383,-1.277396,0.572826
5,0.510108,2.022289,1.441096
6,0.162779,0.844844,0.502551


In [17]:
df.dropna()

Unnamed: 0,0,1,2
4,-1.255383,-1.277396,0.572826
5,0.510108,2.022289,1.441096
6,0.162779,0.844844,0.502551


In [18]:
df.dropna(axis="columns")

Unnamed: 0,0
0,-0.640416
1,1.90069
2,0.52712
3,1.684393
4,-1.255383
5,0.510108
6,0.162779


In [19]:
df.dropna(how="all",axis="columns")

Unnamed: 0,0,1,2
0,-0.640416,,
1,1.90069,,
2,0.52712,,0.213824
3,1.684393,,-0.397866
4,-1.255383,-1.277396,0.572826
5,0.510108,2.022289,1.441096
6,0.162779,0.844844,0.502551


In [20]:
df.dropna(how="all")

Unnamed: 0,0,1,2
0,-0.640416,,
1,1.90069,,
2,0.52712,,0.213824
3,1.684393,,-0.397866
4,-1.255383,-1.277396,0.572826
5,0.510108,2.022289,1.441096
6,0.162779,0.844844,0.502551


In [21]:
df.dropna(axis="rows")

Unnamed: 0,0,1,2
4,-1.255383,-1.277396,0.572826
5,0.510108,2.022289,1.441096
6,0.162779,0.844844,0.502551


In [22]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.52712,,0.213824
3,1.684393,,-0.397866
4,-1.255383,-1.277396,0.572826
5,0.510108,2.022289,1.441096
6,0.162779,0.844844,0.502551


### Filling In Missing Data

In [23]:
df

Unnamed: 0,0,1,2
0,-0.640416,,
1,1.90069,,
2,0.52712,,0.213824
3,1.684393,,-0.397866
4,-1.255383,-1.277396,0.572826
5,0.510108,2.022289,1.441096
6,0.162779,0.844844,0.502551


In [24]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.640416,0.0,0.0
1,1.90069,0.0,0.0
2,0.52712,0.0,0.213824
3,1.684393,0.0,-0.397866
4,-1.255383,-1.277396,0.572826
5,0.510108,2.022289,1.441096
6,0.162779,0.844844,0.502551


In [25]:
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,-0.640416,0.5,0.0
1,1.90069,0.5,0.0
2,0.52712,0.5,0.213824
3,1.684393,0.5,-0.397866
4,-1.255383,-1.277396,0.572826
5,0.510108,2.022289,1.441096
6,0.162779,0.844844,0.502551


In [26]:
df = pd.DataFrame(np.random.standard_normal((6, 3)))

In [27]:
df

Unnamed: 0,0,1,2
0,-1.274297,0.60401,-0.326938
1,-2.758795,-0.100215,0.676026
2,-2.578116,0.133034,0.190926
3,-0.049428,2.466866,-0.762843
4,-1.484681,0.184148,1.724909
5,-0.480116,0.922713,-0.255565


In [28]:
df.iloc[2:,1]=np.nan
df.iloc[4:,2]=np.nan
df

Unnamed: 0,0,1,2
0,-1.274297,0.60401,-0.326938
1,-2.758795,-0.100215,0.676026
2,-2.578116,,0.190926
3,-0.049428,,-0.762843
4,-1.484681,,
5,-0.480116,,


In [29]:
# Forward-fill: each NaN takes the last valid value above it in its column.
# DataFrame.fillna(method="ffill") is deprecated in pandas 2.x and emits a
# FutureWarning; DataFrame.ffill() is the supported spelling.
df.ffill()

  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,-1.274297,0.60401,-0.326938
1,-2.758795,-0.100215,0.676026
2,-2.578116,-0.100215,0.190926
3,-0.049428,-0.100215,-0.762843
4,-1.484681,-0.100215,-0.762843
5,-0.480116,-0.100215,-0.762843


In [30]:
df.ffill()

Unnamed: 0,0,1,2
0,-1.274297,0.60401,-0.326938
1,-2.758795,-0.100215,0.676026
2,-2.578116,-0.100215,0.190926
3,-0.049428,-0.100215,-0.762843
4,-1.484681,-0.100215,-0.762843
5,-0.480116,-0.100215,-0.762843


In [31]:
# Forward-fill, but fill at most two consecutive NaNs per gap (limit=2).
# Uses DataFrame.ffill() because fillna(method=...) is deprecated in
# pandas 2.x and emits a FutureWarning.
df.ffill(limit=2)

  df.fillna(method="ffill",limit=2)


Unnamed: 0,0,1,2
0,-1.274297,0.60401,-0.326938
1,-2.758795,-0.100215,0.676026
2,-2.578116,-0.100215,0.190926
3,-0.049428,-0.100215,-0.762843
4,-1.484681,,-0.762843
5,-0.480116,,-0.762843


In [32]:
data=pd.Series([1,np.nan,3.5,np.nan,7])
data
data.fillna(data.mean())

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 7.2: Data Transformation

### Removing Duplicates

In [33]:
# Rows 5 and 6 are identical ("two", 4), which gives duplicated() /
# drop_duplicates() below one full-row duplicate to detect.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [34]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [35]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [36]:
data["v1"]=range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [37]:
data.drop_duplicates(["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [38]:
data.drop_duplicates(["k1","k2"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


In [39]:
data.drop_duplicates(["k1","k2"],keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping

In [40]:
# Meat names paired with a weight in ounces; "food" is the lookup key for
# the mapping examples that follow.
data = pd.DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "pastrami",
            "corned beef",
            "bacon",
            "pastrami",
            "honey ham",
            "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

In [41]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [42]:
# Lookup table: meat name (as spelled in the "food" column) -> source animal.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

In [43]:
data["animal"]=data["food"].map(meat_to_animal)

In [44]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [45]:
def get_animal(x):
    """Return the source animal for the food name ``x``.

    Looks ``x`` up in the notebook-level ``meat_to_animal`` dict; a name
    that is not a key of that dict raises ``KeyError``.
    """
    animal = meat_to_animal[x]
    return animal

In [46]:
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [47]:
meat_to_animal[data["food"][1]]

'pig'

### Replacing Values

In [48]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [49]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [50]:
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [51]:
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [52]:
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### Renaming Axis Indexes

In [53]:
# 3x4 frame holding 0..11, labelled by state (rows) and number words
# (columns); used below to demonstrate relabelling both axes.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [54]:
def transform(x):
    """Return the first four characters of ``x``, upper-cased.

    Inputs shorter than four characters are upper-cased in full.
    """
    head = x[0:4]
    return head.upper()

In [55]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [56]:
data.index=data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [57]:
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [58]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [60]:
data.rename(index={"OHIO":"INDIANA","COLO":"COLO1"},columns={"three":"pekaboo"})

Unnamed: 0,one,two,pekaboo,four
INDIANA,0,1,2,3
COLO1,4,5,6,7
NEW,8,9,10,11


In [61]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretization and Binning

In [62]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [63]:
bins=[18,25,35,60,100]

In [64]:
age_categories=pd.cut(ages,bins)

In [65]:
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [66]:
pd.DataFrame(age_categories)

Unnamed: 0,0
0,"(18, 25]"
1,"(18, 25]"
2,"(18, 25]"
3,"(25, 35]"
4,"(18, 25]"
5,"(18, 25]"
6,"(35, 60]"
7,"(25, 35]"
8,"(60, 100]"
9,"(35, 60]"


In [67]:
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [68]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [69]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [70]:
# Count how many ages fall in each bin. The top-level pd.value_counts()
# is deprecated in pandas 2.x (it emits a FutureWarning); wrapping the
# Categorical in a Series and calling .value_counts() is the replacement
# the deprecation notice recommends.
pd.Series(age_categories).value_counts()

  pd.value_counts(age_categories)


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [71]:
age_categories.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [72]:
pd.cut(ages,bins,right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [73]:
pd.DataFrame({"Age":ages,"Categories":age_categories,"CategoriesLeft":pd.cut(ages,bins,right=False)})

Unnamed: 0,Age,Categories,CategoriesLeft
0,20,"(18, 25]","[18, 25)"
1,22,"(18, 25]","[18, 25)"
2,25,"(18, 25]","[25, 35)"
3,27,"(25, 35]","[25, 35)"
4,21,"(18, 25]","[18, 25)"
5,23,"(18, 25]","[18, 25)"
6,37,"(35, 60]","[35, 60)"
7,31,"(25, 35]","[25, 35)"
8,61,"(60, 100]","[60, 100)"
9,45,"(35, 60]","[35, 60)"


In [74]:
group_names=["Youth","YoungAdult","MiddleAged","Senior"]

In [75]:
pd.cut(ages,bins,labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [77]:
pd.DataFrame({"Age":ages,"Categories":age_categories,"CategoriesLeft":pd.cut(ages,bins,right=False),"CategoriesLabels":pd.cut(ages,bins,labels=group_names,right=False)})

Unnamed: 0,Age,Categories,CategoriesLeft,CategoriesLabels
0,20,"(18, 25]","[18, 25)",Youth
1,22,"(18, 25]","[18, 25)",Youth
2,25,"(18, 25]","[25, 35)",YoungAdult
3,27,"(25, 35]","[25, 35)",YoungAdult
4,21,"(18, 25]","[18, 25)",Youth
5,23,"(18, 25]","[18, 25)",Youth
6,37,"(35, 60]","[35, 60)",MiddleAged
7,31,"(25, 35]","[25, 35)",YoungAdult
8,61,"(60, 100]","[60, 100)",Senior
9,45,"(35, 60]","[35, 60)",MiddleAged


In [78]:
data=np.random.uniform(size=20)
data

array([0.98177292, 0.89422377, 0.12789214, 0.51109077, 0.78145715,
       0.28924503, 0.03022487, 0.29762126, 0.84143876, 0.97762259,
       0.07342818, 0.72308566, 0.87338528, 0.3065427 , 0.71782325,
       0.3594448 , 0.66601893, 0.17526583, 0.17651038, 0.93000652])

In [80]:
pd.cut(data,4,precision=2)

[(0.74, 0.98], (0.74, 0.98], (0.029, 0.27], (0.51, 0.74], (0.74, 0.98], ..., (0.27, 0.51], (0.51, 0.74], (0.029, 0.27], (0.029, 0.27], (0.74, 0.98]]
Length: 20
Categories (4, interval[float64, right]): [(0.029, 0.27] < (0.27, 0.51] < (0.51, 0.74] < (0.74, 0.98]]

In [81]:
pd.DataFrame({"Data":data,"Categories":pd.cut(data,4,precision=2)})

Unnamed: 0,Data,Categories
0,0.981773,"(0.74, 0.98]"
1,0.894224,"(0.74, 0.98]"
2,0.127892,"(0.029, 0.27]"
3,0.511091,"(0.51, 0.74]"
4,0.781457,"(0.74, 0.98]"
5,0.289245,"(0.27, 0.51]"
6,0.030225,"(0.029, 0.27]"
7,0.297621,"(0.27, 0.51]"
8,0.841439,"(0.74, 0.98]"
9,0.977623,"(0.74, 0.98]"


In [82]:
data=np.random.standard_normal(1000)
quartiles=pd.qcut(data,4,precision=2)
quartiles

[(-0.78, -0.048], (-3.2699999999999996, -0.78], (-0.048, 0.6], (-0.78, -0.048], (0.6, 3.38], ..., (-0.78, -0.048], (-3.2699999999999996, -0.78], (-0.78, -0.048], (-0.78, -0.048], (0.6, 3.38]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.2699999999999996, -0.78] < (-0.78, -0.048] < (-0.048, 0.6] < (0.6, 3.38]]

In [83]:
quartiles.value_counts()

(-3.2699999999999996, -0.78]    250
(-0.78, -0.048]                 250
(-0.048, 0.6]                   250
(0.6, 3.38]                     250
Name: count, dtype: int64

In [84]:
pd.qcut(data, [0, 0.1, 0.6, 0.9, 1.]).value_counts()

(-3.2609999999999997, -1.313]    100
(-1.313, 0.203]                  500
(0.203, 1.261]                   300
(1.261, 3.385]                   100
Name: count, dtype: int64

#### Detecting and Filtering Outliers

In [85]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.023908,0.091547,0.012299,-0.022518
std,0.99953,1.053756,0.947468,0.968258
min,-3.967179,-2.854483,-2.998318,-2.863949
25%,-0.706474,-0.629245,-0.640215,-0.701825
50%,-0.006945,0.054797,0.013605,-0.038172
75%,0.607068,0.808496,0.609257,0.593175
max,3.573477,3.548515,3.025234,2.945159


In [86]:
col=data[2]
col

0     -0.481176
1     -0.161935
2      0.392225
3      1.263079
4      1.079282
         ...   
995    1.535278
996    0.559238
997    0.451192
998   -0.535692
999    0.922951
Name: 2, Length: 1000, dtype: float64

In [87]:
col[col.abs()>3]

459    3.025234
Name: 2, dtype: float64

In [91]:
data[(data.abs()>3).any(axis="columns")]

Unnamed: 0,0,1,2,3
288,3.573477,-0.740646,-1.228347,-0.666379
435,0.10618,3.097397,1.558867,-1.050126
459,0.979711,0.009145,3.025234,-0.053606
490,-1.496296,3.548515,1.127777,2.062835
549,-0.656142,3.377648,-0.765377,0.270102
667,1.91223,3.024172,0.136589,-0.626912
688,-3.967179,-0.078517,-0.828974,-1.259834
713,-3.251425,0.717042,1.145415,0.534551
816,-0.580718,3.062962,1.771214,1.489105


In [93]:
# Cap outliers in place: the boolean mask selects every cell whose magnitude
# exceeds 3, and np.sign(data)*3 supplies -3 or +3 at those positions, so
# each outlier is replaced by the bound on its own side of zero.
data[data.abs()>3]=np.sign(data)*3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.023263,0.090436,0.012274,-0.022518
std,0.993478,1.050464,0.947388,0.968258
min,-3.0,-2.854483,-2.998318,-2.863949
25%,-0.706474,-0.629245,-0.640215,-0.701825
50%,-0.006945,0.054797,0.013605,-0.038172
75%,0.607068,0.808496,0.609257,0.593175
max,3.0,3.0,3.0,2.945159


### Permutation and Random Sampling

In [94]:
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [95]:
sampler=np.random.permutation(5)
sampler

array([2, 3, 4, 1, 0])

In [97]:
df.take(sampler)
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
1,7,8,9,10,11,12,13
0,0,1,2,3,4,5,6


Unnamed: 0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
1,7,8,9,10,11,12,13
0,0,1,2,3,4,5,6


In [98]:
column_sampler=np.random.permutation(7)
column_sampler

array([1, 5, 6, 3, 2, 0, 4])

In [99]:
df.take(column_sampler,axis="columns")

Unnamed: 0,1,5,6,3,2,0,4
0,1,5,6,3,2,0,4
1,8,12,13,10,9,7,11
2,15,19,20,17,16,14,18
3,22,26,27,24,23,21,25
4,29,33,34,31,30,28,32


In [100]:
df.sample(n=3,axis="columns")

Unnamed: 0,3,1,4
0,3,1,4
1,10,8,11
2,17,15,18
3,24,22,25
4,31,29,32


In [101]:
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20


In [102]:
choices=pd.Series([5,7,-1,6,4])
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [104]:
choices.sample(n=10,replace=True)

4    4
2   -1
3    6
1    7
4    4
2   -1
4    4
2   -1
0    5
3    6
dtype: int64

### Computing Indicator/Dummy Variables

In [105]:
# Small categorical column ("key") plus a running integer, used as the
# input for the get_dummies examples.
df = pd.DataFrame(
    {
        "key": ["b", "b", "a", "c", "a", "b"],
        "data1": range(6),
    }
)
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [107]:
pd.get_dummies(df["key"],dtype=float)

Unnamed: 0,a,b,c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [108]:
dummies=pd.get_dummies(df["key"],prefix="key",dtype=float)
dummies

Unnamed: 0,key_a,key_b,key_c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [111]:
df_with_dummy=df[["data1"]].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0.0,1.0,0.0
1,1,0.0,1.0,0.0
2,2,1.0,0.0,0.0
3,3,0.0,0.0,1.0
4,4,1.0,0.0,0.0
5,5,0.0,1.0,0.0


In [114]:
# Load the movies table: fields are separated by "::" and the file has no
# header row, so column names are supplied explicitly. A separator longer
# than one character requires pandas' Python parsing engine.
mnames=["movie_id","title","genres"]
movies=pd.read_table("../../datasets/movielens/movies.dat",sep="::",names=mnames,header=None,engine="python")
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [115]:
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [117]:
dummies=movies["genres"].str.get_dummies("|")
dummies.iloc[:10]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
7,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [118]:
movies_windic=movies.join(dummies.add_prefix("Genre_"))
movies_windic

Unnamed: 0,movie_id,title,genres,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Crime,Genre_Documentary,...,Genre_Fantasy,Genre_Film-Noir,Genre_Horror,Genre_Musical,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Thriller,Genre_War,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [119]:
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object

In [120]:
np.random.seed(12345)
values=np.random.uniform(size=10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [121]:
bins=[0,0.2,0.4,0.6,0.8,1]

In [122]:
pd.get_dummies(pd.cut(values,bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,False,False,False,False,True
1,False,True,False,False,False
2,True,False,False,False,False
3,False,True,False,False,False
4,False,False,True,False,False
5,False,False,True,False,False
6,False,False,False,False,True
7,False,False,False,True,False
8,False,False,False,True,False
9,False,False,False,True,False


## 7.3: Extension Data Types

In [123]:
s = pd.Series([1, 2, 3, None])
s
s.dtype

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

dtype('float64')

In [124]:
s1=pd.Series([1,2,3])
s1

0    1
1    2
2    3
dtype: int64

In [125]:
s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())
s
s.isna()
s.dtype

0       1
1       2
2       3
3    <NA>
dtype: Int64

0    False
1    False
2    False
3     True
dtype: bool

Int64Dtype()

In [126]:
s[3] is pd.NA

True

In [127]:
s = pd.Series([1, 2, 3, None], dtype="Int64")
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [128]:
s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype())
s

0      one
1      two
2     <NA>
3    three
dtype: string

In [129]:
# One numeric, one string and one boolean column, each with a single
# missing entry, as input for the extension-dtype conversions below.
df = pd.DataFrame(
    {
        "A": [1, 2, None, 4],
        "B": ["one", "two", "three", None],
        "C": [False, None, False, True],
    }
)
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


In [130]:
df["A"].dtype

dtype('float64')

In [131]:
df["A"]=df["A"].astype("Int64")
df["B"]=df["B"].astype("string")
df["C"]=df["C"].astype("boolean")
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


## 7.4: String Manipulation

### Python Built-In String Object Methods

### Regular Expressions

### String Functions in pandas