# Data Cleaning and Preparation

In [1]:
# %load nbheader.py
%reload_ext autoreload
%autoreload 2

In [2]:
import re
import os
import numpy as np
import pandas as pd
from datetime import datetime

## 7.1 Handling Missing Data

### General Introduction

- Missing data is referred to as **NA** ("not available"); pandas treats both **NaN** and Python's **None** as NA 
- In statistics applications, NA data may either be data that does not exist or that exists but was not observed 

**Table 7.1: NA handling object methods**

**Method**	|Description

**dropna**	|Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate.

**fillna**	|Fill in missing data with some value or using an interpolation method such as "ffill" or "bfill".

**isna**	|Return Boolean values indicating which values are missing/NA.

**notna**	|Negation of isna, returns True for non-NA values and False for NA values.


In [3]:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])
print(string_data)
string_data.isna()

0    aardvark
1         NaN
2        None
3     avocado
dtype: object


0    False
1     True
2     True
3    False
dtype: bool

In [4]:
float_data = pd.Series([1, 2, None], dtype='float64')
print(float_data)
float_data.isna()

0    1.0
1    2.0
2    NaN
dtype: float64


0    False
1    False
2     True
dtype: bool

### Filtering Out Missing Data

In [5]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [6]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [7]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [8]:
data[4] = np.nan
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [9]:
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.448018,,
1,-0.484652,,
2,0.430686,,1.761646
3,1.212015,,-0.59672
4,-0.366866,-1.19339,-1.301488
5,-1.47605,0.007802,-1.042865
6,0.346808,-0.29992,-1.043567


In [10]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.366866,-1.19339,-1.301488
5,-1.47605,0.007802,-1.042865
6,0.346808,-0.29992,-1.043567


In [11]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.430686,,1.761646
3,1.212015,,-0.59672
4,-0.366866,-1.19339,-1.301488
5,-1.47605,0.007802,-1.042865
6,0.346808,-0.29992,-1.043567


### Filling In Missing Data

**Table 7.2: fillna function arguments**

**Argument**|	**Description**

**value**|	Scalar value or dictionary-like object to use to fill missing values

**method**|	Interpolation method: one of "bfill" (backward fill) or "ffill" (forward fill); default is None. Deprecated since pandas 2.1 — prefer the dedicated **ffill()** / **bfill()** methods

**axis**|	Axis to fill on ("index" or "columns"); default is axis="index"

**limit**|	For forward and backward filling, maximum number of consecutive periods to fill


In [12]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.448018,0.0,0.0
1,-0.484652,0.0,0.0
2,0.430686,0.0,1.761646
3,1.212015,0.0,-0.59672
4,-0.366866,-1.19339,-1.301488
5,-1.47605,0.007802,-1.042865
6,0.346808,-0.29992,-1.043567


In [13]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.448018,0.5,0.0
1,-0.484652,0.5,0.0
2,0.430686,0.5,1.761646
3,1.212015,0.5,-0.59672
4,-0.366866,-1.19339,-1.301488
5,-1.47605,0.007802,-1.042865
6,0.346808,-0.29992,-1.043567


In [14]:
df = pd.DataFrame(np.random.standard_normal((6, 3)))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.173701,0.440177,-0.976099
1,-1.287002,-0.166779,1.232086
2,-0.201447,,-0.1945
3,-3.135314,,0.397302
4,-0.828791,,
5,-0.061246,,


In [15]:
# Forward-fill NA values down each column. DataFrame.ffill() is the
# non-deprecated spelling of fillna(method="ffill"), which emits a FutureWarning.
df.ffill()

  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,0.173701,0.440177,-0.976099
1,-1.287002,-0.166779,1.232086
2,-0.201447,-0.166779,-0.1945
3,-3.135314,-0.166779,0.397302
4,-0.828791,-0.166779,0.397302
5,-0.061246,-0.166779,0.397302


In [16]:
# Forward-fill, but fill at most two consecutive NA values per gap.
# DataFrame.ffill(limit=...) replaces the deprecated fillna(method="ffill", limit=...).
df.ffill(limit=2)

  df.fillna(method="ffill", limit=2)


Unnamed: 0,0,1,2
0,0.173701,0.440177,-0.976099
1,-1.287002,-0.166779,1.232086
2,-0.201447,-0.166779,-0.1945
3,-3.135314,-0.166779,0.397302
4,-0.828791,,0.397302
5,-0.061246,,0.397302


## 7.2 Data Transformation

### Removing Duplicates

In [17]:
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],
                     "k2": [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


Find duplicated rows - create Boolean Series indicating a duplicated row or not:

In [18]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

Filter out and remove duplicated rows with **drop_duplicates** method. Keep the first row and remove all other duplicates:

In [19]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


Filter duplicate row values in a subset of columns. Keep last occurrence of duplicated rows: 

In [20]:
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [21]:
data.drop_duplicates(["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping 

In [22]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon",
                              "pastrami", "corned beef", "bacon",
                              "pastrami", "honey ham", "nova lox"],
                     "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [23]:
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}

In [24]:
data["animal"] = data["food"].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [25]:
def get_animal(x):
    """Return the animal that the food ``x`` comes from.

    The lookup goes through the ``meat_to_animal`` mapping; a food that is
    not a key of that mapping raises ``KeyError``.
    """
    animal = meat_to_animal[x]
    return animal

data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### Replacing Values and Renaming Axis Indexes

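Filling in missing data with **.fillna()** is a special case of more general value replacement; the **.replace()** method handles the general case (here, turning sentinel values such as -999 into NA):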

In [26]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [27]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [28]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Renaming Axis Indexes:

In [29]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=["Ohio", "Colorado", "New York"],
                    columns=["one", "two", "three", "four"])

In [30]:
def transform(x):
    """Return the first four characters of ``x``, uppercased.

    Strings shorter than four characters are uppercased whole.
    """
    prefix = x[:4]
    return prefix.upper()

data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [31]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [32]:
data.rename(index={"OHIO": "INDIANA"},
            columns={"three": "peekaboo"})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Detecting and Filtering Outliers 




In [33]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.015358,-0.042278,-0.004721,-0.029233
std,0.999109,1.026366,0.942871,1.027145
min,-3.308178,-3.322973,-2.912168,-3.07002
25%,-0.648336,-0.738615,-0.608397,-0.793201
50%,0.002264,-0.085495,0.01999,-0.030466
75%,0.71944,0.660783,0.609754,0.69464
max,3.550452,3.896942,2.701694,3.188416


Select all rows having a value exceeding 3 or –3, use the **any** method on a Boolean DataFrame:

In [34]:
data[(data.abs() > 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
31,-0.070603,-3.248475,0.020411,-1.190478
65,-0.985728,3.896942,0.124875,-0.310974
223,-0.702728,-3.310425,-0.529013,-1.070876
279,-1.501543,-3.322973,-0.550701,0.679729
457,-3.308178,-1.439958,-0.637497,1.019623
483,0.77302,0.017844,0.440322,3.188416
507,-3.046169,-0.948939,-0.800483,1.176218
549,3.048777,-0.518991,0.152208,0.396135
791,0.940219,1.136673,-0.172422,-3.07002
795,1.107096,3.114501,0.524139,-2.165481


Set values based on these criteria:

In [35]:
data[data.abs() > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.014655,-0.042408,-0.004721,-0.029352
std,0.99456,1.020276,0.942871,1.026366
min,-3.0,-3.0,-2.912168,-3.0
25%,-0.648336,-0.738615,-0.608397,-0.793201
50%,0.002264,-0.085495,0.01999,-0.030466
75%,0.71944,0.660783,0.609754,0.69464
max,3.0,3.0,2.701694,3.0


### Permutation and Random Sampling 

Permuting (randomly reordering) a Series or the rows in a DataFrame using the **numpy.random.permutation** function:

In [36]:
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [37]:
sampler = np.random.permutation(5)
sampler

array([4, 0, 1, 3, 2])

In [38]:
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
3,21,22,23,24,25,26,27
2,14,15,16,17,18,19,20


In [39]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
3,21,22,23,24,25,26,27
2,14,15,16,17,18,19,20


Permutation of the columns with **axis="columns"** flag in **.take()** method:

In [40]:
column_sampler = np.random.permutation(7)
column_sampler

array([1, 2, 6, 5, 4, 3, 0])

In [41]:
df.take(column_sampler, axis="columns")

Unnamed: 0,1,2,6,5,4,3,0
0,1,2,6,5,4,3,0
1,8,9,13,12,11,10,7
2,15,16,20,19,18,17,14
3,22,23,27,26,25,24,21
4,29,30,34,33,32,31,28


Select a random subset without replacement:

In [42]:
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13
0,0,1,2,3,4,5,6


Generate a sample with replacement (to allow repeat choices):

In [43]:
choices = pd.Series([5, 7, -1, 6, 4])
choices.sample(n=10, replace=True)

4    4
3    6
4    4
2   -1
1    7
2   -1
0    5
3    6
2   -1
3    6
dtype: int64

### Computing Indicator/Dummy Variables

In [44]:
# Load movies.dat: "::"-separated records with no header row, so the column
# names are supplied explicitly. engine="python" is requested because the
# separator is longer than one character.
mnames = ["movie_id", "title", "genres"]
movies = pd.read_table(
    "./DataScienceProgramming/06-Data-Transformation-Visualization/movies.dat",
    sep="::",
    header=None,
    names=mnames,
    engine="python",
)
movies.head(10)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [45]:
dummies = movies["genres"].str.get_dummies("|")
dummies.iloc[:10, :6]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime
0,0,0,1,1,1,0
1,0,1,0,1,0,0
2,0,0,0,0,1,0
3,0,0,0,0,1,0
4,0,0,0,0,1,0
5,1,0,0,0,0,1
6,0,0,0,0,1,0
7,0,1,0,1,0,0
8,1,0,0,0,0,0
9,1,1,0,0,0,0


## 7.4 String Manipulation

- Python has long been a popular raw data manipulation language, in part due to its ease of use for string and text processing.

- Text operations made simple with the string object’s built-in methods.

- Regular expressions necessary for more complex pattern matching and text manipulations.

- Apply string and regular expressions concisely on whole arrays of data, while also handling missing data.

### Python Built-In String Object Methods

**Table 7.4: Python built-in string methods**

**Method**|	**Description**

**count**|	Return the number of nonoverlapping occurrences of substring in the string

**endswith**|	Return True if string ends with suffix

**startswith**|	Return True if string starts with prefix

**join**|	Use string as delimiter for concatenating a sequence of other strings

**index**|	Return starting index of the first occurrence of passed substring if found in the string; otherwise, raises ValueError if not found

**find**|	Return position of first character of first occurrence of substring in the string; like index, but returns –1 if not found

**rfind**|	Return position of first character of last occurrence of substring in the string; returns –1 if not found

**replace**|	Replace occurrences of string with another string

**strip, rstrip, lstrip**|  Trim whitespace, including newlines on both sides, on the right side, or on the left side, respectively

**split**|	Break string into list of substrings using passed delimiter

**lower**|	Convert alphabet characters to lowercase

**upper**|	Convert alphabet characters to uppercase

**casefold**|	Convert characters to lowercase, and convert any region-specific variable character combinations to a common comparable form

**ljust, rjust**|	Left justify or right justify, respectively; pad opposite side of string with spaces (or some other fill character) to return a string with a minimum width

In [46]:
val = "a,b,  guido"
val.split(",")

['a', 'b', '  guido']

In [47]:
pieces = [x.strip() for x in val.split(",")]
pieces

['a', 'b', 'guido']

In [48]:
first, second, third = pieces
first + "::" + second + "::" + third

'a::b::guido'

In [49]:
"::".join(pieces)

'a::b::guido'

In [50]:
"guido" in val

True

In [51]:
val.index(",")

1

In [52]:
val.find(":")

-1

In [53]:
val.count(",")

2

In [54]:
val.replace(",", "::")

'a::b::  guido'

In [55]:
val.replace(",", "")

'ab  guido'

### Regular Expressions

- Regular expressions provide a flexible way to search or match (often more complex) string patterns in text. 

- A single expression, commonly called a regex, is a string formed according to the regular expression language. 

- Python’s built-in re module is responsible for applying regular expressions to strings.


**Table 7.5: Regular expression methods**

**Method**|	**Description**

**findall**|	Return all nonoverlapping matching patterns in a string as a list.

**finditer**|	Like findall, but returns an iterator.

**match**|	Match pattern at start of string and optionally segment pattern components into groups; if the pattern matches, return a match object, and otherwise None.

**search**|	Scan string for match to pattern, returning a match object if so; unlike match, the match can be anywhere in the string as opposed to only at the beginning.

**split**|	Break string into pieces at each occurrence of pattern.

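**sub, subn**|	Replace occurrences of pattern in string with replacement expression (all of them by default, or at most **count** if given); **subn** does the same but returns a tuple of the new string and the number of substitutions made; use symbols **\1**, **\2**, ... to refer to match group elements in the replacement string.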


The regex describing one or more whitespace characters is **\s+**:

In [56]:
# `re` is already imported in the notebook's import cell, so it is not
# re-imported here. Split on runs of one or more whitespace characters
# (spaces and tabs in this sample).
text = "foo    bar\t baz  \tqux"
re.split(r"\s+", text)

['foo', 'bar', 'baz', 'qux']

Alternatively, the regular expression is first compiled, and then its **split** method is called on the passed text:

In [57]:
regex = re.compile(r"\s+")
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [58]:
regex.findall(text)

['    ', '\t ', '  \t']

**match** and **search** are closely related to **findall**:

- **findall** returns all matches in a string.

- **search** returns only the first match. 

- **match** only matches at the beginning of the string.

In [59]:
# Sample text: one "Name address" pair per line.
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""

# The character classes list uppercase letters only; lowercase input still
# matches because the pattern is compiled with re.IGNORECASE.
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"
regex = re.compile(pattern, re.IGNORECASE)

In [60]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [61]:
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [62]:
text[m.start():m.end()]

'dave@google.com'

In [63]:
print(regex.match(text))
print(regex.sub("REDACTED", text))

None
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED


Put parentheses around the parts of the pattern you want to segment:

In [64]:
# Same email pattern, with capture groups around the username, the domain
# name, and the domain suffix; compiled case-insensitively.
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
regex = re.compile(pattern, re.IGNORECASE)

A match object produced by this modified regex returns a tuple of the pattern components with its groups method:

In [65]:
m = regex.match("wesm@bright.net")
m.groups()

('wesm', 'bright', 'net')

In [66]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

**sub** also has access to groups in each match using special symbols like **\1** and **\2**.

In [67]:
print(regex.sub(r"Username: \1, Domain: \2, Suffix: \3", text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com


### String Functions in pandas

Series has array-oriented methods for string operations that skip over and propagate NA values. 


**Table 7.6: Partial listing of Series string methods**

**Method**	**Description**

cat	Concatenate strings element-wise with optional delimiter

contains	Return Boolean array if each string contains pattern/regex

count	Count occurrences of pattern

extract	Use a regular expression with groups to extract one or more strings from a Series of strings; the result will be a DataFrame with one column per group

endswith	Equivalent to x.endswith(pattern) for each element

startswith	Equivalent to x.startswith(pattern) for each element

findall	Compute list of all occurrences of pattern/regex for each string

get	Index into each element (retrieve i-th element)

isalnum	Equivalent to built-in str.isalnum

isalpha	Equivalent to built-in str.isalpha

isdecimal	Equivalent to built-in str.isdecimal

isdigit	Equivalent to built-in str.isdigit

islower	Equivalent to built-in str.islower

isnumeric	Equivalent to built-in str.isnumeric

isupper	Equivalent to built-in str.isupper

join	Join strings in each element of the Series with passed separator

len	Compute length of each string

lower, upper	Convert cases; equivalent to x.lower() or x.upper() for each element

match	Use re.match with the passed regular expression on each element, returning True or False whether it matches

pad	Add whitespace to left, right, or both sides of strings

center	Equivalent to pad(side="both")

repeat	Duplicate values (e.g., s.str.repeat(3) is equivalent to x * 3 for each string)

replace	Replace occurrences of pattern/regex with some other string

slice	Slice each string in the Series

split	Split strings on delimiter or regular expression

strip	Trim whitespace from both sides, including newlines

rstrip	Trim whitespace on right side

lstrip	Trim whitespace on left side

In [68]:
# Build the Series directly from the name -> email mapping so that `data` is
# only ever bound to a Series (not first to a dict, then to a Series).
# "Wes" has no address and is stored as NaN.
data = pd.Series({"Dave": "dave@google.com", "Steve": "steve@gmail.com",
                  "Rob": "rob@gmail.com", "Wes": np.nan})
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [69]:
data.isna()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [70]:
data.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [71]:
data_as_string_ext = data.astype('string')
data_as_string_ext

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                 <NA>
dtype: string

In [72]:
data_as_string_ext.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes       <NA>
dtype: boolean

Regular expressions can be used, too, along with any re options like **IGNORECASE**:

In [73]:
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [74]:
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]

In [75]:
matches.str.get(1)

Dave     google
Steve     gmail
Rob       gmail
Wes         NaN
dtype: object

The str.extract method will return the captured groups of a regular expression as a DataFrame:

In [76]:
data.str.extract(pattern, flags=re.IGNORECASE)

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Rob,rob,gmail,com
Wes,,,


## 7.5 Categorical Data

In [77]:
# Seeded generator so the random counts and weights are reproducible.
rng = np.random.default_rng(seed=12345)

fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)

# The dict is written in the final column order. 'count' is still drawn from
# the generator before 'weight', which fixes the values each column receives.
df = pd.DataFrame({
    'basket_id': np.arange(N),
    'fruit': fruits,
    'count': rng.integers(3, 15, size=N),
    'weight': rng.uniform(0, 4, size=N),
})
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,11,1.564438
1,1,orange,5,1.331256
2,2,apple,12,2.393235
3,3,apple,6,0.746937
4,4,apple,5,2.691024
5,5,orange,12,3.767211
6,6,apple,10,0.992983
7,7,apple,11,3.795525


In [78]:
fruit_cat = df['fruit'].astype('category')
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [79]:
c = fruit_cat.array
type(c)

pandas.core.arrays.categorical.Categorical

In [80]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [81]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [82]:
dict(enumerate(c.categories))

{0: 'apple', 1: 'orange'}

In [83]:
df['fruit'] = df['fruit'].astype('category')
df['fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']