In [1]:
import pandas as pd
import numpy as np

In [2]:
obj = pd.Series([-7, -5, 7, 4, 2, 0, 4])

In [3]:
obj

0   -7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [4]:
obj.rank(
    method="dense",
)

0    1.0
1    2.0
2    6.0
3    5.0
4    4.0
5    3.0
6    5.0
dtype: float64

In [5]:
obj.rank(method="first")

0    1.0
1    2.0
2    7.0
3    5.0
4    4.0
5    3.0
6    6.0
dtype: float64

In [6]:
frame = pd.DataFrame(
    {"b":[4.3, 7, -3, 2],
     "a":[0, 1, 0, 1],
     "c":[-2, 5, 8, -2.5]}
)

In [7]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [8]:
frame.rank(axis="columns",
           method="max",
           
          )

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [9]:
obj = pd.Series(np.arange(5),
                index=['a', 'a', 'b', 'b', 'c'])

In [10]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [11]:
obj.index.is_unique

False

In [12]:
df = pd.DataFrame(np.random.standard_normal((5, 3)),
                  index=['a', 'a', 'b', 'b', 'c'])

In [13]:
df

Unnamed: 0,0,1,2
a,-0.529194,0.618311,-1.888502
a,-0.814793,0.083848,1.361224
b,-0.021465,0.129387,-1.242669
b,0.065437,0.616025,0.340376
c,2.945844,0.545052,0.222376


In [14]:
df.loc["a"]

Unnamed: 0,0,1,2
a,-0.529194,0.618311,-1.888502
a,-0.814793,0.083848,1.361224


In [15]:
df.loc["c"]

0    2.945844
1    0.545052
2    0.222376
Name: c, dtype: float64

In [16]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])

In [17]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [18]:
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [19]:
df.sum(axis="index",
       # skipna=False,
      )

one    9.25
two   -5.80
dtype: float64

In [20]:
df.sum(axis=1,
       # skipna=False,
      )

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [21]:
df.idxmax()

one    b
two    d
dtype: object

In [22]:
df.describe().loc['mean']

one    3.083333
two   -2.900000
Name: mean, dtype: float64

In [23]:
# non-numeric data
obj = pd.Series(['a', 'a', 'b', 'c']*4)

In [24]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [25]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [26]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [27]:
pd.Series(obj).value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [28]:
data = pd.DataFrame({"qu1": [1, 3, 4, 3, 4],
                     "qu2": [2, 3, 1, 2, 3],
                     "qu3": [1, 5, 2, 4, 4]})

In [29]:
data.qu1.value_counts().sort_index()

qu1
1    1
3    2
4    2
Name: count, dtype: int64

In [30]:
data.value_counts()

qu1  qu2  qu3
1    2    1      1
3    2    4      1
     3    5      1
4    1    2      1
     3    4      1
Name: count, dtype: int64

In [31]:
import sys

In [32]:
data.to_csv(sys.stdout, index=False, columns=['qu1', 'qu3'], sep="|")

qu1|qu3
1|1
3|5
4|2
3|4
4|4


In [33]:
import sqlite3

In [34]:
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER);"""

In [35]:
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
con.commit()

OperationalError: table test already exists

In [36]:
data = [("Atlanta", "Georgia", 1.25, 6),
        ("Tallahassee", "Florida", 2.6, 3),
        ("Sacramento", "California", 1.7, 5)]

In [37]:
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

In [38]:
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()

In [39]:
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [40]:
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [41]:
df = pd.DataFrame(rows, columns=[x[0] for x in cursor.description])

In [42]:
df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
3,Atlanta,Georgia,1.25,6
4,Tallahassee,Florida,2.6,3
5,Sacramento,California,1.7,5
6,Atlanta,Georgia,1.25,6
7,Tallahassee,Florida,2.6,3
8,Sacramento,California,1.7,5
9,Atlanta,Georgia,1.25,6


In [43]:
import sqlalchemy as sqla

In [44]:
db = sqla.create_engine("sqlite:///mydata.sqlite")

In [45]:
df = pd.read_sql("SELECT * FROM test", db)

In [46]:
df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
3,Atlanta,Georgia,1.25,6
4,Tallahassee,Florida,2.6,3
5,Sacramento,California,1.7,5
6,Atlanta,Georgia,1.25,6
7,Tallahassee,Florida,2.6,3
8,Sacramento,California,1.7,5
9,Atlanta,Georgia,1.25,6


In [47]:
# drop duplicates
data = pd.DataFrame({"k1": ["one", "two"]*3 + ["two"],
                     "k2": [1, 1, 2, 3, 3, 4, 4]})

In [48]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [49]:
# data.duplicated()

In [50]:
# data = data.drop_duplicates()

In [51]:
# data

In [52]:
data.loc[:,'v1'] = range(7)

In [53]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [54]:
data.drop_duplicates(subset=['k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [55]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon", "pastrami", "corned beef", "bacon", "pastrami", "honey ham", "nova lox"],
                     "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [56]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [57]:
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox" : "Salmon"
}

In [58]:
data['animal'] = data["food"].map(meat_to_animal)

In [59]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,Salmon


In [60]:
data = pd.Series([1., -999., 2.0, -999.0, -1000, -1000, 3])

In [61]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5   -1000.0
6       3.0
dtype: float64

In [62]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    NaN
6    3.0
dtype: float64

In [63]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    0.0
6    3.0
dtype: float64

In [64]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    0.0
6    3.0
dtype: float64

In [65]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5   -1000.0
6       3.0
dtype: float64

In [66]:
# binning and discretization
ages = [18, 22, 25, 26, 21, 23, 37, 31, 61, 45, 41, 32]

In [67]:
bins = [18, 25, 35, 60, 100]
# bins = 5

In [68]:
age_categories = pd.cut(ages, bins)

In [69]:
age_categories

[NaN, (18.0, 25.0], (18.0, 25.0], (25.0, 35.0], (18.0, 25.0], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [70]:
ages

[18, 22, 25, 26, 21, 23, 37, 31, 61, 45, 41, 32]

In [71]:
df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])

In [72]:
df

Unnamed: 0,0,1
0,1.0,2.12
1,3.356,4.567


In [73]:
df_copy = df.copy()

In [74]:
df_copy.iloc[0,0] = pd.NA

In [75]:
df_copy

Unnamed: 0,0,1
0,,2.12
1,3.356,4.567


In [76]:
df_copy.map(lambda x: len(str(x)), na_action=None)

Unnamed: 0,0,1
0,3,4
1,5,5


In [77]:
df

Unnamed: 0,0,1
0,1.0,2.12
1,3.356,4.567


In [78]:
df.map(round, ndigits=1)

Unnamed: 0,0,1
0,1.0,2.1
1,3.4,4.6


In [79]:
data = pd.DataFrame(np.arange(12).reshape(3, 4),
                    index=['Computer Engineering', 'Civil Engineering', 'Electrical Engineering'],
                    columns=['Year One', 'Year Two', 'Year Three', 'Year Four']
                   )

In [80]:
data

Unnamed: 0,Year One,Year Two,Year Three,Year Four
Computer Engineering,0,1,2,3
Civil Engineering,4,5,6,7
Electrical Engineering,8,9,10,11


In [81]:
def transform(x):
    return x[:4].upper()

In [82]:
data.index = data.index.map(transform)

In [83]:
data

Unnamed: 0,Year One,Year Two,Year Three,Year Four
COMP,0,1,2,3
CIVI,4,5,6,7
ELEC,8,9,10,11


In [84]:
data = data.rename(index=str.title,
                   columns=str.upper,
                  )

In [85]:
data

Unnamed: 0,YEAR ONE,YEAR TWO,YEAR THREE,YEAR FOUR
Comp,0,1,2,3
Civi,4,5,6,7
Elec,8,9,10,11


In [86]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
      right=False,
      )

[[1.0, 3.0), [5.0, 7.006), [5.0, 7.006), [3.0, 5.0), [5.0, 7.006), [3.0, 5.0)]
Categories (3, interval[float64, left]): [[1.0, 3.0) < [3.0, 5.0) < [5.0, 7.006)]

In [87]:
# Binning with qcut
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 4, precision=2)

In [88]:
quartiles

[(0.67, 3.38], (-0.66, -0.023], (0.67, 3.38], (-2.94, -0.66], (-0.66, -0.023], ..., (-2.94, -0.66], (-0.66, -0.023], (-2.94, -0.66], (-0.023, 0.67], (-0.023, 0.67]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.94, -0.66] < (-0.66, -0.023] < (-0.023, 0.67] < (0.67, 3.38]]

In [89]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))

In [90]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.038539,0.030842,-0.025268,0.070313
std,1.022806,0.990678,0.994194,1.012894
min,-3.004494,-2.835869,-2.841264,-3.259322
25%,-0.726732,-0.613063,-0.688388,-0.587454
50%,-0.048788,0.029912,-0.050202,0.064662
75%,0.638077,0.755984,0.65619,0.754523
max,3.306997,2.710109,2.718868,3.144282


In [91]:
# Getting outliers where value is greater than three in any column
data[(data.abs() > 3).any(axis="columns")] 

Unnamed: 0,0,1,2,3
116,3.306997,2.710109,0.392845,-0.854552
416,3.111063,0.167879,0.431323,-0.335082
445,-0.801733,1.590613,1.510086,3.144282
467,0.314506,-0.46339,-1.055473,-3.259322
496,-0.389951,-0.385804,-0.474387,3.107553
769,-2.268234,-0.540221,1.569961,-3.070551
891,-3.004494,-0.128474,1.324935,-2.220377
994,0.176676,-0.231734,-0.765129,3.056297


In [92]:
# Capping the outlier values to 3
data[data.abs() > 3] = np.sign(data) * 3

In [93]:
data[(data.abs() == 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
116,3.0,2.710109,0.392845,-0.854552
416,3.0,0.167879,0.431323,-0.335082
445,-0.801733,1.590613,1.510086,3.0
467,0.314506,-0.46339,-1.055473,-3.0
496,-0.389951,-0.385804,-0.474387,3.0
769,-2.268234,-0.540221,1.569961,-3.0
891,-3.0,-0.128474,1.324935,-2.220377
994,0.176676,-0.231734,-0.765129,3.0


In [94]:
# Permutations
df = pd.DataFrame(np.arange(5*7).reshape((5, 7)))

In [95]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [96]:
sampler = np.random.permutation(5)

In [97]:
sampler

array([2, 4, 1, 0, 3])

In [98]:
# Sampling
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
4,28,29,30,31,32,33,34
1,7,8,9,10,11,12,13
0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27


In [99]:
df = pd.DataFrame({"key": ['b', 'b', 'a', 'c', 'a', 'b'],
                   "data1": range(6)})

In [100]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [101]:
pd.get_dummies(df["key"])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [102]:
s = pd.Series(list('abca'))

In [103]:
s

0    a
1    b
2    c
3    a
dtype: object

In [104]:
pd.get_dummies(s)

Unnamed: 0,a,b,c
0,True,False,False
1,False,True,False
2,False,False,True
3,True,False,False


In [105]:
s1 = ['a', 'b', np.nan]

In [106]:
pd.get_dummies(s1)

Unnamed: 0,a,b
0,True,False
1,False,True
2,False,False


In [107]:
pd.get_dummies(s1,
               dummy_na=True,
              prefix="key",
              prefix_sep = "-",
              )

Unnamed: 0,key-a,key-b,key-nan
0,True,False,False
1,False,True,False
2,False,False,True


In [108]:
s1 = pd.Series(["a|b", "a", "a|c"])

In [109]:
s1

0    a|b
1      a
2    a|c
dtype: object

In [110]:
s1.str.get_dummies()

Unnamed: 0,a,b,c
0,1,1,0
1,1,0,0
2,1,0,1


In [111]:
s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())

In [112]:
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [113]:
s.isna()

0    False
1    False
2    False
3     True
dtype: bool

In [114]:
s.dtype

Int64Dtype()

In [115]:
s[3]

<NA>

In [116]:
s[3] is pd.NA

True

In [117]:
s = pd.Series(["one", "two", None, "three'"], dtype=pd.StringDtype())

In [118]:
s

0       one
1       two
2      <NA>
3    three'
dtype: string

In [119]:
df = pd.DataFrame({"A": [1, 2, None, 4],
                   "B": ["one", "two", "three", None],
                   "C": [False, None, False, True]})

In [120]:
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


In [121]:
df['A'] = df['A'].astype("Int64")
df['B'] = df['B'].astype("string")
df["C"] = df["C"].astype("boolean")

In [122]:
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


In [123]:
import re

In [124]:
text = "foo bar\t baz \tqux"

In [125]:
re.split(r"\s+", text)

['foo', 'bar', 'baz', 'qux']

In [126]:
regex = re.compile(r"\s+")

In [127]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [128]:
regex.findall(text)

[' ', '\t ', ' \t']

In [241]:
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

In [242]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
steven.kakaire@mak.ac.ug
Ryan ryan@yahoo.com"""

In [243]:
text

'Dave dave@google.com\nSteve steve@gmail.com\nRob rob@gmail.com\nsteven.kakaire@mak.ac.ug\nRyan ryan@yahoo.com'

In [244]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [245]:
regex.findall(text)

['dave@google.com',
 'steve@gmail.com',
 'rob@gmail.com',
 'steven.kakaire@mak.ac.ug',
 'ryan@yahoo.com']

In [246]:
print

<function print(*args, sep=' ', end='\n', file=None, flush=False)>

In [247]:
m = regex.search(text)

In [248]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [249]:
m.start()

5

In [250]:
m.end()

20

In [251]:
text[m.start():m.end()]

'dave@google.com'

In [316]:
indexPattern = r'U+[0-9]+/[0-9]+'

In [320]:
regexIndex = re.compile(indexPattern, flags=re.IGNORECASE)

In [325]:
regexIndex.match('U0391/673 LUKABWE DOUGLAS MARKM 2022 42 MUKONO ACC BSC. Accounting (EVE)')[0]

'U0391/673'

In [437]:
student = 'U0391/673 LUKABWE DOUGLAS MARKM 2022 42 MUKONO ACC BSC. Accounting (EVE)'

In [442]:
pd.Series(student.split()).str.contains('M')

0     False
1     False
2     False
3      True
4     False
5     False
6      True
7     False
8     False
9     False
10    False
dtype: bool

In [406]:
namePattern = r'([A-Z]+){2,10}'

In [407]:
regexName = re.compile(namePattern, flags=re.IGNORECASE)

In [408]:
regexName.findall(student)

['E', 'S', 'M', 'O', 'C', 'C', 'g', 'E']

In [286]:
student = 'U0391/673 LUKABWE DOUGLAS MARKM 2022 42 MUKONO ACC BSC. Accounting (EVE)'

In [287]:
regex.match('U0391/673 LUKABWE DOUGLAS MARKM 2022 42 MUKONO ACC BSC. Accounting (EVE)')[0]

'U0391/673'

In [288]:
pattern = r"\b[a-zA-Z\\]+"

In [289]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [290]:
m = re.match(pattern, student)

In [291]:
m

<re.Match object; span=(0, 1), match='U'>

In [292]:
yearPattern = r"\b[0-9]{4}"

In [293]:
regex = re.compile(yearPattern)

In [311]:
regex.findall(student)

['2022']

In [296]:
student

'U0391/673 LUKABWE DOUGLAS MARKM 2022 42 MUKONO ACC BSC. Accounting (EVE)'

In [297]:
# print(regex.sub("REDACTED", text))

In [298]:
# pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9._]+)\.([A-Z]{2,4})"

In [299]:
# regex = re.compile(pattern, flags=re.IGNORECASE)

In [300]:
# m = regex.match("wesm@right.net")

In [301]:
# m.groups()

In [302]:
# kakaire = regex.match("steven.kakaire@mak.ac.ug")

In [303]:
# kakaire.groups()

In [304]:
# regex.findall(text)

In [305]:
# raw string needs not to be written on multiple lines.
# print(regex.sub(r"Username: \1, Domain: \2, Suffix: \3",
#                 text))

In [409]:
# print(regex.sub(r"Username: \1, Domain: \2, Suffix: \3", text))

<h1>STRING MANIPULATION</h1>
<p>Real world data is messy and requires a lot of string manipulation.</p>

In [421]:
data = {"Hawa Jamal": "hawa.jamal@gmail.com",
        "King Kakaire": "steven.kakaire@mak.ac.ug",
        "Nalubega Daisy Mercy": "daisy.nalubega@yahoo.com",
        "rashidah Naluwongo": np.nan
       }

In [422]:
data = pd.Series(data)

In [423]:
data

Hawa Jamal                  hawa.jamal@gmail.com
King Kakaire            steven.kakaire@mak.ac.ug
Nalubega Daisy Mercy    daisy.nalubega@yahoo.com
rashidah Naluwongo                           NaN
dtype: object

In [427]:
data.str.contains("com")

Hawa Jamal               True
King Kakaire            False
Nalubega Daisy Mercy     True
rashidah Naluwongo        NaN
dtype: object

In [428]:
s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])

In [429]:
s1.str.contains('og', regex=False)

0    False
1     True
2    False
3    False
4      NaN
dtype: object

In [447]:
s1.loc[s1.str.contains('house | parrot', na=False)]

2    house and parrot
dtype: object

In [450]:
# Specifying na to be True
s1.str.contains('og', na=False, regex=True)

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [451]:
# ignoring sensitivity
s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)

0    False
1    False
2     True
3    False
4      NaN
dtype: object

In [453]:
s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])

In [454]:
s2.str.contains('.0', regex=True)

0     True
1     True
2    False
3     True
4    False
dtype: bool

In [456]:
s2.str.contains('//d', regex=True)

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [458]:
data

Hawa Jamal                  hawa.jamal@gmail.com
King Kakaire            steven.kakaire@mak.ac.ug
Nalubega Daisy Mercy    daisy.nalubega@yahoo.com
rashidah Naluwongo                           NaN
dtype: object

In [459]:
data_as_string_ext = data.astype('string')

In [460]:
data_as_string_ext

Hawa Jamal                  hawa.jamal@gmail.com
King Kakaire            steven.kakaire@mak.ac.ug
Nalubega Daisy Mercy    daisy.nalubega@yahoo.com
rashidah Naluwongo                          <NA>
dtype: string

In [463]:
data_as_string_ext.str.contains("mak")

Hawa Jamal              False
King Kakaire             True
Nalubega Daisy Mercy    False
rashidah Naluwongo       <NA>
dtype: boolean

In [470]:
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

In [472]:
data.str.findall(pattern, flags=re.IGNORECASE)

Hawa Jamal                  [(hawa.jamal, gmail, com)]
King Kakaire            [(steven.kakaire, mak.ac, ug)]
Nalubega Daisy Mercy    [(daisy.nalubega, yahoo, com)]
rashidah Naluwongo                                 NaN
dtype: object

In [475]:
data.str.findall(pattern, flags=re.IGNORECASE).iloc[0]

[('hawa.jamal', 'gmail', 'com')]

In [478]:
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]

In [479]:
matches.str.get(1)

Hawa Jamal               gmail
King Kakaire            mak.ac
Nalubega Daisy Mercy     yahoo
rashidah Naluwongo         NaN
dtype: object

In [482]:
data.str.extract(pattern, flags=re.IGNORECASE)

Unnamed: 0,0,1,2
Hawa Jamal,hawa.jamal,gmail,com
King Kakaire,steven.kakaire,mak.ac,ug
Nalubega Daisy Mercy,daisy.nalubega,yahoo,com
rashidah Naluwongo,,,


<h1>Categoriacal Data</h1>

In [493]:
values = pd.Series(['apple', 'orange', 'apple','apple']*2)

In [494]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [495]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

In [507]:
values = pd.Series([0, 1, 0, 0, 0]*2)

In [497]:
values

0    0
1    1
2    0
3    0
4    0
5    0
6    1
7    0
8    0
9    0
dtype: int64

In [498]:
dim = pd.Series(['apple', 'orange'])

In [508]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
0     apple
dtype: object

In [509]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2

In [510]:
N = len(fruits)

In [511]:
fruits

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']

In [512]:
rng = np.random.default_rng(seed=1234)

In [513]:
df = pd.DataFrame({'fruit': fruits,
                   'basket_id': np.arange(N),
                   'count': rng.integers(3, 15, N),
                   'weight': rng.uniform(0, 4, size=N)},
                  columns=['basket_id', 'fruit', 'count', 'weight'])

In [514]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,14,1.276388
1,1,orange,14,0.472365
2,2,apple,14,0.967065
3,3,apple,7,1.274136
4,4,apple,5,3.856317
5,5,orange,14,1.054599
6,6,apple,4,1.764024
7,7,apple,6,2.439483


In [515]:
df['fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: object

In [516]:
fruit_cat = df['fruit'].astype('category')

In [517]:
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [518]:
c = fruit_cat.array

In [519]:
c

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']
Categories (2, object): ['apple', 'orange']

In [520]:
type(c)

pandas.core.arrays.categorical.Categorical