In [629]:
import pandas as pd
import numpy as np

In [630]:
obj = pd.Series([-7, -5, 7, 4, 2, 0, 4])

In [631]:
obj

0   -7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [632]:
obj.rank(
    method="dense",
)

0    1.0
1    2.0
2    6.0
3    5.0
4    4.0
5    3.0
6    5.0
dtype: float64

In [633]:
obj.rank(method="first")

0    1.0
1    2.0
2    7.0
3    5.0
4    4.0
5    3.0
6    6.0
dtype: float64

In [634]:
frame = pd.DataFrame(
    {"b":[4.3, 7, -3, 2],
     "a":[0, 1, 0, 1],
     "c":[-2, 5, 8, -2.5]}
)

In [635]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [636]:
frame.rank(axis="columns",
           method="max",
           
          )

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [637]:
obj = pd.Series(np.arange(5),
                index=['a', 'a', 'b', 'b', 'c'])

In [638]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [639]:
obj.index.is_unique

False

In [640]:
df = pd.DataFrame(np.random.standard_normal((5, 3)),
                  index=['a', 'a', 'b', 'b', 'c'])

In [641]:
df

Unnamed: 0,0,1,2
a,0.094226,-0.510407,-1.474774
a,0.86375,-0.946003,1.099909
b,-1.256394,-1.017517,-0.575038
b,-0.147204,-0.828725,-0.12321
c,0.016188,0.347791,-0.527993


In [642]:
df.loc["a"]

Unnamed: 0,0,1,2
a,0.094226,-0.510407,-1.474774
a,0.86375,-0.946003,1.099909


In [643]:
df.loc["c"]

0    0.016188
1    0.347791
2   -0.527993
Name: c, dtype: float64

In [644]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])

In [645]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [646]:
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [647]:
df.sum(axis="index",
       # skipna=False,
      )

one    9.25
two   -5.80
dtype: float64

In [648]:
df.sum(axis=1,
       # skipna=False,
      )

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [649]:
df.idxmax()

one    b
two    d
dtype: object

In [650]:
df.describe().loc['mean']

one    3.083333
two   -2.900000
Name: mean, dtype: float64

In [651]:
# non-numeric data
obj = pd.Series(['a', 'a', 'b', 'c']*4)

In [652]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [653]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [654]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [655]:
pd.Series(obj).value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [656]:
data = pd.DataFrame({"qu1": [1, 3, 4, 3, 4],
                     "qu2": [2, 3, 1, 2, 3],
                     "qu3": [1, 5, 2, 4, 4]})

In [657]:
data.qu1.value_counts().sort_index()

qu1
1    1
3    2
4    2
Name: count, dtype: int64

In [658]:
data.value_counts()

qu1  qu2  qu3
1    2    1      1
3    2    4      1
     3    5      1
4    1    2      1
     3    4      1
Name: count, dtype: int64

In [659]:
import sys

In [660]:
data.to_csv(sys.stdout, index=False, columns=['qu1', 'qu3'], sep="|")

qu1|qu3
1|1
3|5
4|2
3|4
4|4


In [661]:
import sqlite3

In [662]:
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER);"""

In [663]:
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
con.commit()

OperationalError: table test already exists

In [664]:
data = [("Atlanta", "Georgia", 1.25, 6),
        ("Tallahassee", "Florida", 2.6, 3),
        ("Sacramento", "California", 1.7, 5)]

In [665]:
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

In [666]:
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()

In [667]:
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5),
 ('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [668]:
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [669]:
df = pd.DataFrame(rows, columns=[x[0] for x in cursor.description])

In [670]:
df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
3,Atlanta,Georgia,1.25,6
4,Tallahassee,Florida,2.6,3
5,Sacramento,California,1.7,5
6,Atlanta,Georgia,1.25,6
7,Tallahassee,Florida,2.6,3
8,Sacramento,California,1.7,5
9,Atlanta,Georgia,1.25,6


In [671]:
import sqlalchemy as sqla

In [672]:
db = sqla.create_engine("sqlite:///mydata.sqlite")

In [673]:
df = pd.read_sql("SELECT * FROM test", db)

In [674]:
df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
3,Atlanta,Georgia,1.25,6
4,Tallahassee,Florida,2.6,3
5,Sacramento,California,1.7,5
6,Atlanta,Georgia,1.25,6
7,Tallahassee,Florida,2.6,3
8,Sacramento,California,1.7,5
9,Atlanta,Georgia,1.25,6


In [675]:
# drop duplicates
data = pd.DataFrame({"k1": ["one", "two"]*3 + ["two"],
                     "k2": [1, 1, 2, 3, 3, 4, 4]})

In [676]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [677]:
# data.duplicated()

In [678]:
# data = data.drop_duplicates()

In [679]:
# data

In [680]:
data.loc[:,'v1'] = range(7)

In [681]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [682]:
data.drop_duplicates(subset=['k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [683]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon", "pastrami", "corned beef", "bacon", "pastrami", "honey ham", "nova lox"],
                     "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [684]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [685]:
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox" : "Salmon"
}

In [686]:
data['animal'] = data["food"].map(meat_to_animal)

In [687]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,Salmon


In [688]:
data = pd.Series([1., -999., 2.0, -999.0, -1000, -1000, 3])

In [689]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5   -1000.0
6       3.0
dtype: float64

In [690]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    NaN
6    3.0
dtype: float64

In [691]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    0.0
6    3.0
dtype: float64

In [692]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    0.0
6    3.0
dtype: float64

In [693]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5   -1000.0
6       3.0
dtype: float64

In [694]:
# binning and discretization
ages = [18, 22, 25, 26, 21, 23, 37, 31, 61, 45, 41, 32]

In [695]:
bins = [18, 25, 35, 60, 100]
# bins = 5

In [696]:
age_categories = pd.cut(ages, bins)

In [697]:
age_categories

[NaN, (18.0, 25.0], (18.0, 25.0], (25.0, 35.0], (18.0, 25.0], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [698]:
ages

[18, 22, 25, 26, 21, 23, 37, 31, 61, 45, 41, 32]

In [699]:
df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])

In [700]:
df

Unnamed: 0,0,1
0,1.0,2.12
1,3.356,4.567


In [701]:
df_copy = df.copy()

In [702]:
df_copy.iloc[0,0] = pd.NA

In [703]:
df_copy

Unnamed: 0,0,1
0,,2.12
1,3.356,4.567


In [704]:
df_copy.map(lambda x: len(str(x)), na_action=None)

Unnamed: 0,0,1
0,3,4
1,5,5


In [705]:
df

Unnamed: 0,0,1
0,1.0,2.12
1,3.356,4.567


In [706]:
df.map(round, ndigits=1)

Unnamed: 0,0,1
0,1.0,2.1
1,3.4,4.6


In [707]:
data = pd.DataFrame(np.arange(12).reshape(3, 4),
                    index=['Computer Engineering', 'Civil Engineering', 'Electrical Engineering'],
                    columns=['Year One', 'Year Two', 'Year Three', 'Year Four']
                   )

In [708]:
data

Unnamed: 0,Year One,Year Two,Year Three,Year Four
Computer Engineering,0,1,2,3
Civil Engineering,4,5,6,7
Electrical Engineering,8,9,10,11


In [709]:
def transform(x):
    return x[:4].upper()

In [710]:
data.index = data.index.map(transform)

In [711]:
data

Unnamed: 0,Year One,Year Two,Year Three,Year Four
COMP,0,1,2,3
CIVI,4,5,6,7
ELEC,8,9,10,11


In [712]:
data = data.rename(index=str.title,
                   columns=str.upper,
                  )

In [713]:
data

Unnamed: 0,YEAR ONE,YEAR TWO,YEAR THREE,YEAR FOUR
Comp,0,1,2,3
Civi,4,5,6,7
Elec,8,9,10,11


In [714]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
      right=False,
      )

[[1.0, 3.0), [5.0, 7.006), [5.0, 7.006), [3.0, 5.0), [5.0, 7.006), [3.0, 5.0)]
Categories (3, interval[float64, left]): [[1.0, 3.0) < [3.0, 5.0) < [5.0, 7.006)]

In [715]:
# Binning with qcut
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 4, precision=2)

In [716]:
quartiles

[(0.72, 2.79], (-3.67, -0.67], (-0.67, 0.014], (-0.67, 0.014], (-3.67, -0.67], ..., (0.72, 2.79], (-0.67, 0.014], (0.72, 2.79], (0.014, 0.72], (-0.67, 0.014]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.67, -0.67] < (-0.67, 0.014] < (0.014, 0.72] < (0.72, 2.79]]

In [717]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))

In [718]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.005453,-0.005832,-0.002175,0.008111
std,1.014474,0.984346,1.012143,0.979474
min,-3.240728,-3.289581,-3.834386,-3.282688
25%,-0.701635,-0.628296,-0.696157,-0.642337
50%,0.009248,-0.025322,-0.013096,-0.01159
75%,0.66684,0.598637,0.642694,0.656001
max,2.817363,3.239479,2.981201,4.199049


In [719]:
# Getting outliers where value is greater than three in any column
data[(data.abs() > 3).any(axis="columns")] 

Unnamed: 0,0,1,2,3
5,-0.084051,0.287708,0.899454,4.199049
223,0.870339,1.467386,-3.04182,-0.082682
444,-3.240728,-0.660331,0.909348,-1.3487
456,0.057741,1.693604,-3.834386,0.534444
492,-0.37732,-0.657573,-0.969631,-3.282688
493,-0.582247,3.239479,-0.519771,-0.031794
509,-0.817823,-0.69994,-3.300493,-0.006072
847,-3.002965,1.746794,1.075133,-0.197581
879,0.01728,0.456067,-0.931064,3.54696
911,-0.950384,-3.289581,-1.583426,-0.627678


In [720]:
# Capping the outlier values to 3
data[data.abs() > 3] = np.sign(data) * 3

In [721]:
data[(data.abs() == 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
5,-0.084051,0.287708,0.899454,3.0
223,0.870339,1.467386,-3.0,-0.082682
444,-3.0,-0.660331,0.909348,-1.3487
456,0.057741,1.693604,-3.0,0.534444
492,-0.37732,-0.657573,-0.969631,-3.0
493,-0.582247,3.0,-0.519771,-0.031794
509,-0.817823,-0.69994,-3.0,-0.006072
847,-3.0,1.746794,1.075133,-0.197581
879,0.01728,0.456067,-0.931064,3.0
911,-0.950384,-3.0,-1.583426,-0.627678


In [722]:
# Permutations
df = pd.DataFrame(np.arange(5*7).reshape((5, 7)))

In [723]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [724]:
sampler = np.random.permutation(5)

In [725]:
sampler

array([3, 4, 1, 2, 0])

In [726]:
# Sampling
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
0,0,1,2,3,4,5,6


In [727]:
df = pd.DataFrame({"key": ['b', 'b', 'a', 'c', 'a', 'b'],
                   "data1": range(6)})

In [728]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [729]:
pd.get_dummies(df["key"])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [730]:
s = pd.Series(list('abca'))

In [731]:
s

0    a
1    b
2    c
3    a
dtype: object

In [732]:
pd.get_dummies(s)

Unnamed: 0,a,b,c
0,True,False,False
1,False,True,False
2,False,False,True
3,True,False,False


In [733]:
s1 = ['a', 'b', np.nan]

In [734]:
pd.get_dummies(s1)

Unnamed: 0,a,b
0,True,False
1,False,True
2,False,False


In [735]:
pd.get_dummies(s1,
               dummy_na=True,
              prefix="key",
              prefix_sep = "-",
              )

Unnamed: 0,key-a,key-b,key-nan
0,True,False,False
1,False,True,False
2,False,False,True


In [736]:
s1 = pd.Series(["a|b", "a", "a|c"])

In [737]:
s1

0    a|b
1      a
2    a|c
dtype: object

In [738]:
s1.str.get_dummies()

Unnamed: 0,a,b,c
0,1,1,0
1,1,0,0
2,1,0,1


In [739]:
s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())

In [740]:
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [741]:
s.isna()

0    False
1    False
2    False
3     True
dtype: bool

In [742]:
s.dtype

Int64Dtype()

In [743]:
s[3]

<NA>

In [744]:
s[3] is pd.NA

True

In [745]:
s = pd.Series(["one", "two", None, "three'"], dtype=pd.StringDtype())

In [746]:
s

0       one
1       two
2      <NA>
3    three'
dtype: string

In [747]:
df = pd.DataFrame({"A": [1, 2, None, 4],
                   "B": ["one", "two", "three", None],
                   "C": [False, None, False, True]})

In [748]:
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


In [749]:
df['A'] = df['A'].astype("Int64")
df['B'] = df['B'].astype("string")
df["C"] = df["C"].astype("boolean")

In [750]:
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


In [751]:
import re

In [752]:
text = "foo bar\t baz \tqux"

In [753]:
re.split(r"\s+", text)

['foo', 'bar', 'baz', 'qux']

In [754]:
regex = re.compile(r"\s+")

In [755]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [756]:
regex.findall(text)

[' ', '\t ', ' \t']

In [757]:
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

In [758]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""

In [759]:
text

'Dave dave@google.com\nSteve steve@gmail.com\nRob rob@gmail.com\nRyan ryan@yahoo.com'

In [760]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [761]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [762]:
m = regex.search(text)

In [763]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [764]:
m.start()

5

In [765]:
m.end()

20

In [766]:
text[m.start():m.end()]

'dave@google.com'

In [773]:
pattern = r'U+[0-9]+/[0-9]+'

In [774]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [776]:
regex.match('U0391/673 LUKABWE DOUGLAS MARKM 2022 42 MUKONO ACC BSC. Accounting (EVE)')[0]

'U0391/673'

In [812]:
namePattern = r'[A-Z]+\s{2, 5}'

In [813]:
student = 'U0391/673 LUKABWE DOUGLAS MARKM 2022 42 MUKONO ACC BSC. Accounting (EVE)'

In [772]:
regex.match('U0391/673 LUKABWE DOUGLAS MARKM 2022 42 MUKONO ACC BSC. Accounting (EVE)')[0]

'U0391/673'

In [1232]:
pattern = r"\s([a-zA-Z]+){5}"

In [1233]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [1234]:
m = re.match(pattern, student)

In [1235]:
m

In [1236]:
yearPattern = r"\b[0-9]{4}"

In [1237]:
regex = re.compile(yearPattern)

In [1238]:
m = regex.findall(student)

In [1239]:
m

['2022']

In [1240]:
student

'U0391/673 LUKABWE DOUGLAS MARKM 2022 42 MUKONO ACC BSC. Accounting (EVE)'