In [50]:
import pandas as pd
import numpy as np

In [51]:
obj = pd.Series([-7, -5, 7, 4, 2, 0, 4])

In [52]:
obj

0   -7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [53]:
obj.rank(
    method="dense",
)

0    1.0
1    2.0
2    6.0
3    5.0
4    4.0
5    3.0
6    5.0
dtype: float64

In [54]:
obj.rank(method="first")

0    1.0
1    2.0
2    7.0
3    5.0
4    4.0
5    3.0
6    6.0
dtype: float64

In [55]:
frame = pd.DataFrame(
    {"b":[4.3, 7, -3, 2],
     "a":[0, 1, 0, 1],
     "c":[-2, 5, 8, -2.5]}
)

In [56]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [57]:
frame.rank(axis="columns",
           method="max",
           
          )

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [58]:
obj = pd.Series(np.arange(5),
                index=['a', 'a', 'b', 'b', 'c'])

In [59]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [60]:
obj.index.is_unique

False

In [61]:
df = pd.DataFrame(np.random.standard_normal((5, 3)),
                  index=['a', 'a', 'b', 'b', 'c'])

In [62]:
df

Unnamed: 0,0,1,2
a,-0.073834,0.175518,2.124127
a,0.1741,0.593354,-0.881286
b,-1.33705,-0.169525,-1.636911
b,-0.441938,0.016174,-0.851091
c,-0.061377,-1.704079,1.475307


In [63]:
df.loc["a"]

Unnamed: 0,0,1,2
a,-0.073834,0.175518,2.124127
a,0.1741,0.593354,-0.881286


In [64]:
df.loc["c"]

0   -0.061377
1   -1.704079
2    1.475307
Name: c, dtype: float64

In [65]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])

In [66]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [67]:
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [68]:
df.sum(axis="index",
       # skipna=False,
      )

one    9.25
two   -5.80
dtype: float64

In [69]:
df.sum(axis=1,
       # skipna=False,
      )

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [70]:
df.idxmax()

one    b
two    d
dtype: object

In [71]:
df.describe().loc['mean']

one    3.083333
two   -2.900000
Name: mean, dtype: float64

In [72]:
# non-numeric data
obj = pd.Series(['a', 'a', 'b', 'c']*4)

In [73]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [74]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [75]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [76]:
pd.Series(obj).value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [77]:
data = pd.DataFrame({"qu1": [1, 3, 4, 3, 4],
                     "qu2": [2, 3, 1, 2, 3],
                     "qu3": [1, 5, 2, 4, 4]})

In [79]:
data.qu1.value_counts().sort_index()

qu1
1    1
3    2
4    2
Name: count, dtype: int64

In [80]:
data.value_counts()

qu1  qu2  qu3
1    2    1      1
3    2    4      1
     3    5      1
4    1    2      1
     3    4      1
Name: count, dtype: int64

In [81]:
import sys

In [82]:
data.to_csv(sys.stdout, index=False, columns=['qu1', 'qu3'], sep="|")

qu1|qu3
1|1
3|5
4|2
3|4
4|4


In [83]:
import sqlite3

In [84]:
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER);"""

In [88]:
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
con.commit()

OperationalError: table test already exists

In [None]:
data = [("Atlanta", "Georgia", 1.25, 6),
        ("Tallahassee", "Florida", 2.6, 3),
        ("Sacramento", "California", 1.7, 5)]

In [None]:
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

In [None]:
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()

In [None]:
rows

In [None]:
cursor.description

In [None]:
df = pd.DataFrame(rows, columns=[x[0] for x in cursor.description])

In [None]:
df

In [None]:
import sqlalchemy as sqla

In [None]:
db = sqla.create_engine("sqlite:///mydata.sqlite")

In [None]:
df = pd.read_sql("SELECT * FROM test", db)

In [None]:
df

In [None]:
# drop duplicates
data = pd.DataFrame({"k1": ["one", "two"]*3 + ["two"],
                     "k2": [1, 1, 2, 3, 3, 4, 4]})

In [None]:
data

In [None]:
# data.duplicated()

In [None]:
# data = data.drop_duplicates()

In [None]:
# data

In [None]:
data.loc[:,'v1'] = range(7)

In [None]:
data

In [None]:
data.drop_duplicates(subset=['k2'])

In [None]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon", "pastrami", "corned beef", "bacon", "pastrami", "honey ham", "nova lox"],
                     "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [None]:
data

In [None]:
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox" : "Salmon"
}

In [None]:
data['animal'] = data["food"].map(meat_to_animal)

In [None]:
data

In [None]:
data = pd.Series([1., -999., 2.0, -999.0, -1000, -1000, 3])

In [None]:
data

In [None]:
data.replace([-999, -1000], np.nan)

In [None]:
data.replace([-999, -1000], [np.nan, 0])

In [None]:
data.replace({-999: np.nan, -1000: 0})

In [None]:
data

In [None]:
# binning and discretization
ages = [18, 22, 25, 26, 21, 23, 37, 31, 61, 45, 41, 32]

In [None]:
bins = [18, 25, 35, 60, 100]
# bins = 5

In [None]:
age_categories = pd.cut(ages, bins)

In [None]:
age_categories

In [None]:
ages

In [89]:
df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])

In [90]:
df

Unnamed: 0,0,1
0,1.0,2.12
1,3.356,4.567


In [91]:
df_copy = df.copy()

In [92]:
df_copy.iloc[0,0] = pd.NA

In [93]:
df_copy

Unnamed: 0,0,1
0,,2.12
1,3.356,4.567


In [94]:
df_copy.map(lambda x: len(str(x)), na_action=None)

Unnamed: 0,0,1
0,3,4
1,5,5


In [95]:
df

Unnamed: 0,0,1
0,1.0,2.12
1,3.356,4.567


In [96]:
df.map(round, ndigits=1)

Unnamed: 0,0,1
0,1.0,2.1
1,3.4,4.6


In [97]:
data = pd.DataFrame(np.arange(12).reshape(3, 4),
                    index=['Computer Engineering', 'Civil Engineering', 'Electrical Engineering'],
                    columns=['Year One', 'Year Two', 'Year Three', 'Year Four']
                   )

In [98]:
data

Unnamed: 0,Year One,Year Two,Year Three,Year Four
Computer Engineering,0,1,2,3
Civil Engineering,4,5,6,7
Electrical Engineering,8,9,10,11


In [99]:
def transform(x):
    return x[:4].upper()

In [100]:
data.index = data.index.map(transform)

In [101]:
data

Unnamed: 0,Year One,Year Two,Year Three,Year Four
COMP,0,1,2,3
CIVI,4,5,6,7
ELEC,8,9,10,11


In [102]:
data = data.rename(index=str.title,
                   columns=str.upper,
                  )

In [103]:
data

Unnamed: 0,YEAR ONE,YEAR TWO,YEAR THREE,YEAR FOUR
Comp,0,1,2,3
Civi,4,5,6,7
Elec,8,9,10,11


In [104]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
      right=False,
      )

[[1.0, 3.0), [5.0, 7.006), [5.0, 7.006), [3.0, 5.0), [5.0, 7.006), [3.0, 5.0)]
Categories (3, interval[float64, left]): [[1.0, 3.0) < [3.0, 5.0) < [5.0, 7.006)]

In [105]:
# Binning with qcut
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 4, precision=2)

In [106]:
quartiles

[(-0.71, -0.071], (-0.071, 0.61], (-3.4, -0.71], (-3.4, -0.71], (-3.4, -0.71], ..., (0.61, 3.04], (-0.71, -0.071], (-3.4, -0.71], (-0.71, -0.071], (-0.071, 0.61]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.4, -0.71] < (-0.71, -0.071] < (-0.071, 0.61] < (0.61, 3.04]]

In [114]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))

In [115]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.00717,0.028418,-0.015948,0.006705
std,0.989475,1.012638,0.999225,1.011584
min,-3.863079,-3.206427,-2.990626,-2.8291
25%,-0.682522,-0.660944,-0.727271,-0.689514
50%,0.015922,0.040501,0.021965,-0.002855
75%,0.68727,0.716144,0.706774,0.707296
max,2.586903,3.27495,3.162104,4.229807


In [116]:
# Getting outliers where value is greater than three in any column
data[(data.abs() > 3).any(axis="columns")] 

Unnamed: 0,0,1,2,3
78,-0.23964,-3.014826,-0.329446,-0.904177
135,-3.863079,1.449744,-0.094607,0.560016
136,1.356443,0.013333,3.162104,0.515915
245,-0.766604,3.27495,-0.586928,-0.243599
247,-0.095794,-3.206427,1.167684,0.941339
254,0.795129,0.010861,-0.279162,3.010606
392,-0.320554,-0.329598,0.280289,4.229807
667,-3.194463,0.257463,0.768994,-0.360105
677,0.915903,-0.145163,3.136314,-0.619483
754,-3.243369,-0.607739,2.411551,0.227206


In [117]:
# Capping the outlier values to 3
data[data.abs() > 3] = np.sign(data) * 3

In [119]:
data[(data.abs() == 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
78,-0.23964,-3.0,-0.329446,-0.904177
135,-3.0,1.449744,-0.094607,0.560016
136,1.356443,0.013333,3.0,0.515915
245,-0.766604,3.0,-0.586928,-0.243599
247,-0.095794,-3.0,1.167684,0.941339
254,0.795129,0.010861,-0.279162,3.0
392,-0.320554,-0.329598,0.280289,3.0
667,-3.0,0.257463,0.768994,-0.360105
677,0.915903,-0.145163,3.0,-0.619483
754,-3.0,-0.607739,2.411551,0.227206


In [121]:
# Permutations
df = pd.DataFrame(np.arange(5*7).reshape((5, 7)))

In [123]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [131]:
sampler = np.random.permutation(5)

In [133]:
sampler

array([2, 0, 1, 4, 3])

In [135]:
# Sampling
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
4,28,29,30,31,32,33,34
3,21,22,23,24,25,26,27


In [136]:
df = pd.DataFrame({"key": ['b', 'b', 'a', 'c', 'a', 'b'],
                   "data1": range(6)})

In [139]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [141]:
pd.get_dummies(df["key"])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [142]:
s = pd.Series(list('abca'))

In [144]:
s

0    a
1    b
2    c
3    a
dtype: object

In [146]:
pd.get_dummies(s)

Unnamed: 0,a,b,c
0,True,False,False
1,False,True,False
2,False,False,True
3,True,False,False


In [147]:
s1 = ['a', 'b', np.nan]

In [148]:
pd.get_dummies(s1)

Unnamed: 0,a,b
0,True,False
1,False,True
2,False,False


In [153]:
pd.get_dummies(s1,
               dummy_na=True,
              prefix="key",
              prefix_sep = "-",
              )

Unnamed: 0,key-a,key-b,key-nan
0,True,False,False
1,False,True,False
2,False,False,True


In [154]:
s1 = pd.Series(["a|b", "a", "a|c"])

In [156]:
s1

0    a|b
1      a
2    a|c
dtype: object

In [157]:
s1.str.get_dummies()

Unnamed: 0,a,b,c
0,1,1,0
1,1,0,0
2,1,0,1
