In [3]:
import pandas as pd
import numpy as np

In [4]:
# Integer sample containing a tie (the value 4 appears twice) for the ranking demos.
obj = pd.Series(data=[-7, -5, 7, 4, 2, 0, 4])

In [5]:
obj

0   -7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [6]:
# Dense ranking: tied values share one rank and the next distinct value gets
# the immediately following rank (no gaps).
obj.rank(method="dense")

0    1.0
1    2.0
2    6.0
3    5.0
4    4.0
5    3.0
6    5.0
dtype: float64

In [7]:
# "first" breaks ties by order of appearance, so the two 4s receive
# consecutive ranks instead of a shared one.
obj.rank(method="first")

0    1.0
1    2.0
2    7.0
3    5.0
4    4.0
5    3.0
6    6.0
dtype: float64

In [8]:
# Small numeric frame; columns are declared in the order b, a, c.
frame = pd.DataFrame({
    "b": [4.3, 7, -3, 2],
    "a": [0, 1, 0, 1],
    "c": [-2, 5, 8, -2.5],
})

In [9]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [10]:
# Rank the values within each row (across the columns); with method="max",
# tied values would all take the highest rank of their group.
frame.rank(axis=1, method="max")

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [11]:
# Series whose index repeats the labels 'a' and 'b'.
obj = pd.Series(np.arange(5), index=list("aabbc"))

In [12]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [13]:
# False here: the labels 'a' and 'b' each occur twice in the index.
obj.index.is_unique

False

In [14]:
# 5x3 frame of standard-normal draws with a non-unique row index.
# A seeded Generator is used so the values are identical on every
# Restart & Run All; the previous unseeded np.random.standard_normal call
# produced a different frame on each execution.
df = pd.DataFrame(np.random.default_rng(42).standard_normal((5, 3)),
                  index=['a', 'a', 'b', 'b', 'c'])

In [15]:
df

Unnamed: 0,0,1,2
a,-0.487088,-0.198431,0.154839
a,0.166815,-1.581179,0.40118
b,-1.08011,-0.929436,0.13076
b,0.59119,0.004737,1.365296
c,-2.156101,1.975266,-0.092045


In [16]:
# Label "a" occurs twice in the index, so the selection is a two-row DataFrame.
df.loc["a"]

Unnamed: 0,0,1,2
a,-0.487088,-0.198431,0.154839
a,0.166815,-1.581179,0.40118


In [17]:
# Label "c" occurs once, so the selection collapses to a Series (one row).
df.loc["c"]

0   -2.156101
1    1.975266
2   -0.092045
Name: c, dtype: float64

In [18]:
# Frame with missing values: row "a" lacks column "two", row "c" is entirely NaN.
df = pd.DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=list("abcd"),
    columns=["one", "two"],
)

In [19]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [20]:
# Sum down each column; NaN entries are skipped (one = 1.4 + 7.1 + 0.75).
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [21]:
# Column totals, addressing the axis by name. NaN entries are skipped by
# default; passing skipna=False would make any column containing NaN sum to NaN.
df.sum(axis="index")

one    9.25
two   -5.80
dtype: float64

In [22]:
# Row totals. NaN entries are skipped by default, so the all-NaN row "c"
# sums to 0; passing skipna=False would propagate NaN instead.
df.sum(axis="columns")

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [23]:
# Index label at which each column reaches its maximum value.
df.idxmax()

one    b
two    d
dtype: object

In [24]:
# Pull just the "mean" row out of the summary-statistics table.
df.describe().loc['mean']

one    3.083333
two   -2.900000
Name: mean, dtype: float64

In [25]:
# Non-numeric data: the four-letter pattern a, a, b, c repeated four times
# (16 strings in total).
obj = pd.Series(4 * ['a', 'a', 'b', 'c'])

In [26]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [34]:
# Nine single-letter strings with repeats, used for unique/value_counts.
obj = pd.Series(list("cadaabbcc"))

In [31]:
# Distinct values, returned in order of first appearance (not sorted).
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [38]:
# Frequency of each distinct value, most frequent first.
# `obj` is already a Series, so it is used directly rather than being
# re-wrapped in pd.Series(...).
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [39]:
# Three integer columns (qu1..qu3) with five rows each.
data = pd.DataFrame(
    {
        "qu1": [1, 3, 4, 3, 4],
        "qu2": [2, 3, 1, 2, 3],
        "qu3": [1, 5, 2, 4, 4],
    }
)

In [41]:
data

Unnamed: 0,qu1,qu2,qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [42]:
# Count each distinct qu1 value, then order the result by the value itself
# rather than by frequency.
data["qu1"].value_counts().sort_index()

qu1
1    1
3    2
4    2
Name: count, dtype: int64

In [63]:
# Count how often each full (qu1, qu2, qu3) row combination occurs.
data.value_counts()

qu1  qu2  qu3
1    2    1      1
3    2    4      1
     3    5      1
4    1    2      1
     3    4      1
Name: count, dtype: int64

In [64]:
import sys

In [68]:
# Write only the qu1 and qu3 columns to standard output, pipe-delimited and
# without the row index.
data.to_csv(
    sys.stdout,
    sep="|",
    columns=["qu1", "qu3"],
    index=False,
)

qu1|qu3
1|1
3|5
4|2
3|4
4|4


In [69]:
import sqlite3

In [70]:
# DDL for the demo table: two text columns (a, b), one REAL (c), one INTEGER (d).
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER);"""

In [71]:
# Open (or create) the on-disk SQLite database file.
con = sqlite3.connect("mydata.sqlite")
# The database file persists between kernel sessions, so remove any table left
# behind by an earlier run. Without this, re-executing the notebook fails with
# "sqlite3.OperationalError: table test already exists", and rows inserted
# later would pile up on top of the old ones.
con.execute("DROP TABLE IF EXISTS test")
con.execute(query)
con.commit()

In [72]:
# Rows to insert; each tuple supplies the four columns a, b, c, d in order.
data = [
    ("Atlanta", "Georgia", 1.25, 6),
    ("Tallahassee", "Florida", 2.6, 3),
    ("Sacramento", "California", 1.7, 5),
]

In [75]:
# Parameterized insert: each "?" placeholder is bound to one element of a row tuple.
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
# Run the statement once per tuple in `data`.
con.executemany(stmt, data)
# Persist the inserted rows to the database file.
con.commit()

In [85]:
# Read the whole table back; fetchall() returns the rows as a list of tuples.
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()

In [86]:
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [82]:
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [87]:
# Build a frame from the fetched tuples. The column names come from the first
# element of each entry in cursor.description.
df = pd.DataFrame(
    rows,
    columns=[field[0] for field in cursor.description],
)

In [89]:
df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [94]:
import sqlalchemy as sqla

In [96]:
# SQLAlchemy engine pointing at the same SQLite file written above
# (relative path, resolved against the current working directory).
db = sqla.create_engine("sqlite:///mydata.sqlite")

In [97]:
# Let pandas run the query through the engine and build the DataFrame directly.
df = pd.read_sql("SELECT * FROM test", db)

In [99]:
df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [136]:
# Drop-duplicates demo frame: the last two rows ("two", 4) are identical.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

In [137]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [138]:
# data.duplicated()

In [139]:
# data = data.drop_duplicates()

In [140]:
# data

In [141]:
# Add a new column "v1" holding 0..6, one value per existing row.
data["v1"] = range(7)

In [142]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [144]:
# Treat rows as duplicates when their k2 values match, ignoring the other
# columns; the first row seen for each k2 value is the one kept.
data.drop_duplicates(subset=['k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [153]:
# Nine food items (several repeated) with their weights in ounces.
data = pd.DataFrame(
    {
        "food": [
            "bacon", "pulled pork", "bacon",
            "pastrami", "corned beef", "bacon",
            "pastrami", "honey ham", "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

In [154]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [150]:
# Lookup table from each food name to the animal it comes from.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "Salmon",
}

In [155]:
# Look up every food in meat_to_animal and store the result as a new column.
data['animal'] = data["food"].map(meat_to_animal)

In [157]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,Salmon


In [159]:
# Float series containing the values -999 and -1000, which the replace()
# examples treat as sentinels.
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, -1000.0, 3.0])

In [161]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5   -1000.0
6       3.0
dtype: float64

In [165]:
# Replace both sentinel values with NaN; returns a new Series, `data` is unchanged.
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    NaN
6    3.0
dtype: float64

In [166]:
# Pairwise replacement by position: -999 -> NaN, -1000 -> 0.
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    0.0
6    3.0
dtype: float64

In [167]:
# Dict form: each key is replaced by its value (-999 -> NaN, -1000 -> 0).
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    0.0
6    3.0
dtype: float64

In [169]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5   -1000.0
6       3.0
dtype: float64

In [179]:
# Binning and discretization: sample ages to be grouped into intervals.
# The first age is 20, matching the outputs recorded in this notebook. It had
# been edited to 18 after the categories were computed; 18 equals the lowest
# bin edge and, with right-closed intervals that exclude their left edge,
# would fall outside every bin on a fresh run.
ages = [20, 22, 25, 26, 21, 23, 37, 31, 61, 45, 41, 32]

In [171]:
# Bin edges; consecutive pairs define the intervals
# (18, 25], (25, 35], (35, 60], (60, 100].
bins = [18, 25, 35, 60, 100]

In [172]:
# Assign each age to one of the bins. Intervals are closed on the right and
# open on the left, so a value exactly equal to the lowest edge is not covered.
age_categories = pd.cut(ages, bins)

In [180]:
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [178]:
ages

[20, 22, 25, 26, 21, 23, 37, 31, 61, 45, 41, 32]