In [1]:
import pandas as pd
import numpy as np

In [3]:
# series: 1 dim (size immutable)
# dataframe: 2 dim
# panel: 3 dim (deprecated in pandas 0.20, removed in pandas 0.25)

### Series ###

In [13]:
"""
pd.Series(data, index, dtype, copy)
    data: data takes various forms like ndarray, list, constants
    index: Index values must be unique and hashable, same length as data. 
           Default np.arrange(n) if no index is passed.
    dtype: dtype is for data type. If None, data type will be inferred
    copy: Copy data. Default False
"""
s = pd.Series(dtype=object)
s

Series([], dtype: object)

In [23]:
# Build a Series from a NumPy array of one-character strings.
data = np.array(list('abcd'))

# With no index given, the values are labelled 0, 1, 2, 3.
s = pd.Series(data)
print(s)

# Same values, this time with custom integer labels.
custom_labels = [100, 101, 102, 103]
s = pd.Series(data, index=custom_labels)
print(s)

0    a
1    b
2    c
3    d
dtype: object
100    a
101    b
102    c
103    d
dtype: object


In [42]:
# copy=False: the Series is built on top of the float ndarray without copying it,
# so the write through s.iloc[0] below is also visible in `data`
# (the printed array shows 10.2 in the first slot).
# NOTE(review): the original note suggests this sharing was only observed for
# non-string ndarrays — confirm.
data = np.array([1.1, 2.2, 3.3, 4.4])
s = pd.Series(data, copy=False)
s.iloc[0] = 10.2
print(s)
print(data)

0    10.2
1     2.2
2     3.3
3     4.4
dtype: float64
[10.2  2.2  3.3  4.4]


In [50]:
# Build a Series from a dict: the keys become the index labels.
data = dict(a=0., b=1., c=2.)
s = pd.Series(data)
print(s)

# An explicit index selects and orders the entries; a label that is not a key
# of the dict ('d' here) gets NaN.
wanted_labels = ['a', 'b', 'c', 'd']
s = pd.Series(data, index=wanted_labels)
print(s)

a    0.0
b    1.0
c    2.0
dtype: float64
a    0.0
b    1.0
c    2.0
d    NaN
dtype: float64


In [52]:
# Build a Series from a scalar: the value is repeated once per index label.
row_labels = [0, 1, 2, 3]
s = pd.Series(5, index=row_labels)
s

0    5
1    5
2    5
3    5
dtype: int64

In [58]:
# Accessing data.
# An integer inside plain s[...] on a label-indexed Series relies on a positional
# fallback that newer pandas deprecates, so positions go through .iloc and
# labels go through .loc. The printed results are unchanged.
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print(s.iloc[1])               # second element, by position
print(s.iloc[:3])              # first three elements
print(s.iloc[-3:])             # last three elements
print(s.loc['d'])              # one element, by label
print(s.loc[['a', 'c', 'd']])  # several elements, by a list of labels


2
a    1
b    2
c    3
dtype: int64
c    3
d    4
e    5
dtype: int64
4
a    1
c    3
d    4
dtype: int64


### DataFrame ###

In [2]:
# Reference note on the DataFrame constructor. The string is the only expression
# in the cell, so it is echoed back as the cell output.
''' 
pandas.DataFrame(data, index, columns, dtype, copy)
    data: ndarray, series, map, lists, dict, constants
    index: row labels
'''

' \npandas.DataFrame(data, index, columns, dtype, copy)\n    data: ndarray, series, map, lists, dict, constants\n    index: row labels\n'

In [6]:
# An empty DataFrame: the printout shows no columns and no index.
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


In [12]:
# Create a DataFrame from a flat list: a single column labelled 0.
data = [1, 2, 3, 4, 5]
df = pd.DataFrame(data)
print(df)

# Create a DataFrame from a list of rows, naming the columns.
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
print(df)

# Convert only the numeric column to float.
# Passing dtype=float to the constructor would also try to cast the string
# column 'Name'; that is deprecated in pandas 1.3 and raises in pandas 2.x.
df = pd.DataFrame(data, columns=['Name', 'Age']).astype({'Age': float})
print(df)

   0
0  1
1  2
2  3
3  4
4  5
     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13
     Name   Age
0    Alex  10.0
1     Bob  12.0
2  Clarke  13.0


  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
# Create a DataFrame from a dict of equal-length lists: each key becomes a column.
names = ['Tom', 'Jack', 'Steve', 'Ricky']
ages = [28, 34, 29, 42]
data = {'Name': names, 'Age': ages}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,Tom,28
1,Jack,34
2,Steve,29
3,Ricky,42


In [20]:
# Same dict of lists (each key is one column), now with explicit row labels.
data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
rank_labels = ['rank1', 'rank2', 'rank3', 'rank4']
df = pd.DataFrame(data, index=rank_labels)
df

Unnamed: 0,Name,Age
rank1,Tom,28
rank2,Jack,34
rank3,Steve,29
rank4,Ricky,42


In [21]:
# Create a DataFrame from a list of dicts: each dict is one row.
# A key missing from a row (no 'c' in the first dict) shows up as NaN.
first_row = {'a': 1, 'b': 2}
second_row = {'a': 5, 'b': 10, 'c': 20}
data = [first_row, second_row]
df = pd.DataFrame(data)
df

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [22]:
# Set the index: same list of dicts (one per row), with explicit row labels.
first_row = {'a': 1, 'b': 2}
second_row = {'a': 5, 'b': 10, 'c': 20}
data = [first_row, second_row]
df = pd.DataFrame(data, index=['first', 'second'])
df

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [24]:
# Create a DataFrame from a dict of Series.
# Rows are aligned on the union of the Series indexes, so column 'one'
# has no value for label 'd' (NaN).
col_one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
col_two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': col_one, 'two': col_two}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [25]:
# Select a single column by name; the displayed result is a Series named 'one'.
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [30]:
# Column addition: store the element-wise sum of 'one' and 'two' as a new
# column 'three'. Row 'd' has no value in 'one', so its sum is missing too.
df['three'] = df['one'] + df['two']
df

Unnamed: 0,one,two,three
a,1.0,1,2.0
b,2.0,2,4.0
c,3.0,3,6.0
d,,4,


In [31]:
# Column deletion with `del`: removes column 'three' from df itself.
del df['three']
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [33]:
# Column deletion with pop(): removes column 'two' from df itself.
# The value returned by pop() is not used here.
df.pop('two')
df

Unnamed: 0,one
a,1.0
b,2.0
c,3.0
d,


In [35]:
# Row selection, addition, and deletion.
# Rebuild the two-column frame from a dict of Series; the shorter Series 'one'
# leaves NaN at label 'd'.
col_one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
col_two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': col_one, 'two': col_two}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [41]:
# Row selection by label: .loc['b'] gives the row as a Series (see the printed
# type) whose index is the column names.
print(type(df.loc['b']))
df.loc['b']

<class 'pandas.core.series.Series'>


one    2.0
two    2.0
Name: b, dtype: float64

In [42]:
# Row selection by integer position: position 2 is the row labelled 'c'.
df.iloc[2]

one    3.0
two    3.0
Name: c, dtype: float64

In [43]:
# Slice rows by position: [2:4] returns the rows at positions 2 and 3
# (labels 'c' and 'd').
df[2:4]

Unnamed: 0,one,two
c,3.0,3
d,,4


In [12]:
# Row addition: stack the rows of df2 underneath the rows of df.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent. The original row labels are kept,
# so the resulting index is 0, 1, 0, 1.
df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['a', 'b'])
df = pd.concat([df, df2])
df

Unnamed: 0,a,b
0,1,2
1,3,4
0,5,6
1,7,8


In [13]:
# Row deletion by label: the label 0 appears twice in the index, and
# drop(index=0) removes both of those rows, leaving only the rows labelled 1
# (compare the two printed indexes).
print(df.index)
df = df.drop(index=0)
print(df.index)
df

Int64Index([0, 1, 0, 1], dtype='int64')
Int64Index([1, 1], dtype='int64')


Unnamed: 0,a,b
1,3,4
1,7,8


### Panel ###

In [19]:
# Show which pandas version this notebook ran against.
print(pd.__version__)
# Panel has been removed from pandas.

1.3.1


In [None]:
# pandas.Panel(data, items, major_axis, minor_axis, dtype, copy)
# deprecated in pandas 0.20 and removed in pandas 0.25

In [20]:
# A random 3-D array of shape (2, 4, 5).
# No seed is set in this cell, so the printed values may differ between runs.
data = np.random.rand(2,4,5)
print(data)
# Left commented out: the Panel constructor call is not executed.
#p = pd.Panel(data)

[[[0.88734208 0.06838684 0.85097435 0.06880823 0.50740681]
  [0.53391734 0.48904843 0.80451022 0.19523583 0.98792353]
  [0.88117797 0.96723983 0.57660127 0.86192481 0.89825808]
  [0.0839457  0.4874043  0.31584979 0.90837376 0.402868  ]]

 [[0.49019733 0.97867174 0.59028147 0.58462989 0.54630031]
  [0.56666222 0.70302267 0.05812085 0.88207743 0.48650731]
  [0.2712874  0.91200067 0.5440278  0.81565483 0.03874466]
  [0.91817501 0.35323288 0.79801686 0.24815126 0.23603052]]]


### Basic Functionality ###

In [2]:
# A Series of 4 random floats with the default integer index.
s = pd.Series(np.random.randn(4))

In [5]:
# Basic Series attributes and methods, each printed under its own label.
print("axes\n", s.axes)      # list holding the row index
print("dtype\n", s.dtype)    # data type of the values
print("empty\n", s.empty)    # whether the Series has no elements
print("ndim\n", s.ndim)      # number of dimensions (1 for a Series)
print("size\n", s.size)      # number of elements
print("values\n", s.values)  # the underlying data as an array
# s holds only 4 elements, so head() and tail() both print the whole Series.
print("head\n", s.head())
print("tail\n", s.tail())

axes
 [RangeIndex(start=0, stop=4, step=1)]
dtype
 float64
empty
 False
ndim
 1
size
 4
values
 [ 0.06504169 -1.02060985  0.21459751  0.70705839]
head
 0    0.065042
1   -1.020610
2    0.214598
3    0.707058
dtype: float64
tail
 0    0.065042
1   -1.020610
2    0.214598
3    0.707058
dtype: float64


In [9]:
# A small frame built from a dict of Series, used to inspect the per-column dtypes.
name_col = pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack'])
age_col = pd.Series([25, 26, 25, 23, 30, 29, 23])
rating_col = pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8])

d = {'Name': name_col, 'Age': age_col, 'Rating': rating_col}
df = pd.DataFrame(d)
df.dtypes

Name       object
Age         int64
Rating    float64
dtype: object

In [10]:
# Transpose: rows and columns are swapped, so the column names become the row labels.
df.T

Unnamed: 0,0,1,2,3,4,5,6
Name,Tom,James,Ricky,Vin,Steve,Smith,Jack
Age,25,26,25,23,30,29,23
Rating,4.23,3.24,3.98,2.56,3.2,4.6,3.8


### Descriptive Statistics ###

In [11]:
# Twelve-row frame (name, age, rating) used by the descriptive-statistics cells.
name_col = pd.Series([
    'Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack',
    'Lee', 'David', 'Gasper', 'Betina', 'Andres',
])
age_col = pd.Series([25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46])
rating_col = pd.Series(
    [4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8, 3.78, 2.98, 4.80, 4.10, 3.65]
)

d = {'Name': name_col, 'Age': age_col, 'Rating': rating_col}
df = pd.DataFrame(d)

In [13]:
# Sum down each column. For the string column 'Name' the "sum" is the
# concatenation of all names (see the output).
df.sum()

Name      TomJamesRickyVinSteveSmithJackLeeDavidGasperBe...
Age                                                     382
Rating                                                44.92
dtype: object

In [15]:
# Sum across each row (axis=1) using the numeric columns only,
# i.e. Age + Rating for every row.
df.sum(axis=1, numeric_only=True)

0     29.23
1     29.24
2     28.98
3     25.56
4     33.20
5     33.60
6     26.80
7     37.78
8     42.98
9     34.80
10    55.10
11    49.65
dtype: float64

In [19]:
"""
count()
sum()
mean()
median()
mode()
std()
min()
max()
abs()
prod(): product of values
cumsum(): cumulative sum
cumprod(): cumulative product
"""

'\ncount()\nsum()\nmean()\nmedian()\nmode()\nstd()\nmin()\nmax()\nabs()\nprod(): product of values\ncumsum(): cumulative sum\ncumprod(): cumulative product\n'

In [23]:
# Summarizing data.
# df.describe() # excludes character column
# include=['object'] restricts the summary to object-dtype columns, which here
# is just 'Name' (count / unique / top / freq).
df.describe(include=['object'])

Unnamed: 0,Name
count,12
unique,12
top,Tom
freq,1


In [24]:
# include='all' summarizes every column in one table; statistics that do not
# apply to a column (e.g. mean of 'Name', unique of 'Age') are left missing.
df.describe(include='all')

Unnamed: 0,Name,Age,Rating
count,12,12.0,12.0
unique,12,,
top,Tom,,
freq,1,,
mean,,31.833333,3.743333
std,,9.232682,0.661628
min,,23.0,2.56
25%,,25.0,3.23
50%,,29.5,3.79
75%,,35.5,4.1325


### Function Application ###