# [Intro to Data Structures](https://pandas.pydata.org/pandas-docs/version/0.17.0/dsintro.html#series)

# [Series](https://pandas.pydata.org/pandas-docs/version/0.17.0/dsintro.html#series)

In [3]:
import numpy as np
import pandas as pd

In [None]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

a   -0.179276
b    1.004520
c   -1.344587
d   -1.699669
e   -1.074553
dtype: float64

In [None]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [None]:
pd.Series(np.random.randn(5))

0   -0.787171
1    0.068485
2   -0.210245
3    1.436049
4    0.613123
dtype: float64

In [None]:
d = {'a' : 0., 'b' : 1., 'c' : 2.}
pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [None]:
pd.Series(d, index=['b', 'c', 'd', 'a'])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

From scalar value If data is a scalar value, an index must be provided. The value will be repeated to match the length of index



In [None]:
pd.Series(5, index=['a', 'b', 'c', 'd', 'e'])

a    5
b    5
c    5
d    5
e    5
dtype: int64

In [None]:
s

a   -0.179276
b    1.004520
c   -1.344587
d   -1.699669
e   -1.074553
dtype: float64

In [None]:
s[0]

-0.1792759755846804

In [None]:
s[:3]

a   -0.179276
b    1.004520
c   -1.344587
dtype: float64

In [None]:
s[s > s.median()]

a   -0.179276
b    1.004520
dtype: float64

In [None]:
s[[4,3,1]]

e   -1.074553
d   -1.699669
b    1.004520
dtype: float64

In [None]:
np.exp(s)

a    0.835875
b    2.730596
c    0.260647
d    0.182744
e    0.341450
dtype: float64

In [None]:
s['a'] == s[[0,]]

a    True
dtype: bool

In [None]:
s['a']

-0.1792759755846804

In [None]:
s['e']

-1.0745530446432012

In [None]:
s

a   -0.179276
b    1.004520
c   -1.344587
d   -1.699669
e   -1.074553
dtype: float64

In [None]:
'e' in s

True

In [None]:
'f' in s

False

In [None]:
s.get('f', np.nan)

nan

In [None]:
s+s

a   -0.358552
b    2.009040
c   -2.689173
d   -3.399339
e   -2.149106
dtype: float64

In [None]:
s*2

a   -0.358552
b    2.009040
c   -2.689173
d   -3.399339
e   -2.149106
dtype: float64

In [None]:
np.exp(s)

a    0.835875
b    2.730596
c    0.260647
d    0.182744
e    0.341450
dtype: float64

In [None]:
s[1:]

b    1.004520
c   -1.344587
d   -1.699669
e   -1.074553
dtype: float64

In [None]:
s[:-1]

a   -0.179276
b    1.004520
c   -1.344587
d   -1.699669
dtype: float64

In [None]:
s[1:] + s[:-1]

a         NaN
b    2.009040
c   -2.689173
d   -3.399339
e         NaN
dtype: float64

In [None]:
s = pd.Series(np.random.randn(5), name='something')
s

0   -0.769761
1   -0.779107
2   -0.745004
3   -0.567864
4    0.539211
Name: something, dtype: float64

In [None]:
s.name

'something'

# [DataFrame](https://pandas.pydata.org/pandas-docs/version/0.17.0/dsintro.html#dataframe)

In [None]:
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), 
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [None]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [None]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [None]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [None]:
df.columns

Index(['one', 'two'], dtype='object')

In [None]:
d = {'one' : [1., 2., 3., 4.],
     'two' : [4., 3., 2., 1.]}

pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [None]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [None]:
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')])
data

array([(0, 0., b''), (0, 0., b'')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [None]:
data[:] = [(1, 2., 'Hello'), (2, 3., "World")]
data

array([(1, 2., b'Hello'), (2, 3., b'World')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [None]:
pd.DataFrame(data)

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [None]:
pd.DataFrame(data, index=['first', 'second'])

Unnamed: 0,A,B,C
first,1,2.0,b'Hello'
second,2,3.0,b'World'


In [None]:
pd.DataFrame(data, columns=['C', 'A', 'B'])

Unnamed: 0,C,A,B
0,b'Hello',1,2.0
1,b'World',2,3.0


In [None]:
pd.DataFrame(data, index=['first', 'second'], columns=['C', 'A', 'B', 'D'])

Unnamed: 0,C,A,B,D
first,b'Hello',1,2.0,
second,b'World',2,3.0,


In [None]:
data2 = [{'a' : 1, 'b' : 2}, {'a' : 5, 'b' : 10, 'c' : 20}]

pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [None]:
pd.DataFrame(data2, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [None]:
pd.DataFrame(data2, columns=['a', 'b'])

Unnamed: 0,a,b
0,1,2
1,5,10


In [None]:
pd.DataFrame(
  {('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
  ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
  ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
  ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
  ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


In [None]:
data

array([(1, 2., b'Hello'), (2, 3., b'World')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [None]:
pd.DataFrame.from_records(data, index='C')

Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
b'Hello',1,2.0
b'World',2,3.0


In [None]:
# https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.DataFrame.from_items.html
pd.DataFrame.from_dict([('A', [1, 2, 3]), ('B', [4, 5, 6])])

Unnamed: 0,0,1
0,A,"[1, 2, 3]"
1,B,"[4, 5, 6]"


In [None]:
# pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], orient='index', columns=['one', 'two', 'three'])

In [None]:
# https://pandas.pydata.org/pandas-docs/version/0.17.0/dsintro.html#column-selection-addition-deletion
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [None]:
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [None]:
df['three'] = df['one'] * df['two']
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [None]:
del df['two']
three = df.pop('three')
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [None]:
three

a    1.0
b    4.0
c    9.0
d    NaN
Name: three, dtype: float64

In [None]:
df['foo'] = 'bar'
df

Unnamed: 0,one,flag,foo
a,1.0,False,bar
b,2.0,False,bar
c,3.0,True,bar
d,,False,bar


In [None]:
df['one_trunc'] = df['one'][:2]
df

Unnamed: 0,one,flag,foo,one_trunc
a,1.0,False,bar,1.0
b,2.0,False,bar,2.0
c,3.0,True,bar,
d,,False,bar,


In [None]:
df.insert(1, 'bar', df['one'])
df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


In [None]:
iris = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# assign always returns a copy of the data, leaving the original DataFrame untouched.
iris.assign(sepal_ratio = iris['sepal_width'] / iris['sepal_length']).head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_ratio
0,5.1,3.5,1.4,0.2,setosa,0.686275
1,4.9,3.0,1.4,0.2,setosa,0.612245
2,4.7,3.2,1.3,0.2,setosa,0.680851
3,4.6,3.1,1.5,0.2,setosa,0.673913
4,5.0,3.6,1.4,0.2,setosa,0.72


In [None]:
iris.assign(sepal_ratio = lambda x: (x['sepal_width'] / x['sepal_length'])).tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_ratio
145,6.7,3.0,5.2,2.3,virginica,0.447761
146,6.3,2.5,5.0,1.9,virginica,0.396825
147,6.5,3.0,5.2,2.0,virginica,0.461538
148,6.2,3.4,5.4,2.3,virginica,0.548387
149,5.9,3.0,5.1,1.8,virginica,0.508475


In [None]:
# iris.query('sepal_length > 5.0').assign(sepal_ratio = lambda x: x.sepal_width / x.sepal_length, sepal_ratio = lambda x: x.petal_width / x.petal_length).plot(kind='scatter', x='sepal_ratio', y='petal_ratio')

![ss](https://pandas.pydata.org/pandas-docs/version/0.17.0/_images/basics_assign.png)

In [None]:
# df.assign(C = lambda x: x['A'] + x['B']).assign(D = lambda x: x['A'] + x['C'])

<table border="1" class="docutils">
<colgroup>
<col width="50%">
<col width="33%">
<col width="17%">
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Operation</th>
<th class="head">Syntax</th>
<th class="head">Result</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>Select column</td>
<td><tt class="docutils literal"><span class="pre">df[col]</span></tt></td>
<td>Series</td>
</tr>
<tr class="row-odd"><td>Select row by label</td>
<td><tt class="docutils literal"><span class="pre">df.loc[label]</span></tt></td>
<td>Series</td>
</tr>
<tr class="row-even"><td>Select row by integer location</td>
<td><tt class="docutils literal"><span class="pre">df.iloc[loc]</span></tt></td>
<td>Series</td>
</tr>
<tr class="row-odd"><td>Slice rows</td>
<td><tt class="docutils literal"><span class="pre">df[5:10]</span></tt></td>
<td>DataFrame</td>
</tr>
<tr class="row-even"><td>Select rows by boolean vector</td>
<td><tt class="docutils literal"><span class="pre">df[bool_vec]</span></tt></td>
<td>DataFrame</td>
</tr>
</tbody>
</table>

In [None]:
df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


In [None]:
df.loc['b']

one            2.0
bar            2.0
flag         False
foo            bar
one_trunc      2.0
Name: b, dtype: object

In [None]:
df.iloc[2]

one           3.0
bar           3.0
flag         True
foo           bar
one_trunc     NaN
Name: c, dtype: object

In [4]:
df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,0.400039,0.617729,1.374557,-0.508661
1,0.113959,0.190525,-1.348401,-0.824795
2,-0.669504,0.378567,0.16388,0.580276
3,-0.7939,0.350789,0.002983,-0.3149
4,-0.766962,0.388381,-0.527093,-0.451364
5,1.106483,1.522864,0.466429,-0.966616
6,-0.218352,2.149964,-1.493201,-1.222822
7,1.351323,-0.201325,0.321702,0.519737
8,0.992758,-0.284728,-0.880578,-0.259262
9,-1.510088,-0.669669,0.183103,0.704404


In [5]:
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
df + df2

Unnamed: 0,A,B,C,D
0,-1.573819,1.811805,1.232235,
1,-0.39908,1.770911,-0.761849,
2,0.57225,0.446738,1.618381,
3,-1.886133,0.362286,-1.015956,
4,-1.293507,2.771374,-0.324106,
5,1.52177,-0.108381,0.71783,
6,-1.515903,1.797626,-3.35603,
7,,,,
8,,,,
9,,,,


In [7]:
df.iloc[0]

A    0.400039
B    0.617729
C    1.374557
D   -0.508661
Name: 0, dtype: float64

In [8]:
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-0.28608,-0.427204,-2.722957,-0.316134
2,-1.069543,-0.239163,-1.210677,1.088937
3,-1.193939,-0.26694,-1.371574,0.193761
4,-1.167001,-0.229348,-1.901649,0.057297
5,0.706444,0.905135,-0.908128,-0.457955
6,-0.618391,1.532234,-2.867758,-0.714161
7,0.951284,-0.819054,-1.052855,1.028398
8,0.592719,-0.902457,-2.255135,0.249399
9,-1.910127,-1.287398,-1.191454,1.213065


In [11]:
index = pd.date_range('1/1/2000', periods=36)
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08',
               '2000-01-09', '2000-01-10', '2000-01-11', '2000-01-12',
               '2000-01-13', '2000-01-14', '2000-01-15', '2000-01-16',
               '2000-01-17', '2000-01-18', '2000-01-19', '2000-01-20',
               '2000-01-21', '2000-01-22', '2000-01-23', '2000-01-24',
               '2000-01-25', '2000-01-26', '2000-01-27', '2000-01-28',
               '2000-01-29', '2000-01-30', '2000-01-31', '2000-02-01',
               '2000-02-02', '2000-02-03', '2000-02-04', '2000-02-05'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df = pd.DataFrame(np.random.randn(8, 3),
                  index = pd.date_range('1/1/2000', periods=8),
                  columns = list('ABC'),
                  )
df

Unnamed: 0,A,B,C
2000-01-01,0.14298,0.298441,0.376236
2000-01-02,1.414891,-0.774851,0.875617
2000-01-03,-2.570544,0.232319,-1.237126
2000-01-04,0.604815,0.533303,-0.162483
2000-01-05,-1.872393,0.100759,0.186198
2000-01-06,0.058534,-1.213358,0.733581
2000-01-07,-2.492105,-0.11506,1.412039
2000-01-08,-0.306748,-0.942421,-1.668813


In [13]:
type(df['A'])

pandas.core.series.Series

In [16]:
# df.sub(df['A'], axis=1) # This is preferred way to replicate this behavior.
df - df['A'] # This is now deprecated and will be removed in a future release.

Unnamed: 0,2000-01-01 00:00:00,2000-01-02 00:00:00,2000-01-03 00:00:00,2000-01-04 00:00:00,2000-01-05 00:00:00,2000-01-06 00:00:00,2000-01-07 00:00:00,2000-01-08 00:00:00,A,B,C
2000-01-01,,,,,,,,,,,
2000-01-02,,,,,,,,,,,
2000-01-03,,,,,,,,,,,
2000-01-04,,,,,,,,,,,
2000-01-05,,,,,,,,,,,
2000-01-06,,,,,,,,,,,
2000-01-07,,,,,,,,,,,
2000-01-08,,,,,,,,,,,


In [23]:
dfA = df.sub(df['A'], axis=0)
dfA

Unnamed: 0,A,B,C
2000-01-01,0.0,0.155461,0.233256
2000-01-02,0.0,-2.189741,-0.539274
2000-01-03,0.0,2.802863,1.333418
2000-01-04,0.0,-0.071512,-0.767297
2000-01-05,0.0,1.973152,2.058591
2000-01-06,0.0,-1.271892,0.675047
2000-01-07,0.0,2.377045,3.904144
2000-01-08,0.0,-0.635673,-1.362064


In [24]:
dfA * 5 + 2

Unnamed: 0,A,B,C
2000-01-01,2.0,2.777304,3.16628
2000-01-02,2.0,-8.948707,-0.69637
2000-01-03,2.0,16.014314,8.667088
2000-01-04,2.0,1.64244,-1.836487
2000-01-05,2.0,11.865761,12.292956
2000-01-06,2.0,-4.35946,5.375233
2000-01-07,2.0,13.885225,21.520721
2000-01-08,2.0,-1.178363,-4.810322


In [26]:
1/dfA

Unnamed: 0,A,B,C
2000-01-01,inf,6.432486,4.287136
2000-01-02,inf,-0.456675,-1.854345
2000-01-03,inf,0.356778,0.749953
2000-01-04,inf,-13.983674,-1.303276
2000-01-05,inf,0.506803,0.485769
2000-01-06,inf,-0.78623,1.481379
2000-01-07,inf,0.42069,0.256138
2000-01-08,inf,-1.573137,-0.73418


In [27]:
dfA ** 4

Unnamed: 0,A,B,C
2000-01-01,0.0,0.000584,0.00296
2000-01-02,0.0,22.99171,0.084574
2000-01-03,0.0,61.717355,3.161292
2000-01-04,0.0,2.6e-05,0.346621
2000-01-05,0.0,15.158013,17.958926
2000-01-06,0.0,2.616984,0.207651
2000-01-07,0.0,31.926374,232.329007
2000-01-08,0.0,0.16328,3.441839


In [28]:
df1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1]}, dtype=bool)
df1

Unnamed: 0,a,b
0,True,False
1,False,True
2,True,True


In [39]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


In [29]:
df2 = pd.DataFrame({'a' : [0, 1, 1], 'b' : [1, 1, 0]}, dtype=bool)
df2

Unnamed: 0,a,b
0,False,True
1,True,True
2,True,False


In [30]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [31]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [34]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [None]:
# https://pandas.pydata.org/pandas-docs/version/0.17.0/dsintro.html#transposing