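# Pandas intro

A hands-on tour of three topics: how NumPy represents missing and infinite values (`NaN`, `inf`, `None`), how `str` differs from `bytes` when hashing, and how a `pandas.Series` is indexed by label versus by position (`[]`, `.loc`, `.iloc`, `.ix`).

---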

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
import pandas as pd

In [4]:
# NumPy's "not a number" floating-point constant.
np.nan

nan

In [5]:
# NumPy's positive-infinity floating-point constant.
np.inf

inf

In [7]:
# np.info prints documentation for its argument; for np.nan that is the
# docstring of the built-in float type, as the output shows.
np.info(np.nan)

float(x) -> floating point number

Convert a string or number to a floating point number, if possible.


In [8]:
# np.nan is an ordinary Python float, not a special NumPy type.
type(np.nan)

float

In [9]:
# np.inf is an ordinary Python float as well.
type(np.inf)

float

In [10]:
# None as the last expression of a cell produces no output at all.
None

In [11]:
# None has its own dedicated type; it is not a float like NaN.
type(None)

NoneType

In [12]:
# NaN does not compare equal to itself, so == cannot be used to detect it.
np.nan == np.nan

False

In [13]:
# np.isnan is the reliable test for NaN.
np.isnan(np.nan)

True

In [14]:
# Mixing np.nan into integer literals makes NumPy choose a floating-point
# dtype for the whole array (visible in the repr shown by the next cell).
arr = np.array(
    [
        1,
        2,
        3,
        np.nan,  # missing-value marker
    ]
)

In [15]:
arr

array([  1.,   2.,   3.,  nan])

In [16]:
np.inf

inf

In [17]:
# Infinity, unlike NaN, does compare equal to itself.
np.inf == np.inf

True

In [18]:
# np.isinf is the dedicated test for infinite values.
np.isinf(np.inf)

True

In [19]:
arr

array([  1.,   2.,   3.,  nan])

In [20]:
# The array was built from integers plus one NaN, yet it is stored as float64.
arr.dtype

dtype('float64')

In [21]:
# Number of axes: this is a one-dimensional array.
arr.ndim

1

In [22]:
# The shape is a 1-tuple: a single axis holding 4 elements.
arr.shape

(4,)

In [23]:
# Wrap the array in a Series: same values, plus a default integer index (0..3).
pd.Series(arr)

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [24]:
# Keep the Series around under the name `sr`; no index is passed, so the
# labels default to 0..len(arr)-1.
sr = pd.Series(data=arr)

In [25]:
sr

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [26]:
# np.info also prints docstrings for non-NumPy objects; here it shows the
# pd.Series documentation (constructor parameters, index semantics).
np.info(pd.Series)

 Series()

One-dimensional ndarray with axis labels (including time series).

Labels need not be unique but must be any hashable type. The object
supports both integer- and label-based indexing and provides a host of
methods for performing operations involving the index. Statistical
methods from ndarray have been overridden to automatically exclude
missing data (currently represented as NaN)

Operations between Series (+, -, /, *, **) align values based on their
associated index values-- they need not be the same length. The result
index will be the sorted union of the two indexes.

Parameters
----------
data : array-like, dict, or scalar value
    Contains data stored in Series
index : array-like or Index (1d)
    Values must be unique and hashable, same length as data. Index
    object (or other iterable of same length as data) Will default to
    RangeIndex(len(data)) if not provided. If both a dict and index
    sequence are used, the index will override the keys found in the
    d

In [27]:
arr

array([  1.,   2.,   3.,  nan])

In [28]:
# `arr` is still a plain NumPy array; building a Series from it did not change it.
type(arr)

numpy.ndarray

In [29]:
sr

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [30]:
# `sr` is a pandas Series, a different type from the ndarray it was built from.
type(sr)

pandas.core.series.Series

In [31]:
# Positional access: element at position 0 of the array.
arr[0]

1.0

In [32]:
# A NumPy array has a fixed size: assigning past the end raises instead of
# growing the array. The error is the point of this cell, so catch and show it
# rather than letting it abort "Restart & Run All".
try:
    arr[5] = 7.5
except IndexError as exc:
    print(f"{type(exc).__name__}: {exc}")

IndexError: index 5 is out of bounds for axis 0 with size 4

In [33]:
# Look up 0 in the Series; with the default 0..3 index, label and position coincide.
sr[0]

1.0

In [34]:
# Unlike the ndarray, assigning to a label that is not in the index does not
# raise: the Series gains a new entry labelled 5 (see the next cell's output).
sr[5] = 7.5

In [35]:
# The index now jumps from 3 to 5: the keys are labels, not positions.
sr

0    1.0
1    2.0
2    3.0
3    NaN
5    7.5
dtype: float64

In [36]:
# Print the built-in help text for dict.
# NOTE(review): the output is long and is cut off below; consider trimming it before sharing.
help(dict)

Help on class dict in module builtins:

class dict(object)
 |  dict() -> new empty dictionary
 |  dict(mapping) -> new dictionary initialized from a mapping object's
 |      (key, value) pairs
 |  dict(iterable) -> new dictionary initialized as if via:
 |      d = {}
 |      for k, v in iterable:
 |          d[k] = v
 |  dict(**kwargs) -> new dictionary initialized with the name=value pairs
 |      in the keyword argument list.  For example:  dict(one=1, two=2)
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, key, /)
 |      True if D has a key k, else False.
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __getitem__(...)
 |      x.__getitem__(y) <==> x[y]
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __init__(self, /, *args, **kwargs)
 |

In [37]:
import hashlib

In [38]:
# Without a call, this only displays the function object behind hashlib.sha256.
hashlib.sha256

<function _hashlib.openssl_sha256>

In [39]:
hashlib.sha512

<function _hashlib.openssl_sha512>

In [40]:
hashlib.md5

<function _hashlib.openssl_md5>

In [41]:
# Hash functions operate on bytes, so passing a str raises TypeError.
# The error is the demonstration: catch and show it so that the rest of the
# notebook still runs under "Restart & Run All".
try:
    hashlib.sha256('string function')
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: Unicode-objects must be encoded before hashing

In [42]:
# The same text as a bytes literal is accepted; the result is a hash object,
# not yet the digest.
hashlib.sha256(b'string function')

<sha256 HASH object @ 0x7fdd53440da0>

In [43]:
'a'

'a'

In [44]:
# A quoted literal without a prefix is a str (text).
type('a')

str

In [45]:
# The b prefix makes the literal a bytes object (raw bytes) instead of a str.
type(b'a')

bytes

In [46]:
# A bytes literal may only contain ASCII characters, so b'स' is rejected when
# the source is compiled, before anything runs. Compile the literal from a
# string so the SyntaxError can be caught and shown without breaking the cell.
try:
    compile("b'स'", "<bytes-literal>", "eval")
except SyntaxError as exc:
    print(f"{type(exc).__name__}: {exc.msg}")

SyntaxError: bytes can only contain ASCII literal characters. (<ipython-input-46-ea60b0d1628a>, line 1)

In [47]:
'स'

'स'

In [49]:
# Turn the one-character string into bytes. UTF-8 is what a bare .encode()
# uses as well; it is spelled out here. The character needs three bytes.
'स'.encode('utf-8')

b'\xe0\xa4\xb8'

In [50]:
# Reverse direction: interpret the three bytes as UTF-8 (also the default codec
# of a bare .decode()) to get the original character back.
b'\xe0\xa4\xb8'.decode('utf-8')

'स'

In [51]:
# Non-ASCII text is encoded to bytes first and then hashed; the cell shows the
# hash object itself rather than a digest.
hashlib.sha256('लकसहदउकसजदउहक'.encode())

<sha256 HASH object @ 0x7fdd53415620>

In [52]:
# hexdigest() renders the SHA-256 digest as a hexadecimal string.
hashlib.sha256('लकसहदउकसजदउहक'.encode()).hexdigest()

'6ff25f1fa5bbf661da196c0022ebfd67d8ab6d1b3499e02a8e3c9dc0bbad73a3'

In [53]:
# Same input through MD5, which yields a shorter digest.
# NOTE(review): MD5 is considered cryptographically broken; fine for a demo,
# not for security-sensitive use.
hashlib.md5('लकसहदउकसजदउहक'.encode()).hexdigest()

'32c874b6b960598aff1b115199e0acaa'

In [54]:
# Same input through SHA-1.
# NOTE(review): SHA-1 is also no longer considered collision-resistant; demo use only.
hashlib.sha1('लकसहदउकसजदउहक'.encode()).hexdigest()

'1acb9225d94c276a3ada6c85c175617ff32c6605'

In [55]:
# Raw-string literal holding a regex-style pattern; with no backslashes in it,
# the r prefix makes no difference to the resulting string.
r'[a-z]+.*'

'[a-z]+.*'

In [56]:
# In a raw string the backslash is kept literally, so this is two characters
# (backslash and b); the repr shows the backslash doubled.
r'\b'

'\\b'

In [57]:
# The same two-character string, written in a normal literal by escaping the backslash.
'\\b'

'\\b'

In [61]:
# Fill the named placeholder {age} through str.format, passing the value by keyword.
age = 56
str.format('this is example {age}', age=age)

'this is example 56'

In [62]:
# Bind `age` for the f-string example that follows (same value as in the previous cell).
age = 56

In [63]:
# f-string: the expression inside the braces is evaluated and interpolated in place.
f'this is {age}'

'this is 56'

In [64]:
class T:
    """Empty placeholder class with no attributes or methods of its own.

    Instances are only used as arbitrary objects, e.g. as a Series label.
    """

In [65]:
# Use a fresh T instance as the label: the Series accepts it and appends a new
# entry keyed by that object (see the next cell's output).
sr[T()] = 8

In [66]:
# The index now mixes integers with a T instance, shown by its default repr.
sr

0                                        1.0
1                                        2.0
2                                        3.0
3                                        NaN
5                                        7.5
<__main__.T object at 0x7fdd533cc390>    8.0
dtype: float64

In [78]:
# Rebind `sr` to a fresh integer Series; no index is given, so labels default to 0..5.
sr = pd.Series(
    data=np.array([1, 4, 7, 88, 97, 54]),
)

In [79]:
sr

0     1
1     4
2     7
3    88
4    97
5    54
dtype: int64

In [80]:
# List every attribute name available on the Series.
# NOTE(review): the output is very long and is cut off below; consider trimming it before sharing.
dir(sr)

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_SLICEMAP',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_prepare__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__le__',
 '__len__',
 '__long__',
 '__lt__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',


In [85]:
# Series with an explicit, unordered integer index: the labels are 7, 9, 4, 2, 6.
# NOTE(review): this cell is numbered In [85] but sits before In [84], and the
# same Series is built again two cells further down; one of the two is redundant.
sr = pd.Series(
    data=np.array([99, 88, 77, 66, 55]),
    index=[7, 9, 4, 2, 6],
)

In [84]:
# Print the Series docstring again; the `index` parameter described there is
# the one used when building `sr` with custom labels.
np.info(pd.Series)

 Series()

One-dimensional ndarray with axis labels (including time series).

Labels need not be unique but must be any hashable type. The object
supports both integer- and label-based indexing and provides a host of
methods for performing operations involving the index. Statistical
methods from ndarray have been overridden to automatically exclude
missing data (currently represented as NaN)

Operations between Series (+, -, /, *, **) align values based on their
associated index values-- they need not be the same length. The result
index will be the sorted union of the two indexes.

Parameters
----------
data : array-like, dict, or scalar value
    Contains data stored in Series
index : array-like or Index (1d)
    Values must be unique and hashable, same length as data. Index
    object (or other iterable of same length as data) Will default to
    RangeIndex(len(data)) if not provided. If both a dict and index
    sequence are used, the index will override the keys found in the
    d

In [86]:
# Build `sr` with explicit integer labels that are neither sorted nor 0-based,
# so that label-based and position-based lookups give different answers.
sr = pd.Series(
    data=np.array([99, 88, 77, 66, 55]),
    index=[7, 9, 4, 2, 6],
)

In [87]:
sr

7    99
9    88
4    77
2    66
6    55
dtype: int64

In [88]:
# With an integer index, plain [] looks the key up as a label: 7 is the first
# label, so this returns 99.
sr[7]

99

In [89]:
# 0 is not one of the labels (7, 9, 4, 2, 6), and plain [] on an integer index
# does not fall back to position, so this raises KeyError. The error is the
# demonstration: catch and show it so "Restart & Run All" keeps going.
try:
    sr[0]
except KeyError as exc:
    print(f"{type(exc).__name__}: {exc}")

KeyError: 0

In [90]:
# Print the documentation of .loc, the purely label-based indexer.
np.info(sr.loc)

Purely label-location based indexer for selection by label.

``.loc[]`` is primarily label based, but may also be used with a
boolean array.

Allowed inputs are:

- A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
  interpreted as a *label* of the index, and **never** as an
  integer position along the index).
- A list or array of labels, e.g. ``['a', 'b', 'c']``.
- A slice object with labels, e.g. ``'a':'f'`` (note that contrary
  to usual python slices, **both** the start and the stop are included!).
- A boolean array.
- A ``callable`` function with one argument (the calling Series, DataFrame
  or Panel) and that returns valid output for indexing (one of the above)

``.loc`` will raise a ``KeyError`` when the items are not found.

See more at :ref:`Selection by Label <indexing.label>`


In [91]:
# Print the documentation of .iloc, the purely position-based indexer.
np.info(sr.iloc)

Purely integer-location based indexing for selection by position.

``.iloc[]`` is primarily integer position based (from ``0`` to
``length-1`` of the axis), but may also be used with a boolean
array.

Allowed inputs are:

- An integer, e.g. ``5``.
- A list or array of integers, e.g. ``[4, 3, 0]``.
- A slice object with ints, e.g. ``1:7``.
- A boolean array.
- A ``callable`` function with one argument (the calling Series, DataFrame
  or Panel) and that returns valid output for indexing (one of the above)

``.iloc`` will raise ``IndexError`` if a requested indexer is
out-of-bounds, except *slice* indexers which allow out-of-bounds
indexing (this conforms with python/numpy *slice* semantics).

See more at :ref:`Selection by Position <indexing.integer>`


In [92]:
sr

7    99
9    88
4    77
2    66
6    55
dtype: int64

In [93]:
sr[7]

99

In [95]:
# Explicit label lookup: the entry whose label is 7.
sr.loc[7]

99

In [96]:
# .iloc is positional: the Series has only 5 entries, so position 7 does not
# exist even though 7 is a valid label. The IndexError is the demonstration:
# catch and show it so "Restart & Run All" keeps going.
try:
    sr.iloc[7]
except IndexError as exc:
    print(f"{type(exc).__name__}: {exc}")

IndexError: single positional indexer is out-of-bounds

In [97]:
# Explicit positional lookup: the first entry, whatever its label (here label 7).
sr.iloc[0]

99

In [98]:
# Print the documentation of .ix, the mixed label/position indexer.
# NOTE(review): .ix was deprecated in pandas 0.20 and removed in 1.0, so this
# cell only runs on old pandas; confirm the target version.
np.info(sr.ix)

A primarily label-location based indexer, with integer position
fallback.

``.ix[]`` supports mixed integer and label based access. It is
primarily label based, but will fall back to integer positional
access unless the corresponding axis is of integer type.

``.ix`` is the most general indexer and will support any of the
inputs in ``.loc`` and ``.iloc``. ``.ix`` also supports floating
point label schemes. ``.ix`` is exceptionally useful when dealing
with mixed positional and label based hierachical indexes.

However, when an axis is integer based, ONLY label based access
and not positional access is supported. Thus, in such cases, it's
usually better to be explicit and use ``.iloc`` or ``.loc``.

See more at :ref:`Advanced Indexing <advanced>`.


In [99]:
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. On an integer index
# it was label-only, so `.loc` is the exact replacement, and 0 is not a label
# of `sr`. The KeyError is the demonstration: catch and show it so
# "Restart & Run All" keeps going.
try:
    sr.loc[0]
except KeyError as exc:
    print(f"{type(exc).__name__}: {exc}")

KeyError: 0

In [100]:
# A second Series whose labels are strings (digit characters, not integers),
# used to contrast label lookups with positional ones.
sr2 = pd.Series(
    data=[88, 99, 66, 55],
    index=['8', '9', '6', '5'],
)

In [101]:
sr2

8    88
9    99
6    66
5    55
dtype: int64

In [102]:
# The labels are strings, so the index has dtype object rather than an integer dtype.
sr2.index

Index(['8', '9', '6', '5'], dtype='object')

In [104]:
# Label lookup with the string key '8'.
sr2['8']

88

In [105]:
# With string labels, an integer key in plain [] is treated as a position here,
# giving the same value as .iloc[0] in the next cell.
# NOTE(review): newer pandas releases deprecate this positional fallback;
# prefer .iloc and confirm against the pandas version in use.
sr2[0]

88

In [106]:
# Explicit positional lookup: the first entry (label '8').
sr2.iloc[0]

88

In [107]:
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. On this
# string-labelled index it fell back to positional access for an integer key,
# which is exactly what `.iloc` does explicitly.
sr2.iloc[0]

88

In [None]:
# An empty Series built from a zero-length array.
# NOTE(review): the cell has no execution count (In [None]), so it appears not
# to have been run in the saved session.
srs = pd.Series(data=np.array([]))