In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
print(pd.__version__)

0.24.2


In [2]:
print(pd.__doc__)


pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data.
  - Size mutability: columns can be inserted and deleted from DataFrame and
    higher dimensional objects
  - Automatic and explicit data alignment: objects can be explicitly aligned
    to a set of labels, or the user can simply ignore the labels and

### Data Structures in Pandas 

1. One Dimenstional     - Series
2. Multi Dimenstional   - DataFrame

In [3]:
a_array = np.arange(10, 90, 10)

In [4]:
a_array

array([10, 20, 30, 40, 50, 60, 70, 80])

In [5]:
a_series = pd.Series(a_array)

In [6]:
a_series

0    10
1    20
2    30
3    40
4    50
5    60
6    70
7    80
dtype: int32

In [7]:
a_series.shape

(8,)

In [8]:
a_series.size

8

In [9]:
list(a_series.index)

[0, 1, 2, 3, 4, 5, 6, 7]

In [10]:
a_series.index = ["a", "b", "c", "d", "e", "f", "g", "h"]

In [11]:
a_series

a    10
b    20
c    30
d    40
e    50
f    60
g    70
h    80
dtype: int32

In [12]:
a_series.index = range(1, 9)

In [13]:
a_series

1    10
2    20
3    30
4    40
5    50
6    60
7    70
8    80
dtype: int32

In [14]:
type(a_series)

pandas.core.series.Series

In [15]:
a_series[a_series > 50]

6    60
7    70
8    80
dtype: int32

In [16]:
a_series[a_series <  40]

1    10
2    20
3    30
dtype: int32

In [17]:
a_series * 10

1    100
2    200
3    300
4    400
5    500
6    600
7    700
8    800
dtype: int32

### Lambda

In [18]:
# one lined function

In [19]:
a_series.apply(lambda x : x +10)

1    20
2    30
3    40
4    50
5    60
6    70
7    80
8    90
dtype: int64

In [24]:
def a_func(x):
    x = x * 10
    return x

In [25]:
for i in [10, 20, 30]:
    print(a_func(i))

100
200
300


In [26]:
a_lambda = lambda x: x*10

In [27]:
a_series.apply(a_lambda)

1    100
2    200
3    300
4    400
5    500
6    600
7    700
8    800
dtype: int64

### Map

In [28]:
list(map(a_func, [10, 20, 30]))

[100, 200, 300]

In [29]:
list(map(lambda x: x*10, [10, 20, 30]))

[100, 200, 300]

### Filter

In [30]:
list(filter(lambda x: x < 10, [9, 8, 7, 10, 11, 12]))

[9, 8, 7]

### Apply 

In [31]:
a_series.apply(lambda x: x*10)

1    100
2    200
3    300
4    400
5    500
6    600
7    700
8    800
dtype: int64

In [32]:
a_series.dtype

dtype('int32')

In [33]:
a_series.cumsum()

1     10
2     30
3     60
4    100
5    150
6    210
7    280
8    360
dtype: int32

In [34]:
a_series

1    10
2    20
3    30
4    40
5    50
6    60
7    70
8    80
dtype: int32

In [35]:
a_series.diff()

1     NaN
2    10.0
3    10.0
4    10.0
5    10.0
6    10.0
7    10.0
8    10.0
dtype: float64

In [36]:
a_series = a_series.append(pd.Series([90, np.NaN, np.nan, 120]))

In [46]:
a_series

3     30.0
4     40.0
5     50.0
6     60.0
7     70.0
8     80.0
0     90.0
1      NaN
2      NaN
3    120.0
0    130.0
1     40.0
2     50.0
3    160.0
dtype: float64

In [48]:
a_series.drop([0])

3     30.0
4     40.0
5     50.0
6     60.0
7     70.0
8     80.0
1      NaN
2      NaN
3    120.0
1     40.0
2     50.0
3    160.0
dtype: float64

In [44]:
del a_series[0]

  self.values = np.delete(self.values, loc, 0)
  self.mgr_locs = self.mgr_locs.delete(loc)
  return self._shallow_copy(np.delete(self._data, loc))


In [45]:
a_series

3     30.0
4     40.0
5     50.0
6     60.0
7     70.0
8     80.0
0     90.0
1      NaN
2      NaN
3    120.0
0    130.0
1     40.0
2     50.0
3    160.0
dtype: float64

In [41]:
a_series.dropna().index.tolist()

Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 0, 3, 0, 1, 2, 3], dtype='int64')

In [38]:
a_series.dropna()

1     10.0
2     20.0
3     30.0
4     40.0
5     50.0
6     60.0
7     70.0
8     80.0
0     90.0
3    120.0
dtype: float64

In [39]:
a_series = a_series.append(pd.Series([130, 40, 50, 160]))

In [40]:
a_series

1     10.0
2     20.0
3     30.0
4     40.0
5     50.0
6     60.0
7     70.0
8     80.0
0     90.0
1      NaN
2      NaN
3    120.0
0    130.0
1     40.0
2     50.0
3    160.0
dtype: float64

In [35]:
a_series.drop_duplicates().dropna()

1     10.0
2     20.0
3     30.0
4     40.0
5     50.0
6     60.0
7     70.0
8     80.0
0     90.0
3    120.0
0    130.0
3    160.0
dtype: float64

In [36]:
a_series

1     10.0
2     20.0
3     30.0
4     40.0
5     50.0
6     60.0
7     70.0
8     80.0
0     90.0
1      NaN
2      NaN
3    120.0
0    130.0
1     40.0
2     50.0
3    160.0
dtype: float64

In [37]:
a_series.reset_index(drop=True)

0      10.0
1      20.0
2      30.0
3      40.0
4      50.0
5      60.0
6      70.0
7      80.0
8      90.0
9       NaN
10      NaN
11    120.0
12    130.0
13     40.0
14     50.0
15    160.0
dtype: float64

In [38]:
a_series.dropna().astype(int)

1     10
2     20
3     30
4     40
5     50
6     60
7     70
8     80
0     90
3    120
0    130
1     40
2     50
3    160
dtype: int32

In [39]:
np.NaN, np.inf, np.nan # Non Finite Values

(nan, inf, nan)

In [40]:
type(np.pi) # Float

float

In [41]:
int(np.pi)

3

In [42]:
# Null and Nan
# Null/None - there is no value
# NAN - Not A Number

In [43]:
a_series

1     10.0
2     20.0
3     30.0
4     40.0
5     50.0
6     60.0
7     70.0
8     80.0
0     90.0
1      NaN
2      NaN
3    120.0
0    130.0
1     40.0
2     50.0
3    160.0
dtype: float64

In [44]:
a_series = a_series.append(pd.Series([170, 180, None, None, 190, 200.20]))

In [45]:
a_series

1     10.0
2     20.0
3     30.0
4     40.0
5     50.0
6     60.0
7     70.0
8     80.0
0     90.0
1      NaN
2      NaN
3    120.0
0    130.0
1     40.0
2     50.0
3    160.0
0    170.0
1    180.0
2      NaN
3      NaN
4    190.0
5    200.2
dtype: float64

In [46]:
a_series.isna().sum()

4

In [47]:
a_series.isnull().sum()

4

In [48]:
a_series.append(pd.Series(["a"]))

1       10
2       20
3       30
4       40
5       50
6       60
7       70
8       80
0       90
1      NaN
2      NaN
3      120
0      130
1       40
2       50
3      160
0      170
1      180
2      NaN
3      NaN
4      190
5    200.2
0        a
dtype: object

In [49]:
a_series = a_series.reset_index(drop=True)

In [50]:
a_series.interpolate()

0      10.000000
1      20.000000
2      30.000000
3      40.000000
4      50.000000
5      60.000000
6      70.000000
7      80.000000
8      90.000000
9     100.000000
10    110.000000
11    120.000000
12    130.000000
13     40.000000
14     50.000000
15    160.000000
16    170.000000
17    180.000000
18    183.333333
19    186.666667
20    190.000000
21    200.200000
dtype: float64

In [51]:
a_series.fillna(-999)

0      10.0
1      20.0
2      30.0
3      40.0
4      50.0
5      60.0
6      70.0
7      80.0
8      90.0
9    -999.0
10   -999.0
11    120.0
12    130.0
13     40.0
14     50.0
15    160.0
16    170.0
17    180.0
18   -999.0
19   -999.0
20    190.0
21    200.2
dtype: float64

In [52]:
a_series.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x20808f05d30>

In [53]:
a_series

0      10.0
1      20.0
2      30.0
3      40.0
4      50.0
5      60.0
6      70.0
7      80.0
8      90.0
9       NaN
10      NaN
11    120.0
12    130.0
13     40.0
14     50.0
15    160.0
16    170.0
17    180.0
18      NaN
19      NaN
20    190.0
21    200.2
dtype: float64

In [54]:
a_series.argmax(), a_series.argmin()

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  """Entry point for launching an IPython kernel.
The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.
  """Entry point for launching an IPython kernel.


(21, 0)

In [55]:
a_series.sort_values().reset_index(drop=True)

0      10.0
1      20.0
2      30.0
3      40.0
4      40.0
5      50.0
6      50.0
7      60.0
8      70.0
9      80.0
10     90.0
11    120.0
12    130.0
13    160.0
14    170.0
15    180.0
16    190.0
17    200.2
18      NaN
19      NaN
20      NaN
21      NaN
dtype: float64

In [56]:
a_series.max(), a_series.min()

(200.2, 10.0)

In [57]:
a_series.mean()

93.9

In [58]:
a_series.std()

63.37111050832882

In [59]:
a_series[a_series > 50]

5      60.0
6      70.0
7      80.0
8      90.0
11    120.0
12    130.0
15    160.0
16    170.0
17    180.0
20    190.0
21    200.2
dtype: float64

In [60]:
np.where(a_series > 100, "Good", "Bad")

array(['Bad', 'Bad', 'Bad', 'Bad', 'Bad', 'Bad', 'Bad', 'Bad', 'Bad',
       'Bad', 'Bad', 'Good', 'Good', 'Bad', 'Bad', 'Good', 'Good', 'Good',
       'Bad', 'Bad', 'Good', 'Good'], dtype='<U4')

In [61]:
a_series

0      10.0
1      20.0
2      30.0
3      40.0
4      50.0
5      60.0
6      70.0
7      80.0
8      90.0
9       NaN
10      NaN
11    120.0
12    130.0
13     40.0
14     50.0
15    160.0
16    170.0
17    180.0
18      NaN
19      NaN
20    190.0
21    200.2
dtype: float64

In [62]:
a_series[:10]

0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
5    60.0
6    70.0
7    80.0
8    90.0
9     NaN
dtype: float64

In [63]:
a_series[10:]

10      NaN
11    120.0
12    130.0
13     40.0
14     50.0
15    160.0
16    170.0
17    180.0
18      NaN
19      NaN
20    190.0
21    200.2
dtype: float64

In [64]:
a_series[5: 11]

5     60.0
6     70.0
7     80.0
8     90.0
9      NaN
10     NaN
dtype: float64

In [65]:
a_series[0:9]

0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
5    60.0
6    70.0
7    80.0
8    90.0
dtype: float64

In [66]:
a_series.index = np.random.randint(0, 100, 22)

In [67]:
a_series

99     10.0
63     20.0
94     30.0
14     40.0
37     50.0
82     60.0
75     70.0
55     80.0
78     90.0
86      NaN
56      NaN
95    120.0
56    130.0
12     40.0
47     50.0
74    160.0
53    170.0
47    180.0
28      NaN
35      NaN
16    190.0
36    200.2
dtype: float64

In [68]:
a_series[:10]

99    10.0
63    20.0
94    30.0
14    40.0
37    50.0
82    60.0
75    70.0
55    80.0
78    90.0
86     NaN
dtype: float64

In [69]:
pd.Series(list(filter(lambda x: x> 100, a_series)))

0    120.0
1    130.0
2    160.0
3    170.0
4    180.0
5    190.0
6    200.2
dtype: float64

In [70]:
a_series[a_series > 100]

95    120.0
56    130.0
74    160.0
53    170.0
47    180.0
16    190.0
36    200.2
dtype: float64

In [71]:
a_series.corr(a_series)

1.0

In [72]:
a_series.add_suffix("P_")

99P_     10.0
63P_     20.0
94P_     30.0
14P_     40.0
37P_     50.0
82P_     60.0
75P_     70.0
55P_     80.0
78P_     90.0
86P_      NaN
56P_      NaN
95P_    120.0
56P_    130.0
12P_     40.0
47P_     50.0
74P_    160.0
53P_    170.0
47P_    180.0
28P_      NaN
35P_      NaN
16P_    190.0
36P_    200.2
dtype: float64

In [73]:
a_series.

SyntaxError: invalid syntax (<ipython-input-73-aa53ce955f34>, line 1)

In [None]:
df = pd.DataFrame(data= {"Fruits": ["Apple", "Banana", "Cheery"],
                         "Sales": [100, 300, 50]})

In [None]:
df

In [None]:
# pd.read_csv
# pd.read_excel