# pandas: The Python Data Analysis Library

In [None]:
#pandas arises from the need to have a specific library to analyze data that provides, in the simplest possible way, 
#all the instruments for data processing, data extraction, and data manipulation.
#This Python package is designed on the basis of the NumPy library

#Install Pandas
#Jupyter 
pip install pandas
#Anaconda prompt
conda install pandas
conda update pandas

In [2]:
import pandas as pd
import numpy as np

The heart of pandas is its two primary data structures:
- Series
- DataFrame

# The Series

A Series is the pandas object designed to represent one-dimensional data structures; 
it is similar to an array but with some additional features.

## Declaring a Series

In [3]:
# No index is given here, so pandas falls back to the default labels:
# consecutive integers counting up from 0.
s = pd.Series(data=[12, -4, 7, 9])
s

0    12
1    -4
2     7
3     9
dtype: int64

In [6]:
# Build the Series with an explicit string label for every element.
s = pd.Series(data=[12, -4, 7, 9], index=list('abcd'))
s

a    12
b    -4
c     7
d     9
dtype: int64

In [7]:
# Selecting the internal elements
# Pick the third element by its position. `.iloc` is used because the index
# holds string labels: a bare integer key such as s[2] is deprecated as a
# positional lookup and is treated as a label by newer pandas releases.
s.iloc[2]

7

In [8]:
# Select a single element by its index label.
s.loc['b']

-4

In [9]:
# Slice by position: the first two elements (the stop position is excluded).
s.iloc[0:2]

a    12
b    -4
dtype: int64

In [10]:
# Assigning values to the elements
# Overwrite the second element by position. `.iloc` makes the positional intent
# explicit; with string labels, s[1] = 0 is deprecated and newer pandas releases
# interpret the key 1 as a label instead of a position.
s.iloc[1] = 0
s

a    12
b     0
c     7
d     9
dtype: int64

In [11]:
# Overwrite the element stored under the label 'b'.
s.loc['b'] = 1
s

a    12
b     1
c     7
d     9
dtype: int64

In [12]:
# Defining a Series from NumPy arrays and other Series
arr = np.array([1, 2, 3, 4])
# copy=False asks pandas to reuse the array's buffer instead of copying it, so
# later in-place changes to `arr` remain visible through `s3`. Newer pandas
# releases copy NumPy input by default, which would silently break that link.
s3 = pd.Series(arr, copy=False)
s3

0    1
1    2
2    3
3    4
dtype: int32

In [13]:
# Change the source array in place, then display s3 again to see whether the
# Series picked up the new value.
arr[2] = -2
s3

0    1
1    2
2   -2
3    4
dtype: int32

In [14]:
# Show the current contents of s.
s

a    12
b     1
c     7
d     9
dtype: int64

In [15]:
# Filtering values
# Print the full Series first, then keep only the elements greater than 8.
print(s)
s[s.gt(8)]

a    12
b     1
c     7
d     9
dtype: int64


a    12
d     9
dtype: int64

In [14]:
# Operations and mathematical functions
# Element-wise division of every value by 2.
s.div(2)

a    6.0
b    0.5
c    3.5
d    4.5
dtype: float64

In [15]:
# NumPy mathematical functions are called through the np namespace, with the
# Series instance passed as the argument.
np.log(s)

a    2.484907
b    0.000000
c    1.945910
d    2.197225
dtype: float64

In [16]:
# Evaluating values
# A Series whose index contains repeated labels.
serd = pd.Series(
    data=[1, 0, 2, 1, 2, 3],
    index=['white', 'white', 'blue', 'green', 'green', 'yellow'],
)
serd

white     1
white     0
blue      2
green     1
green     2
yellow    3
dtype: int64

In [17]:
# Distinct values of serd; the output below lists each one once.
serd.unique()

array([1, 0, 2, 3], dtype=int64)

In [18]:
# Number of occurrences of each distinct value.
serd.value_counts()

2    2
1    2
3    1
0    1
dtype: int64

In [19]:
# Membership test: True where an element is one of the listed values (0 or 3).
serd.isin([0,3])

white     False
white      True
blue      False
green     False
green     False
yellow     True
dtype: bool

In [20]:
# NaN (Not a Number) values
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
s2 = pd.Series([5, -3, np.nan, 14])
s2

0     5.0
1    -3.0
2     NaN
3    14.0
dtype: float64

In [21]:
# Flag the missing entries: True where the value is NaN.
s2.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [22]:
# Flag the present entries: True where the value is not NaN.
s2.notna()

0     True
1     True
2    False
3     True
dtype: bool

In [24]:
# Keep only the entries that are not NaN by indexing with the boolean mask.
s2.loc[s2.notna()]

0     5.0
1    -3.0
3    14.0
dtype: float64

In [17]:
# Series as dictionaries
# A Series can also be viewed as a dict-like object: the dictionary keys become
# the index labels and the dictionary values become the data.
mydict = {
    'red': 2000,
    'blue': 1000,
    'yellow': 500,
    'orange': 1000,
}
myseries = pd.Series(data=mydict)
myseries

red       2000
blue      1000
yellow     500
orange    1000
dtype: int64

In [18]:
# The index can also be supplied separately: labels missing from the dictionary
# ('green' here) appear in the result with a NaN value.
colors = ['red', 'yellow', 'orange', 'blue', 'green']
myseries = pd.Series(data=mydict, index=colors)
myseries

red       2000.0
yellow     500.0
orange    1000.0
blue      1000.0
green        NaN
dtype: float64

In [19]:
# Operations between Series
# A second Series built from its own dictionary of label -> value pairs.
mydict2 = {
    'red': 400,
    'yellow': 1000,
    'black': 700,
}
myseries2 = pd.Series(data=mydict2)

myseries2

red        400
yellow    1000
black      700
dtype: int64

In [21]:
# Show the current contents of myseries.
myseries

red       2000.0
yellow     500.0
orange    1000.0
blue      1000.0
green        NaN
dtype: float64

In [22]:
# Add the two Series; values are matched by label, and a label that lacks a
# value in either operand produces NaN in the result.
myseries.add(myseries2)

black        NaN
blue         NaN
green        NaN
orange       NaN
red       2400.0
yellow    1500.0
dtype: float64