In [1]:
import pandas as pd
import numpy as np

### Series Object

In [2]:
# A pandas Series is a one-dimensional array of indexed data. It can be created from a list or array.
data = pd.Series([0.25, 0.50, .75, 1.00])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
data.shape

(4,)

In [5]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [7]:
# We can also define an explicit index associated with the values
data1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
data1

a    1
b    2
c    3
d    4
dtype: int64

In [8]:
data1['b']

np.int64(2)

In [9]:
# Series as a specialised dictionary

population_dict = {
    'California': 123453,
    'Texas': 87561,
    'Boston': 908382,
    'New York': 907856
}

population = pd.Series(population_dict)
population

California    123453
Texas          87561
Boston        908382
New York      907856
dtype: int64

In [10]:
# A Series built from a dict keeps the dict's insertion order (pandas >= 0.23 on Python 3.6+);
# the keys are NOT sorted, as the output above shows (California, Texas, Boston, New York).
# Values are then looked up by key, just like in a dictionary.
population['California']

np.int64(123453)

### Dataframe Object

A DataFrame is a generalization of a NumPy array.
If a Series is an analog of a one-dimensional array with flexible indices, a DataFrame is an analog of a two-dimensional array with both 
flexible row indices and flexible column indices.

In [11]:
# A DataFrame can be thought of either as a generalization of a NumPy array, or as a specialization of a Python dictionary

area_dict = {
    'Texas': 695662,
    'Boston': 484000,
    'California': 423967, 
    'New Town': 23126
}

area = pd.Series(area_dict)
area

Texas         695662
Boston        484000
California    423967
New Town       23126
dtype: int64

In [12]:
states = pd.DataFrame(
    {
        'population': population, 
        'area': area
    }
)

states

Unnamed: 0,population,area
Boston,908382.0,484000.0
California,123453.0,423967.0
New Town,,23126.0
New York,907856.0,
Texas,87561.0,695662.0


In [13]:
states.index

Index(['Boston', 'California', 'New Town', 'New York', 'Texas'], dtype='object')

In [14]:
states.shape

(5, 2)

In [15]:
# DataFrame has a columns attribute, which is an index object holding the column labels
states.columns

Index(['population', 'area'], dtype='object')

In [16]:
states['area']

Boston        484000.0
California    423967.0
New Town       23126.0
New York           NaN
Texas         695662.0
Name: area, dtype: float64

#### A DataFrame can be constructed in several ways

In [17]:
# 1. Using Single Series object

pd.DataFrame(population, columns = ['Population'])

Unnamed: 0,Population
California,123453
Texas,87561
Boston,908382
New York,907856


In [18]:
# 2. From a list of dicts: every dict is one row, and its keys become the column labels
data = [
    {'a': n, 'b': 2 * n}
    for n in range(3)
]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [19]:
# 3. If some keys are missing, pandas fills them in with NaN ('Not a Number')
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [20]:
# 4. From a dictionary of Series objects
pd.DataFrame({'population': population, 'area': area})

Unnamed: 0,population,area
Boston,908382.0,484000.0
California,123453.0,423967.0
New Town,,23126.0
New York,907856.0,
Texas,87561.0,695662.0


In [21]:
# 5. From a two-dimensional NumPy array
# A seeded Generator is used so the "random" values are identical on every
# Restart & Run All; the unseeded np.random.rand gave different numbers each run.
rng = np.random.default_rng(42)
random_frame = pd.DataFrame(
    rng.random((3, 2)),          # 3 rows x 2 columns of floats in [0, 1)
    columns=['Col-1', 'Col-2'],
    index=['a', 'b', 'c'],
)
random_frame

Unnamed: 0,Col-1,Col-2
a,0.397367,0.618708
b,0.04485,0.559661
c,0.275182,0.863636


### Index Object

We have seen here that both the Series and DataFrame objects contain an explicit index that lets you reference and modify data. This Index object is an interesting structure in itself, and it can be thought of either as an immutable array or as an ordered set (technically a multiset, as Index objects may contain repeated values). Those views have some interesting consequences in the operations available on Index objects. As a simple example, let’s construct an Index from a list of integers

In [22]:
index = pd.Index([2, 3, 5, 7, 11])
index

Index([2, 3, 5, 7, 11], dtype='int64')

In [23]:
index[1]

np.int64(3)

In [24]:
index[::2]

Index([2, 5, 11], dtype='int64')

In [25]:
data = pd.Series([0.25,0.50,0.75,1],index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [26]:
'b' in data

True

In [27]:
data['b']

np.float64(0.5)

In [28]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [29]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [30]:
data['e'] = 1.25

In [31]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [32]:
data['a' : 'c'] # Slicing with the explicit index: "explicit" means we use the actual labels of the index (the end label 'c' is included)

a    0.25
b    0.50
c    0.75
dtype: float64

In [33]:
data[0:2] # slicing with implicit index, implicit means using numerical position or slicing

a    0.25
b    0.50
dtype: float64

In [34]:
data[(data > 0.3) & (data < 0.8)] # Masking

b    0.50
c    0.75
dtype: float64

### Indexers: loc and iloc (the legacy `ix` indexer was removed in pandas 1.0)

In [35]:
data = pd.Series(['a', 'b', 'c'], index = [1,3,5])
data

1    a
3    b
5    c
dtype: object

In [36]:
# explicit index when indexing
data[1]

'a'

In [37]:
# Plain slicing with integers uses the implicit positional index, so this returns
# positions 1 and 2 (labels 3 and 5), not labels 1 through 3
data[1:3]

3    b
5    c
dtype: object

In [38]:
# The loc attribute allows indexing and slicing that always references the explicit index
data.loc[1]

'a'

In [39]:
data.loc[1:3]

1    a
3    b
dtype: object

In [40]:
# iloc allows indexing and slicing that always references the implicit Python-style index

data.iloc[1:3]

3    b
5    c
dtype: object

In [41]:
data.iloc[1]

'b'

In [42]:
pop_area = pd.DataFrame({'pop': population, 'area': area})
pop_area

Unnamed: 0,pop,area
Boston,908382.0,484000.0
California,123453.0,423967.0
New Town,,23126.0
New York,907856.0,
Texas,87561.0,695662.0


In [43]:
pop_area['pop']

Boston        908382.0
California    123453.0
New Town           NaN
New York      907856.0
Texas          87561.0
Name: pop, dtype: float64

In [44]:
pop_area.area is pop_area['area']

True

In [45]:
# Attribute access is not safe for every column name: `pop` collides with the
# DataFrame.pop() method, so pop_area.pop is that method rather than the 'pop' column.
# Prefer dictionary-style access (pop_area['pop']) for columns.
pop_area.pop is pop_area['pop']

False

In [46]:
pop_area['density'] = pop_area['pop']/ pop_area['area']
pop_area

Unnamed: 0,pop,area,density
Boston,908382.0,484000.0,1.876822
California,123453.0,423967.0,0.291185
New Town,,23126.0,
New York,907856.0,,
Texas,87561.0,695662.0,0.125867


In [47]:
pop_area.values

array([[9.08382000e+05, 4.84000000e+05, 1.87682231e+00],
       [1.23453000e+05, 4.23967000e+05, 2.91185399e-01],
       [           nan, 2.31260000e+04,            nan],
       [9.07856000e+05,            nan,            nan],
       [8.75610000e+04, 6.95662000e+05, 1.25867160e-01]])

In [48]:
# Transpose the full DataFrame to swap rows with columns
pop_area.T

Unnamed: 0,Boston,California,New Town,New York,Texas
pop,908382.0,123453.0,,907856.0,87561.0
area,484000.0,423967.0,23126.0,,695662.0
density,1.876822,0.291185,,,0.125867


In [49]:
pop_area.values[0]

array([9.08382000e+05, 4.84000000e+05, 1.87682231e+00])

In [50]:
pop_area.iloc[:3, :1] # Row, Column

Unnamed: 0,pop
Boston,908382.0
California,123453.0
New Town,


In [51]:
pop_area.loc[:'New York', :'area']

Unnamed: 0,pop,area
Boston,908382.0,484000.0
California,123453.0,423967.0
New Town,,23126.0
New York,907856.0,


In [52]:
pop_area

Unnamed: 0,pop,area,density
Boston,908382.0,484000.0,1.876822
California,123453.0,423967.0,0.291185
New Town,,23126.0,
New York,907856.0,,
Texas,87561.0,695662.0,0.125867


In [53]:
pop_area.loc[pop_area.density > 1.5, ['pop', 'area']]

Unnamed: 0,pop,area
Boston,908382.0,484000.0


In [54]:
pop_area[pop_area.density > 1.5]

Unnamed: 0,pop,area,density
Boston,908382.0,484000.0,1.876822


### Handling Missing Data

A number of schemes have been developed to indicate the presence of missing data in a table or DataFrame. Generally, they revolve around one of two strategies: using a mask that globally indicates missing values, or choosing a sentinel value that indicates a missing entry.

**In the masking approach**, the mask might be an entirely separate Boolean array, or it may involve appropriation of one bit in the data representation to locally indicate the null status of a value.

**In the sentinel approach**, the sentinel value could be some data-specific convention, such as indicating a missing integer value with –9999 or some rare bit pattern, or it could be a more global convention, such as indicating a missing floating-point value with NaN (Not a Number), a special value which is part of the IEEE floating-point specification



In [55]:
value1 = np.array([1, None, 2, 3])
value1

array([1, None, 2, 3], dtype=object)

In [56]:
value2 = np.array([1, np.nan, 2, 3])
value2

array([ 1., nan,  2.,  3.])

In [57]:
type(value2)

numpy.ndarray

In [58]:
value2.dtype

dtype('float64')

In [59]:
#  You should be aware that NaN is a bit like a data virus—it infects any other object it touches. Regardless of the operation, the result of arithmetic with NaN will be another NaN:

1+np.nan

nan

In [60]:
value2.sum(), value2.max(), value2.min()

(np.float64(nan), np.float64(nan), np.float64(nan))

In [61]:

data = pd.Series([1, np.nan, None, 4, 8, 'world'])
data

0        1
1      NaN
2     None
3        4
4        8
5    world
dtype: object

In [62]:
# Pandas has several useful methods for detecting, removing, and replacing null data:
# isnull():: Generate a Boolean mask indicating missing values
# notnull():: Opposite of isnull()
# dropna():: Return a filtered version of the data
# fillna():: Return a copy of the data with missing values filled or imputed

In [63]:
data.isnull()

0    False
1     True
2     True
3    False
4    False
5    False
dtype: bool

In [64]:
data.notnull()

0     True
1    False
2    False
3     True
4     True
5     True
dtype: bool

In [65]:
data[data.notnull()]

0        1
3        4
4        8
5    world
dtype: object

In [66]:
data.dropna()

0        1
3        4
4        8
5    world
dtype: object

In [67]:
# 3x3 example frame: columns 0 and 1 each contain one missing value, column 2 has none
df = pd.DataFrame([[1, np.nan, 2], [2, 3, 5], [np.nan, 4, 6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [68]:
# Important: we cannot drop single values from a DataFrame; we can only drop complete rows or complete columns

df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [69]:
df.dropna(axis=1) # OR df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [70]:
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [71]:
# The default is how='any', such that any row or column (depending on the axis keyword) containing a null value will be dropped.
# You can also specify how='all', which will only drop rows/columns that are all null values:

In [72]:
df.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [73]:
df.dropna(axis=1, how='any')

Unnamed: 0,2
0,2
1,5
2,6


In [74]:
df.fillna(0)

Unnamed: 0,0,1,2,3
0,1.0,0.0,2,0.0
1,2.0,3.0,5,0.0
2,0.0,4.0,6,0.0


In [75]:
df.ffill()

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,


In [76]:
df.bfill()

Unnamed: 0,0,1,2,3
0,1.0,3.0,2,
1,2.0,3.0,5,
2,,4.0,6,
