In [3]:
import pandas as pd
import numpy as np

In [6]:
# Display the pandas help text. The trailing `?` is IPython/Jupyter
# introspection syntax rather than plain Python, so this line only works
# inside a notebook or IPython session.
pd?

Type:        module
String form: <module 'pandas' from 'C:\\Users\\Administrator\\anaconda3\\lib\\site-packages\\pandas\\__init__.py'>
File:        c:\users\administrator\anaconda3\lib\site-packages\pandas\__init__.py
Docstring:
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-flo

In [7]:
# Show the installed pandas version. It is the cell's last expression, so the
# notebook displays its value without an explicit print().
pd.__version__

'1.1.3'

In [11]:
"""
Pandas Object: Series
"""

# Create a series from an array
ser = pd.Series([0.25, 0.5, 0.75, 1.0]) # contructor method
print(ser)
print(type(ser), '\n')

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
<class 'pandas.core.series.Series'> 



In [12]:
# A Series exposes two main attributes: `values` (the data) and `index` (the labels)

arr, ind = ser.values, ser.index
print(arr)
print(ind)

[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


In [15]:
# Label-based indexing: pass explicit string labels instead of relying on the
# default integer index

ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))
ind = ser.index

print(ser)
print(ind)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
Index(['a', 'b', 'c', 'd'], dtype='object')


In [17]:
"""
Dictionary and Series
"""

dict = {'a' : 1, 2 : 'two', 'third' : True}
print(dict)

{'a': 1, 2: 'two', 'third': True}


In [22]:
"""
Create a series from dictionary
"""

population_dict = { 'California' : 38332521,
                   'Texas' : 26448193,
                   'New York' : 19651127,
                   'Florida' : 19552860,
                   'Illinois' : 12882135 }


population = pd.Series(population_dict)
print(population)

print(population['Texas':'Illinois'])

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
Texas       26448193
New York    19651127
Florida     19552860
Illinois    12882135
dtype: int64


In [24]:
"""
Pandas object : DataFrame
"""

area_dict = {'California' : 423967,
               'Texas' : 695662,
               'New York' : 141297,
               'Florida' : 170312,
               'Illinois' : 149995}

area = pd.Series(area_dict)
print(area)

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64


In [29]:
# Combine the `population` and `area` Series into one DataFrame; each
# dictionary key becomes a column name

states = pd.DataFrame(data={'population': population, 'area': area})

# One print call, newline-separated: the whole frame, its row labels,
# its column labels, and the 'population' column on its own
print(states, states.index, states.columns, states['population'], sep='\n')

            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64


In [31]:
# Construct a DataFrame from a 2D numpy array

# Draw the sample from a seeded generator so that "Restart Kernel -> Run All"
# reproduces the same numbers; unseeded np.random.rand() differs on every run.
arr = np.random.default_rng(seed=0).random((3, 2))
print(arr, '\n')

# Last expression of the cell, so the notebook renders it as a table.
# `columns` names the two array columns and `index` labels the three rows.
pd.DataFrame(arr, columns=['foo', 'bar'], index=['a', 'b', 'c'])

[[0.34880434 0.31506792]
 [0.63624238 0.35348771]
 [0.90298458 0.06098923]] 



Unnamed: 0,foo,bar
a,0.348804,0.315068
b,0.636242,0.353488
c,0.902985,0.060989


In [39]:
"""
Series object manipulation : dictionary-style
"""

ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a','b','c','d'])
print(ser['b'])

print('a' in ser)
print(0.25 in ser)

print(ser.keys())

0.5
True
False
Index(['a', 'b', 'c', 'd'], dtype='object')


In [40]:
"""
Series object manipulation: array-style
"""

ser['e'] = 1.25
ser['a'] = 0.125
print(ser)

print(ser['a':'c'])

a    0.125
b    0.500
c    0.750
d    1.000
e    1.250
dtype: float64
a    0.125
b    0.500
c    0.750
dtype: float64


In [42]:
"""
DataFrame object manipulation
"""

states['density'] = states['population'] / states['area']
print(states)

            population    area     density
California    38332521  423967   90.413926
Texas         26448193  695662   38.018740
New York      19651127  141297  139.076746
Florida       19552860  170312  114.806121
Illinois      12882135  149995   85.883763


In [47]:
# Indexers: loc (label-based) and iloc (position-based).
# The older `ix` indexer was removed in pandas 1.0, so it is not available here.

print(states.loc['New York': 'Illinois']) # explicit indexing: label slice, end label included
print(states.iloc[1:3]) # implicit indexing: integer positions, end position excluded

          population    area     density
New York    19651127  141297  139.076746
Florida     19552860  170312  114.806121
Illinois    12882135  149995   85.883763
          population    area     density
Texas       26448193  695662   38.018740
New York    19651127  141297  139.076746


In [50]:
# Masking plus fancy indexing in a single loc call: the boolean mask picks the
# rows, the list of names picks the columns
print(states.loc[states['density'] > 100, ['population', 'density']])

          population     density
New York    19651127  139.076746
Florida     19552860  114.806121
