In [1]:
!pip install pandas



In [2]:
import pandas as pd
import numpy as np

In [3]:
# Display the help document
# (IPython's trailing `?` shows the object's type, source file and docstring)
np?

Type:        module
String form: <module 'numpy' from 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\numpy\\__init__.py'>
File:        c:\programdata\anaconda3\lib\site-packages\numpy\__init__.py
Docstring:
NumPy
=====

Provides
  1. An array object of arbitrary homogeneous items
  2. Fast mathematical operations over arrays
  3. Linear Algebra, Fourier Transforms, Random Number Generation

How to use the documentation
----------------------------
Documentation is available in two forms: docstrings provided
with the code, and a loose standing reference guide, available from
`the NumPy homepage <https://www.scipy.org>`_.

We recommend exploring the docstrings using
`IPython <https://ipython.org>`_, an advanced Python shell with
TAB-completion and introspection capabilities.  See below for further
instructions.

The docstring examples assume that `numpy` has been imported as `np`::

  >>> import numpy as np

Code snippets are indicated by t

In [4]:
# Show the installed pandas version
# (a bare last expression is echoed as the cell's result, so no print is needed)
pd.__version__

'1.3.4'

In [8]:
"""
    Pandas Object: Series
"""
# Build a Series from a NumPy array of evenly spaced floats. No index is
# passed, so pandas attaches the default integer index 0, 1, 2, ...
quarter_steps = np.arange(0.25, 1.25, 0.25)
ser = pd.Series(quarter_steps)
print(ser)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64


In [9]:
# A Series keeps its data and its labels separately:
#   ser.values        -> the data as a NumPy array
#   ser.index         -> the index object holding the labels
#   ser.index.values  -> the labels as a NumPy array
for view in (ser.values, ser.index, ser.index.values):
    print(view)

[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)
[0 1 2 3]


In [16]:
# Label-based indexing: give the Series explicit string labels.
ser = pd.Series(np.arange(0.25, 1.25, 0.25), index=['a', 'b', 'c', 'd'])
print(ser, '\n')

# Positional access goes through .iloc. A bare integer key such as ser[1] on a
# string-labelled Series relies on a positional fallback that newer pandas
# releases deprecate, so it is spelled out explicitly here.
print(ser.iloc[1])
# Label access by the explicit index value.
print(ser['b'])

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64 

0.5
0.5


In [16]:
"""
Dictionary and Series
"""

# Named `mixed_dict` rather than `dict` so the built-in dict type is not
# shadowed for the rest of the kernel session.
mixed_dict = {'a': 1, 2: 'two', 'third': True}
print(mixed_dict)

# Keys become the index labels and values become the data; the mixed value
# types produce an object-dtype Series.
ser = pd.Series(mixed_dict)
print(ser)

{'a': 1, 2: 'two', 'third': True}
a           1
2         two
third    True
dtype: object


In [17]:
"""
Create a series from a dictionary
"""
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}

# The dictionary keys turn into the index labels, in insertion order.
population = pd.Series(population_dict)
print(population, '\n')

# A single label is looked up just like a dictionary key ...
print(population['Texas'], '\n')
# ... and a label slice works too, with the end label included.
print(population['California':'New York'])

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64 

26448193 

California    38332521
Texas         26448193
New York      19651127
dtype: int64


In [18]:
"""
    Pandas object: DataFrame
"""

# Area figure per state, keyed by state name.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [20]:
# Combine the 'population' and 'area' Series into one DataFrame: each
# dictionary key becomes a column name.
series_by_column = {'population': population, 'area': area}
states = pd.DataFrame(series_by_column)
print(states, '\n')

# The data as a 2-D NumPy array.
print(states.values, '\n')

# Row labels: the Index object, then its array form.
print(states.index)
print(states.index.values, '\n')

# Column labels: the Index object, then its array form.
print(states.columns)
print(states.columns.values)

            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995 

[[38332521   423967]
 [26448193   695662]
 [19651127   141297]
 [19552860   170312]
 [12882135   149995]] 

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
['California' 'Texas' 'New York' 'Florida' 'Illinois'] 

Index(['population', 'area'], dtype='object')
['population' 'area']


In [21]:
# Dictionary-style column access: selecting a single column returns it as a Series
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [23]:
# Construct a DataFrame from a 2D NumPy array
# Use a seeded generator so the random values are identical on every
# Restart & Run All instead of changing with each execution.
rng = np.random.default_rng(42)
arr = rng.random((3, 2))
print(arr)

[[0.27562793 0.00948565]
 [0.53256626 0.73528843]
 [0.32931722 0.54879762]]


In [25]:
# Wrap the 2-D array in a DataFrame, supplying explicit column names and row labels
pd.DataFrame(arr, columns=['col1','col2'], index=['a','b','c'])

Unnamed: 0,col1,col2
a,0.275628,0.009486
b,0.532566,0.735288
c,0.329317,0.548798


In [27]:
"""
Series object manipulation: dictionary-style
"""

labels = ['a', 'b', 'c', 'd']
ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=labels)

# Look a value up by its label.
print(ser['b'])

# `in` tests membership against the index labels, not the stored values.
print('a' in ser)
print(0.25 in ser)

# keys() hands back the index.
print(ser.keys())

# Assigning to a new label appends an entry; assigning to an existing label
# overwrites its value.
ser['e'] = 1.25
ser['a'] = 0.125
print(ser)

0.5
True
False
Index(['a', 'b', 'c', 'd'], dtype='object')
a    0.125
b    0.500
c    0.750
d    1.000
e    1.250
dtype: float64


In [41]:
"""
Series object manipulation: array-style
"""

# slicing by label
label_slice = ser['a':'c']
print(label_slice, '\n')

# masking with a boolean condition
in_range = (ser > 0.3) & (ser < 0.8)
print(ser[in_range], '\n')

# fancy indexing with a list of labels
picked = ['a', 'e']
print(ser[picked])

a    0.125
b    0.500
c    0.750
dtype: float64 

b    0.50
c    0.75
dtype: float64 

a    0.125
e    1.250
dtype: float64


In [28]:
"""
Caution: Slicing Series object using explicit/implicit indexing
"""

ser = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))

# explicit (label) slice: the stop label 'c' is included
print(ser['a':'c'])

# implicit (positional) slice: the stop position 2 is excluded
print(ser[0:2])

a    1
b    2
c    3
dtype: int64
a    1
b    2
dtype: int64


In [30]:
"""
DataFrame object manipulation
"""

# A column can be fetched with dictionary-style or attribute-style access.
print(states['area'], '\n')
print(states.area, '\n')

# Add a derived column: element-wise division of two existing columns.
population_col, area_col = states['population'], states['area']
states['density'] = population_col / area_col
states

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64 

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64 



Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [33]:
# Indexer: loc, iloc
# - With an integer index, a bare [1] is ambiguous: it could mean the row
#   whose label is 1 (the first row here) or the row at position 1 in
#   0, 1, 2 order (the second row).
ser = pd.Series(list('abc'), index=[1, 3, 5])
print(ser, '\n')

print(ser[1], '\n')   # avoid: ambiguous bare integer key (resolved by label here)

print(ser.loc[1:3], '\n')   # explicit indexing: by index (label) name
print(ser.iloc[1:3])   # implicit indexing: by position 0, 1, 2, ...

1    a
3    b
5    c
dtype: object 

a 

1    a
3    b
dtype: object 

3    b
5    c
dtype: object


In [36]:
"""
    DataFrame as two-dimensional array
"""

# Number of dimensions, shape, and the raw 2-D array behind the frame.
print(states.ndim)
print(states.shape)
print(states.values, '\n')

print("<transpose of a matrix>")
transposed = states.T
print(transposed, '\n')
# Transposing a second time restores the original row/column orientation.
print(transposed.T, '\n')

print("<loc iloc>")
# iloc: positional slice -> first three rows, first two columns.
print(states.iloc[:3, :2])
# loc: label slice -> rows up to and including 'Illinois', columns up to and
# including 'population'.
print(states.loc[:'Illinois', :'population'])

2
(5, 3)
[[3.83325210e+07 4.23967000e+05 9.04139261e+01]
 [2.64481930e+07 6.95662000e+05 3.80187404e+01]
 [1.96511270e+07 1.41297000e+05 1.39076746e+02]
 [1.95528600e+07 1.70312000e+05 1.14806121e+02]
 [1.28821350e+07 1.49995000e+05 8.58837628e+01]] 

<transpose of a matrix>
              California         Texas      New York       Florida  \
population  3.833252e+07  2.644819e+07  1.965113e+07  1.955286e+07   
area        4.239670e+05  6.956620e+05  1.412970e+05  1.703120e+05   
density     9.041393e+01  3.801874e+01  1.390767e+02  1.148061e+02   

                Illinois  
population  1.288214e+07  
area        1.499950e+05  
density     8.588376e+01   

            population      area     density
California  38332521.0  423967.0   90.413926
Texas       26448193.0  695662.0   38.018740
New York    19651127.0  141297.0  139.076746
Florida     19552860.0  170312.0  114.806121
Illinois    12882135.0  149995.0   85.883763 

<loc iloc>
            population    area
California    38332

In [41]:
# Masking and fancy indexing using the loc indexer:
# a boolean row mask combined with a list of column labels.
is_dense = states.density > 100
print(states.loc[is_dense, ['population', 'density']])

# DataFrame value changing:
# overwrite a single cell addressed by position (row 0, column 2).
states.iloc[0, 2] = 90
states

          population     density
New York    19651127  139.076746
Florida     19552860  114.806121


Unnamed: 0,population,area,density
California,38332521,423967,90.0
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763
