Reference : https://wikidocs.net/32829

In [1]:
import os              # working directory 
import pandas as pd    # data preprocessing
import numpy as np     # data preprocessing
import math            # using mathematical functions
import seaborn as sns            # for visualization
import matplotlib.pyplot as plt  # for visualization
from plotly.offline import init_notebook_mode, iplot   # for visualization
from pandas.plotting import scatter_matrix             # for visualization
from IPython.display import display, HTML              # for html & display
import warnings  
%matplotlib inline

pd.options.display.max_columns = None   # None = no cap on displayed columns (a large cap such as 999 is the alternative)
pd.options.display.max_rows = None      # None = no cap on displayed rows (same alternative: 999)
pd.options.display.float_format = '{:.2f}'.format   # render floats with two decimal places
warnings.filterwarnings("ignore")       # silence every warning raised from here on
init_notebook_mode(connected=True)      # plotly offline notebook setup; NOTE(review): connected=True presumably loads plotly.js from a CDN - confirm

# for css style of dataframe
from IPython.display import HTML
import pandas as pd


#################### USING sticky header and index ####################
#### generate DataFrame ####
# >>> df


# set style
# CSS injected ahead of DataFrame.to_html() output: the wrapper div scrolls,
# while header cells and index cells stay pinned (position: sticky).
# NOTE: CSS only understands /* ... */ comments. A `//` "comment" is parsed as
# a malformed declaration and swallows everything up to the next `;`, which
# previously dropped `position: relative` from .dataframe-div.
style = """
    <style scoped>        
        .dataframe-div {
          max-height: 300px;
          overflow: auto;      /* scroll bar */
          position: relative;
        }
        
        .dataframe thead th {
          position: -webkit-sticky; /* for Safari */
          position: sticky;
          top: 0;
          background: black;
          color: white;
        }

        .dataframe thead th:first-child {
          left: 0;
          z-index: 1;
        }

        .dataframe tbody tr th:only-of-type {
                vertical-align: middle;
        }

        .dataframe tbody tr th {
          position: -webkit-sticky; /* for Safari */
          position: sticky;
          left: 0;
          background: black;
          color: white;
          vertical-align: top;
        }
    </style>
"""

# concatenate to single string
# >>> df = style + '<div class="dataframe-div">' + df.to_html() + "\n</div>"


# Displaying DataFrame
# >>> HTML(df)
#################################################################################


# Demo: build a tiny frame and render it inside the scrollable wrapper div
df = pd.DataFrame({
    'col_01': ['1a', '2b'],
    'col_02': ['a', 'b'],
    'col_03': [100, 200],
})
HTML(f'{style}<div class="dataframe-div">{df.to_html()}\n</div>')


#### usage ####
# df = 'DataFrame that you generated while coding'
# HTML(style + '<div class="dataframe-div">' + df.to_html() + "\n</div>")

Unnamed: 0,col_01,col_02,col_03
0,1a,a,100
1,2b,b,200


<hr>

# pandas

In [2]:
import pandas as pd

## 3 data structures in pandas
 - Series
 - DataFrame
 - Panel (deprecated and removed in pandas 0.25; not available in current versions)

## Series
 - consists of : index & value

In [4]:
# Series built from a label -> value mapping; the keys become the index
sr = pd.Series({"pizza": 17000, "chicken": 18000, "coke": 1000, "beer": 5000})
sr

pizza      17000
chicken    18000
coke        1000
beer        5000
dtype: int64

In [5]:
sr.values

array([17000, 18000,  1000,  5000], dtype=int64)

In [6]:
sr.index

Index(['pizza', 'chicken', 'coke', 'beer'], dtype='object')

In [7]:
print("value of series :", sr.values)
print("index of series :", sr.index)

value of series : [17000 18000  1000  5000]
index of series : Index(['pizza', 'chicken', 'coke', 'beer'], dtype='object')


## DataFrame
 - 2 dimensional data structure
 - consists of : columns & index & values
 - column : column-direction index
 - index : row-direction index

In [8]:
# Row labels and column labels first, then the 3 by 3 body they describe
index_input = ['one', 'two', 'three']
columns_input = ['A', 'B', 'C']
values_input = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

df = pd.DataFrame(data=values_input, columns=columns_input, index=index_input)
df

Unnamed: 0,A,B,C
one,1,2,3
two,4,5,6
three,7,8,9


In [9]:
df.index

Index(['one', 'two', 'three'], dtype='object')

In [10]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

#### Define DataFrame
 - using List

In [11]:
# Define DataFrame using List
# Each inner list is one row. The first field is written as a string
# ('1000', ...), so it is stored as text rather than as a number.

data = [
    ['1000', 'Kay', 90.72],
    ['1001', 'Jay', 78.09],
    ['1002', 'Dee', 98.43],
    ['1003', 'Zet', 64.19],
    ['1004', 'Pee', 81.30]
]
df = pd.DataFrame(data)   # no column names given, so columns are labelled 0, 1, 2
df

Unnamed: 0,0,1,2
0,1000,Kay,90.72
1,1001,Jay,78.09
2,1002,Dee,98.43
3,1003,Zet,64.19
4,1004,Pee,81.3


In [12]:
# set column name of dataframe
df = pd.DataFrame(data, columns=['no','name','score'])
df

Unnamed: 0,no,name,score
0,1000,Kay,90.72
1,1001,Jay,78.09
2,1002,Dee,98.43
3,1003,Zet,64.19
4,1004,Pee,81.3


In [13]:
# Define DataFrame using Dictionary
# Each key becomes a column name and its list supplies that column's values;
# every list here has six entries, one per row.

data = {'no' : ['1000', '1001', '1002', '1003', '1004', '1005'],
        'name' : ['Bee','Cee','Dee','Eee','Eff','Gee'],
        'score' : [90.72, 78.09, 98.43, 64.19, 81.30, 99.14]}
df = pd.DataFrame(data)   # rows get the default integer index 0..5
df

Unnamed: 0,no,name,score
0,1000,Bee,90.72
1,1001,Cee,78.09
2,1002,Dee,98.43
3,1003,Eee,64.19
4,1004,Eff,81.3
5,1005,Gee,99.14


Value Indexing in DataFrame
 - df.head()
 - df.tail()
 - df['column_name']

In [14]:
df.head(2)

Unnamed: 0,no,name,score
0,1000,Bee,90.72
1,1001,Cee,78.09


In [15]:
df.tail(2)

Unnamed: 0,no,name,score
4,1004,Eff,81.3
5,1005,Gee,99.14


In [16]:
df['no']

0    1000
1    1001
2    1002
3    1003
4    1004
5    1005
Name: no, dtype: object

### Reading data
 - pandas can read csv, txt, excel, sql, html, and json files

In [20]:
# df = pd.read_csv('test.csv')
# df

# numpy
 - good at dealing with numeric data
 - ndarray
    - core data structure of numpy package
    - used in linear algebra calculations

In [21]:
import numpy as np

#### Main modules of Numpy
 - np.array() : generate an ndarray from a list, tuple, or array (copies the input by default)
 - np.asarray() : convert the input to an ndarray; an existing ndarray is returned as-is, without copying
 - np.arange() : similar to range, but returns an ndarray
 - np.linspace(start, end, n) : generate n evenly spaced values over [start, end]
 - np.logspace(start, end, n) : generate n values evenly spaced on a log scale, from 10**start to 10**end

In [22]:
a = np.array([1,2,3,4,5])  # using list to make 1 dimension array
type(a)

numpy.ndarray

In [23]:
a

array([1, 2, 3, 4, 5])

In [24]:
b = np.array([[10,20,30], [60,70,80]])
b

array([[10, 20, 30],
       [60, 70, 80]])

In [25]:
print('dimension of array :', b.ndim)

dimension of array : 2


In [26]:
print('shape of array :', b.shape)

shape of array : (2, 3)


#### initializing ndarray

In [27]:
a = np.zeros((2,3)) # 2 by 3 array consists of 0 values
print(a)

[[0. 0. 0.]
 [0. 0. 0.]]


In [29]:
a = np.ones((2,3)) # 2 by 3 array consists of 1 values
print(a)

[[1. 1. 1.]
 [1. 1. 1.]]


In [32]:
a = np.full((2,2), 7) # 2 by 2 array consists of specific value (this case, 7)
print(a)

[[7 7]
 [7 7]]


In [35]:
a = np.eye(3)   # 3 by 3 identity matrix (a 2-D array): diagonal values : 1 , other values : 0
print(a)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [36]:
a = np.random.random((2,2)) # array with random values
print(a)

[[0.26336057 0.28110158]
 [0.22814615 0.82158025]]


#### np.arange()

In [3]:
a = np.arange(10)   # integers from 0 up to, but not including, 10
print(a)

[0 1 2 3 4 5 6 7 8 9]


In [4]:
a = np.arange(1, 10, 2)   # start at 1, stop before 10, step by 2
print(a)

[1 3 5 7 9]


#### reshape()

In [5]:
# np.arange already returns an ndarray, so it can be reshaped directly;
# wrapping it in np.array() only made an unnecessary copy.
a = np.arange(30).reshape(5, 6)   # values 0..29 laid out row by row as 5 rows x 6 columns
print(a)

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]]


#### Numpy Slicing

In [1]:
import numpy as np
a = np.array([[1,2,3],[4,5,6]])
a

array([[1, 2, 3],
       [4, 5, 6]])

In [2]:
b = a[0:2, 0:2]   # rows 0-1 and columns 0-1 (the slice end index is exclusive)
print(b)

[[1 2]
 [4 5]]


In [5]:
c = a[0,:]   # first row  
print(c)  

[1 2 3]


In [7]:
d = a[:,1]   # second column
print(d)

[2 5]


#### Numpy Calculation
 - calculation among arrays
 - default functions : add(), subtract(), multiply(), divide(), ...

In [2]:
x = np.array([1,2,3])
y = np.array([4,5,6])