This file summarizes basic usage of the pandas module, collected from courses and some online sources.

[dataquest](https://www.dataquest.io)

# Pandas
Pandas is a library that unifies the most common workflows that data analysts and data scientists previously relied on many different libraries for. 

To represent tabular data, pandas uses a custom 2-dimensional data structure called a **dataframe** (collections of Series objects).

The **series** (collections of values) object is a core data structure that pandas uses to represent rows and columns.

- Pandas can store mixed data types in rows and columns.

- Pandas dataframes can handle missing values using a custom object, NaN, to represent those values.

- Pandas dataframes contain axis labels for both rows and columns and enable you to refer to elements in the dataframe more intuitively.

## 1. Series and dataframe initialization

In [114]:
import pandas as pd
import numpy as np

# Build a Series and two DataFrames by hand.
# `index` supplies the row labels, `columns` the column labels.
series = pd.Series([1, 3, 5], index=list('abc'))
print('\ncreate series\n', series)

dataframe = pd.DataFrame(
    data=[[1, 2, 3],
          [4, 5, 6],
          [7, 8, 9]],
    index=['a', 'b', 'c'],
    columns=['one', 'two', 'three'],
)
print('\ncreate dataframe\n', dataframe)

# Same labels as `dataframe`, but filled with random draws from a normal
# distribution. The generator is not seeded, so values differ on every run.
dataframe2 = pd.DataFrame(
    data=np.random.normal(size=(3, 3)),
    index=['a', 'b', 'c'],
    columns=['one', 'two', 'three'],
)
print('\ncreate dataframe2\n', dataframe2)

# Data can also be loaded from other sources, for example:
# (note: pd.read_html returns a list of DataFrames, one per table found)
# pd_html = pd.read_html(url, header = 0, index_col = 'Currency')
# pd_csv = pd.read_csv('file_name.csv')


create series
 a    1
b    3
c    5
dtype: int64

create dataframe
    one  two  three
a    1    2      3
b    4    5      6
c    7    8      9

create dataframe2
         one       two     three
a -0.153552  1.562383 -0.533992
b  0.416615 -0.294909 -0.007498
c  0.570763 -0.943341 -0.230215


## 2. Useful attributes and methods for series and dataframe objects

In [115]:
# Preview: only the first two rows.
print('\ndataframe_head\n', dataframe.head(2))

# Axis labels and per-column data types.
print('\ndataframe_columns\n', list(dataframe.columns))
print('\ndataframe_rows\n', list(dataframe.index))
print('\ndataframe_type\n', dataframe.dtypes)

# `shape` is a (rows, columns) tuple; unpack it to get each count.
n_rows, n_cols = dataframe.shape
print('\ndataframe_shape\n', dataframe.shape)
print('\ndataframe_row_num\n', n_rows)
print('\ndataframe_column_num\n', n_cols)

# Summary statistics and sorting on a Series.
print('\nseries_max\n', series.max())
print('\nseries_mean\n', series.mean())
print('\nseries_std\n', series.std())
print('\nseries_sort\n', series.sort_values(ascending=False))

# Sort the rows by column 'one', largest first. A new DataFrame is returned;
# `dataframe` itself is left unchanged.
print('\ndataframe_sort\n', dataframe.sort_values(by='one', ascending=False))


dataframe_head
    one  two  three
a    1    2      3
b    4    5      6

dataframe_columns
 ['one', 'two', 'three']

dataframe_rows
 ['a', 'b', 'c']

dataframe_type
 one      int64
two      int64
three    int64
dtype: object

dataframe_shape
 (3, 3)

dataframe_row_num
 3

dataframe_column_num
 3

series_max
 5

series_mean
 3.0

series_std
 2.0

series_sort
 c    5
b    3
a    1
dtype: int64

dataframe_sort
    one  two  three
c    7    8      9
b    4    5      6
a    1    2      3


## 3. Series and dataframe indexing and slicing

In [128]:
# select columns
# a single label returns a Series, a list of labels returns a DataFrame
print('\nselect_column\n', dataframe['one'])  # the result is series
print('\nselect_columns\n', dataframe[['one', 'two']])

# select rows
# .loc selects by label, .iloc selects by integer position
print('\nselect_row\n', dataframe.loc['a'])  # the result is series
print('\nselect_row2\n', dataframe.iloc[0])
print('\nselect_rows\n', dataframe.loc[['a', 'b']])
# a label slice includes both endpoints ('a' and 'b')
print('\nselect_rows2\n', dataframe.loc['a':'b'])
# a positional slice excludes the stop position (rows 0, 1 and 2 here)
print('\nselect_rows3\n', dataframe.iloc[0:3])

# select elements
# column first, then row label (chained indexing): fine for reading a value,
# but prefer the single .loc / .iloc call shown below
print('\nselect_element\n', dataframe['one']['a'])
print('\nselect_element2\n', dataframe.loc['a', 'one'])
print('\nselect_element3\n', dataframe.iloc[0, 0])
# rows and columns are selected in one .loc call instead of chaining two
# indexing operations
print('\nselect_elements\n', dataframe.loc[['a', 'b'], ['one', 'two']])
print('\nselect_elements2\n', dataframe.iloc[:2, :2])

# use boolean
# the comparison builds a boolean Series; only rows where it is True are kept
print('\nselect_rows_boolean\n', dataframe[dataframe['one'] == 1])


select_column
 a    1
b    4
c    7
Name: one, dtype: int64

select_columns
    one  two
a    1    2
b    4    5
c    7    8

select_row
 one      1
two      2
three    3
Name: a, dtype: int64

select_row2
 one      1
two      2
three    3
Name: a, dtype: int64

select_rows
    one  two  three
a    1    2      3
b    4    5      6

select_rows2
    one  two  three
a    1    2      3
b    4    5      6

select_rows3
    one  two  three
a    1    2      3
b    4    5      6
c    7    8      9

select_element
 1

select_element2
 1

select_element3
 1

select_elements
    one  two
a    1    2
b    4    5

select_elements2
    one  two
a    1    2
b    4    5

select_rows_boolean
    one  two  three
a    1    2      3


## 4.  Series and dataframe manipulation

In [119]:
# Element-wise arithmetic on columns or rows.
# `.add(x)` is the method form of `+ x`; subtract, multiply and divide work
# the same way (`.sub` / `-`, `.mul` / `*`, `.div` / `/`).
print('calculation\n')
add = dataframe['two'].add(10)  # column + scalar
print(add)
add2 = dataframe['one'].add(dataframe['two'])  # column + column
print(add2)
add3 = dataframe.loc['a'].add(dataframe.loc['b'])  # row + row
print(add3)

# Assigning to a label that does not exist yet appends a new column ...
dataframe2['four'] = dataframe2['one'].add(dataframe2['two'])
print('\nadd_column\n', dataframe2)
# ... or, through .loc, a new row.
dataframe2.loc['d'] = dataframe2.loc['a'].add(dataframe2.loc['b'])
print('\nadd_row\n', dataframe2)

# Assigning to an existing column label overwrites that column.
dataframe2['four'] = dataframe2['one']
print('\nreplace_column\n', dataframe2)

calculation

a    12
b    15
c    18
Name: two, dtype: int64
a     3
b     9
c    15
dtype: int64
one      5
two      7
three    9
dtype: int64

add_column
         one       two     three      four
a -0.153552  1.562383 -0.533992  1.408831
b  0.416615 -0.294909 -0.007498  0.121706
c  0.570763 -0.943341 -0.230215 -0.372578

add_row
         one       two     three      four
a -0.153552  1.562383 -0.533992  1.408831
b  0.416615 -0.294909 -0.007498  0.121706
c  0.570763 -0.943341 -0.230215 -0.372578
d  0.263063  1.267474 -0.541490  1.530537

replace_column
         one       two     three      four
a -0.153552  1.562383 -0.533992 -0.153552
b  0.416615 -0.294909 -0.007498  0.416615
c  0.570763 -0.943341 -0.230215  0.570763
d  0.263063  1.267474 -0.541490  0.263063
