# Pandas
Data structures in pandas
- `Series` objects: 1D array, similar to a column in a spreadsheet
- `DataFrame` objects: 2D table, similar to a spreadsheet
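- `Panel` objects: dictionary of DataFrames, similar to a workbook with multiple sheets in MS Excel (deprecated in pandas 0.20 and removed in 0.25; use a `DataFrame` with a `MultiIndex` instead)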

## Series

In [1]:
import pandas as pd
import numpy as np

# A Series built from a plain list receives a default integer index.
birthyear = pd.Series(data=[1984, 1985, 1992])
print(birthyear, birthyear.index, sep="\n")
print()

# Passing `index` replaces the default positions with custom labels.
weight = pd.Series(data=[68, 83, 112], index=["alice", "bob", "charles"])
print(weight, weight.index, sep="\n")

0    1984
1    1985
2    1992
dtype: int64
RangeIndex(start=0, stop=3, step=1)

alice       68
bob         83
charles    112
dtype: int64
Index(['alice', 'bob', 'charles'], dtype='object')


## DataFrame Creation
- from dict
- from iterables (with column names)
- from ndarray (with column names)

In [3]:
# One Series per future column. Their indexes deliberately differ in order and
# in which people they cover.
weight = pd.Series(data=[68, 83, 112], index=["alice", "bob", "charles"])
birthyear = pd.Series(
    data=[1984, 1985, 1992],
    index=["bob", "alice", "charles"],
    name="year",
)
children = pd.Series(data=[0, 3], index=["charles", "bob"])
hobby = pd.Series(data=["Biking", "Dancing"], index=["alice", "bob"])

# The dict keys become the column labels (the Series name "year" is not used).
people_dict = dict(
    weight=weight,
    birthyear=birthyear,
    children=children,
    hobby=hobby,
)

# Values are aligned by index label; a person missing from a Series gets NaN
# in that column.
people = pd.DataFrame(data=people_dict)
print(people)

         weight  birthyear  children    hobby
alice        68       1985       NaN   Biking
bob          83       1984       3.0  Dancing
charles     112       1992       0.0      NaN


In [3]:
# Structural overview of `people`: dimensions, per-column dtypes, row labels
# and column labels.
print('shape:', people.shape)      # (number of rows, number of columns)
print(people.dtypes)               # one dtype per column, returned as a Series
print('index:', people.index)      # row labels
print('columns:', people.columns)  # column labels

shape: (3, 4)
birthyear      int64
children     float64
hobby         object
weight         int64
dtype: object
index: Index(['alice', 'bob', 'charles'], dtype='object')
columns: Index(['birthyear', 'children', 'hobby', 'weight'], dtype='object')


In [4]:
# Indexing a DataFrame with a single column label returns that column as a
# Series that keeps the row labels.
people['birthyear']

alice      1985
bob        1984
charles    1992
Name: birthyear, dtype: int64

In [5]:
# Comparing a Series with a scalar is element-wise and yields a boolean Series
# with the same index, much like comparing a NumPy array with a scalar.
people['birthyear'] < 1990

alice       True
bob         True
charles    False
Name: birthyear, dtype: bool

In [6]:
# Build the boolean mask first, then use it to keep only the rows where it is
# True (the DataFrame counterpart of NumPy boolean-array indexing).
born_before_1990 = people['birthyear'] < 1990
old_people = people[born_before_1990]
old_people

Unnamed: 0,birthyear,children,hobby,weight
alice,1985,,Biking,68
bob,1984,3.0,Dancing,83


In [7]:
# Indexing with a list of column labels (note the double brackets) returns a
# DataFrame restricted to those columns, comparable to NumPy fancy indexing
# with an array of positions.
people_abbr = people[['birthyear', 'weight']]
people_abbr

Unnamed: 0,birthyear,weight
alice,1985,68
bob,1984,83
charles,1992,112


In [8]:
# Arithmetic mean of the 'weight' column, returned as a single scalar.
people['weight'].mean()

87.66666666666667

# Data Preparation


In [2]:
# Load the diabetes dataset from a CSV file; the path is relative to the
# notebook's working directory.
# NOTE(review): the next code cell reassigns `df` to a different dataset, so
# this frame is no longer reachable afterwards. Consider a dedicated name if
# the diabetes data is needed later.
df = pd.read_csv("data/diabetes.csv")
# A bare DataFrame as the last expression is rendered as a table; long frames
# are shown truncated.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [9]:
# Remote, whitespace-delimited data file; running this cell needs network access.
url = 'http://bogotobogo.com/python/images/python_Pandas_NumPy_Matplotlib/HIP_star.dat'
# The separator is a regular expression (one or more whitespace characters).
# It must be a raw string: '\s' is not a valid Python string escape, so a plain
# '\s+' literal triggers an invalid-escape-sequence warning.
df = pd.read_csv(url, sep=r'\s+')
print(df.shape)  # (rows, columns) of the freshly loaded frame
df.head()        # preview the first five rows

(2720, 9)


Unnamed: 0,HIP,Vmag,RA,DE,Plx,pmRA,pmDE,e_Plx,B-V
0,2,9.27,0.003797,-19.498837,21.9,181.21,-0.93,3.1,0.999
1,38,8.65,0.111047,-79.061831,23.84,162.3,-62.4,0.78,0.778
2,47,10.78,0.135192,-56.835248,24.45,-44.21,-145.9,1.97,1.15
3,54,10.57,0.151656,17.968956,20.97,367.14,-19.49,1.71,1.03
4,74,9.93,0.221873,35.752722,24.22,157.73,-40.31,1.36,1.068


Data preprocessing: check data validity

In [10]:
# Count the missing (NaN) values in each column: isnull() marks every missing
# cell as True and sum() adds them up per column.
df.isnull().sum()

HIP       0
Vmag      1
RA        1
DE        1
Plx       1
pmRA      1
pmDE      1
e_Plx     1
B-V      42
dtype: int64

In [11]:
# Remove every row that has a missing value in at least one column.
# axis=0 and how="any" are the defaults; they are spelled out for clarity.
df = df.dropna(axis=0, how="any")
# Recount the missing values per column to confirm that none remain.
df.isnull().sum()

HIP      0
Vmag     0
RA       0
DE       0
Plx      0
pmRA     0
pmDE     0
e_Plx    0
B-V      0
dtype: int64

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# that describe() treats as numeric.
# NOTE(review): `HIP` is missing from the summary shown below, which suggests
# it was not parsed as a numeric column. Check `df.dtypes` to confirm.
df.describe()

Unnamed: 0,Vmag,RA,DE,Plx,pmRA,pmDE,e_Plx,B-V
count,2678.0,2678.0,2678.0,2678.0,2678.0,2678.0,2678.0,2678.0
mean,8.21478,173.528409,-0.274356,22.195403,5.537058,-63.534589,1.544955,0.76153
std,1.858407,107.748388,38.893512,1.41826,161.120941,140.351882,1.748178,0.318188
min,0.45,0.003797,-87.20273,20.0,-868.01,-1392.3,0.45,-0.158
25%,7.03,69.984258,-31.80038,20.98,-91.995,-129.9675,0.87,0.56
50%,8.245,173.362326,3.125766,22.1,10.64,-48.68,1.13,0.7105
75%,9.54,267.781761,27.734524,23.3575,103.6775,8.7125,1.65,0.953
max,12.49,359.954685,88.302681,25.0,781.34,481.19,36.48,2.8


## DataFrame 합치기: `concat`, `merge`, `join`
참고: https://yganalyst.github.io/data_handling/Pd_12/
