**Pandas Series**
 - A Pandas Series is like a column in a table.
 - It is a one-dimensional array holding data of any type.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Build Series from data only; no index argument is passed.
# NOTE: with Jupyter's default settings only the last expression of a cell is
# rendered, so only the third Series shows up in the output.
pd.Series([1, 2, 3])
pd.Series(['a', 'b', 'c'])
pd.Series(np.arange(200))

In [None]:
# Data and index passed positionally: the second argument supplies the labels
# (100, 200, 300) attached to the values 1, 2, 3.
pd.Series([1, 2, 3], [100, 200, 300])

In [None]:
# Series built from explicit data, index labels and element dtype.
# Values 0..4 are stored as 16-bit integers under the labels 100..104.
s = pd.Series(
    data=np.arange(5),
    index=np.arange(100, 105),
    dtype='int16',
)
print(s)

In [None]:
# Index (row labels) of `s`.
s.index

In [None]:
# Data held by `s` as an array, without the index.
s.values

In [None]:
# Look-up by index label: 104 is one of the labels given to `s` above, not a position.
s[104]

In [None]:
# Assignment by index label: overwrites the value stored under label 104.
s[104]=70
s

In [None]:
# Build a second Series that reuses the index of `s` as its labels.
s2 = pd.Series(np.arange(5), s.index)
s2

In [None]:
# Sample data for the size / shape / unique / count / value_counts demos that follow.
# The last element is a missing value. It is spelled np.nan because the
# np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
s = pd.Series([1, 1, 2, 1, 2, 2, 2, 1, 1, 3, 3, 4, 5, 5, 7, np.nan])
s

In [None]:
# NOTE: by default Jupyter renders only the last expression of a cell, so only
# value_counts() is shown; the earlier lines are evaluated and discarded.
len(s)  # number of elements, NaN included
s.size  # number of elements
s.shape  # shape as a tuple
s.unique()  # distinct values
s.count()  # number of non-NaN elements
s.mean()  # arithmetic mean
s.value_counts()  # how often each distinct value occurs

In [None]:
# Select several elements with a list of index labels, then count the values in that subset.
s[[5, 7, 8, 10]].value_counts()

In [None]:
# head() returns the first n rows (default n = 5).
# Only the second call is rendered, since it is the cell's last expression.
s.head()
s.head(n = 7)

In [None]:
# tail() returns the last n rows (default n = 5).
s.tail()

In [None]:
# Element-wise arithmetic between two Series.
# Both Series carry the labels a-d, but in opposite order; the operations pair
# values by label rather than by position.
s1 = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 4})
s2 = pd.Series({'d': 6, 'c': 3, 'b': 2, 'a': 1})
s1.add(s2)
s1.pow(s2)

In [None]:
# Assigning to a label that is not yet in the index appends a new entry.
# After this, `s1` and `s2` each hold one label the other does not have.
s1['k'] = 7
s2['e'] = 9

In [None]:
# Addition aligns on index labels; labels present in only one operand
# ('k' and 'e' here) produce NaN in the result.
s1 + s2

In [None]:
# Data for the boolean-selection demos: values 0..9 stored under labels 1..10.
s = pd.Series(data=np.arange(10), index=np.arange(1, 11))
s

In [None]:
# Only the last expression of the cell is rendered by default.
s > 5  # element-wise comparison -> boolean Series
s[s > 5]  # keep the elements where the mask is True
s[ s%2 == 0]  # keep the even values

In [None]:
# Only the last expression of the cell is rendered by default.
s.index > 5  # a mask can also be computed from the index labels
s[s.index > 5]  # select by label condition instead of by value
s[(s > 5) & (s < 8)]  # combine masks with &; each comparison is parenthesised
(s >= 7).sum()  # True counts as 1, so this is the number of values >= 7

In [None]:
# Slicing a Series that has the default integer index.
# An integer slice is positional and excludes the stop position,
# so this returns the elements at positions 1 and 2.
s1 = pd.Series(data=np.arange(100, 105))
print(s1)
s1.iloc[1:3]

In [None]:
# Slicing by label: both end points are included and the slice follows the
# order of the index (a, c, b, d, e), so 'c':'d' yields the entries c, b, d.
s2 = pd.Series(data=np.arange(100, 105), index=list('acbde'))
print(s2)
s2.loc['c':'d']

**Pandas DataFrame**
 - A DataFrame is a two-dimensional data structure, like a 2D array or a table with rows and columns.

In [None]:
# data source: https://www.kaggle.com/hesh97/titanicdataset-traincsv/data
# The file is expected next to the notebook as ./train.csv.
train_data = pd.read_csv('./train.csv')

In [None]:
# First 3 rows of the DataFrame.
train_data.head(n=3)

In [None]:
# Last 10 rows of the DataFrame.
train_data.tail(n=10)

In [None]:
# NOTE: shape and describe() are not the cell's last expression, so by default
# they are not rendered; info() is visible because it prints its report.
train_data.shape  # (number of rows, number of columns)
# describe(): summary statistics such as percentiles, mean and std of the numerical columns
train_data.describe() 
# info(): prints the columns with their non-null counts and dtypes
train_data.info()

In [None]:
# Row labels and column labels of the DataFrame (only the last one is rendered by default).
train_data.index
train_data.columns

In [None]:
# DataFrame from a dict of scalars: each key becomes a column and the scalar is
# repeated for every label in the explicit index (required when all values are scalars).
data = dict(a=100, b=200, c=300)
pd.DataFrame(data=data, index=['x', 'y', 'z'])

In [None]:
# DataFrame from a dict of equal-length lists: one column per key, one row per list position.
data = dict(
    a=[1, 2, 3],
    b=[4, 5, 6],
    c=[10, 11, 12],
)
pd.DataFrame(data=data, index=[0, 1, 2])

In [None]:
# DataFrame from a list of Series: every Series becomes one row.
# The three Series share the labels 'a' and 'b' but differ in their third
# label, so the cells without a value are filled with NaN.
a = pd.Series({'a': 100, 'b': 200, 'd': 300})
b = pd.Series({'a': 101, 'b': 201, 'k': 301})
c = pd.Series({'a': 110, 'b': 210, 'c': 310})

rows = [a, b, c]
pd.DataFrame(rows, index=[100, 101, 102])

In [None]:
# Generate a DataFrame from CSV data.
# usecols restricts loading to the four listed columns and index_col makes
# PassengerId the row index instead of a regular column.
# This rebinds train_data, replacing the full frame loaded earlier.
train_data = pd.read_csv('./train.csv', index_col='PassengerId', usecols=['PassengerId', 'Survived', 'Pclass', 'Name'])
train_data

In [None]:
# Select a row.
# loc - access a group of rows and columns by label(s) or a boolean array.
# Here 1 is an index label (the index was set to PassengerId when the CSV was read above).
train_data.loc[1]

In [None]:
# iloc - purely integer-location based indexing for selection by position.
# Rows are returned in the order listed (positions 0, 100, 200, 2).
train_data.iloc[[0, 100, 200, 2]]

In [None]:
# Calculate the pairwise correlation of the columns.
# Re-read the complete CSV: train_data was last loaded with only a few columns.
train_data = pd.read_csv('./train.csv')
# Correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# string columns (e.g. Name), so restrict the frame to numeric dtypes first.
train_data.select_dtypes(include='number').corr()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Show the correlation matrix as an image. Non-numeric columns are excluded
# first because DataFrame.corr() raises on string columns in pandas >= 2.0.
plt.matshow(train_data.select_dtypes(include='number').corr())

In [None]:
# Detect missing (NaN) data. isna() returns a boolean mask of the same shape.
# Only the second expression is rendered by default.
train_data.isna()
train_data['Age'].isna()

In [None]:
# Drop NaN data with dropna(): by default every row containing at least one NaN is removed.
# The result is a new DataFrame; it is not assigned, so train_data is unchanged.
train_data.dropna()

In [None]:
# axis=1 drops columns (instead of rows) that contain at least one NaN.
train_data.dropna(axis = 1)

In [None]:
# Fill NA/NaN values: missing ages are replaced with the mean of the Age column.
# fillna() returns a new Series; the result is only displayed here, so
# train_data['Age'] still contains its NaN values afterwards.
train_data['Age'].fillna(train_data['Age'].mean())

In [None]:
# Change the column type: convert Pclass to strings.
# This overwrites the column in train_data, so later cells see the string version.
train_data['Pclass'] = train_data['Pclass'].astype(str)

In [None]:
# Print the column summary again to check the dtype after the astype(str) conversion.
train_data.info()

In [None]:
import math

In [None]:
def age_categorize(age):
    """Map an age to the lower bound of its ten-year bucket.

    Args:
        age: Age as a number; may be NaN when the age is unknown.

    Returns:
        The age rounded down to a multiple of 10 (e.g. 38 -> 30), or -1 when
        ``age`` is NaN so that unknown ages form their own category.
    """
    return -1 if math.isnan(age) else 10 * math.floor(age / 10)

In [None]:
# Display the current state of the DataFrame.
train_data

In [None]:
# Apply age_categorize to every value of the Age column; returns a new Series
# of age buckets (NaN ages become -1) without modifying train_data.
train_data['Age'].apply(age_categorize)

**one-hot encoding**
 - One-hot encoding is a data-processing step that converts categorical data into a binary vector representation so that it can be used by machine learning algorithms.
 - pandas.get_dummies function
   - Converts categorical variables into dummy/indicator variables.

In [None]:
# Without a `columns` argument get_dummies encodes every string/categorical
# column it finds. NOTE(review): this presumably includes free-text columns
# such as Name, giving one indicator column per distinct value - confirm.
pd.get_dummies(train_data)

In [None]:
# Encode only the listed categorical columns. drop_first=True omits the first
# level of each column, since it is implied when all other indicators are 0.
pd.get_dummies(train_data, columns=['Pclass', 'Sex', 'Embarked'], drop_first=True)

In [None]:
# Load a fresh copy of the data for the groupby examples and group the rows by passenger class.
# Displaying the result shows a GroupBy object, not a table; nothing is computed
# until an aggregation is requested.
df = pd.read_csv('./train.csv')
class_group = df.groupby('Pclass')
class_group

In [None]:
# Mapping from each group key (Pclass value) to the index labels of its rows.
class_group.groups

In [None]:
# Same idea, grouping by the Sex column: group key -> index labels of the member rows.
gender_group = df.groupby('Sex')
gender_group.groups

In [None]:
# Number of non-null values per column within each passenger class.
class_group.count()

In [None]:
# Example: survival rate per age bucket.
# With Age as the index, groupby(age_categorize) calls the function on every
# index value, so rows are grouped by decade (unknown ages fall into -1).
# 'Survived' is selected *before* aggregating: averaging every column first
# raises on string columns in pandas >= 2.0 and does needless work otherwise.
df.set_index('Age').groupby(age_categorize)['Survived'].mean()

In [None]:
# Mean, sum and max of every numeric column per (Pclass, Sex) group.
# - select_dtypes keeps only numeric columns; aggregating string columns with
#   'mean' raises in pandas >= 2.0.
# - The aggregations are named by string because passing numpy callables
#   (np.mean, np.sum, np.max) to agg is deprecated in recent pandas.
(
    df.set_index(['Pclass', 'Sex'])
    .select_dtypes(include='number')
    .groupby(level=[0, 1])
    .aggregate(['mean', 'sum', 'max'])
)

In [None]:
# A plain aggregation returns one row per group (indexed by Pclass); compare
# with transform in the following cell, which keeps the original row index.
# numeric_only=True is required since pandas 2.0, where mean() no longer
# skips string columns silently and raises instead.
df.groupby('Pclass').mean(numeric_only=True)

In [None]:
# transform returns a result with the same index as df: every row receives the
# mean of its own Pclass group.
# - 'mean' is given by name because passing np.mean is deprecated in recent pandas.
# - numeric_only=True is forwarded to the group mean so string columns are
#   skipped instead of raising (pandas >= 2.0).
df.groupby('Pclass').transform('mean', numeric_only=True)

In [None]:
# Add each passenger's class-average age as a new column.
# 'Age' is selected before transform so that only this column is averaged;
# transforming the whole frame raises on string columns in pandas >= 2.0.
# 'mean' replaces np.mean, whose use as a groupby callable is deprecated.
df['Age2'] = df.groupby('Pclass')['Age'].transform('mean')
df