<h3>Data Manipulations</h3>

<h3>What</h3>

<h3>Why</h3>

<h3>How</h3>

<h3>Pandas Data Manipulation</h3>

<h4>Pandas Data Structures</h4>

<h5>Pandas Series Object</h5>
<p>
    A one-dimensional array of indexed data.
</p>

In [27]:
import pandas as pd
import numpy as np
# A Series built from a plain list gets a default integer index (0, 1, 2, ...)
data = pd.Series([0.344, 0.980, 0.234, 0.904])
data

0    0.344
1    0.980
2    0.234
3    0.904
dtype: float64

In [28]:
# Inspect the Python type of the object created in the previous cell
type(data)

pandas.core.series.Series

In [29]:
# Series as a generalized NumPy array
# A Series resembles a NumPy array, with one difference: every value has an
# explicitly defined index label associated with it. The labels can be of any
# data type, not strictly integers.
data = pd.Series(
    [0.25, 0.6, 0.8, 9],
    index=[0, 6.5, "c", 90],
)
data

0      0.25
6.5    0.60
c      0.80
90     9.00
dtype: float64

In [30]:
# The labels of a Series are held in a pandas Index object
type(data.index)

pandas.core.indexes.base.Index

In [31]:
# Series as a specialized dictionary
# Building a Series from a dict uses the keys as index labels and the dict
# values as the Series values.
hiv_dict = {
    "Central 1": 8.6,
    "Central 2": 7.6,
    "Kampala": 6.9,
    "East-Central": 4.7,
    "Mid-Central": 5.1,
}
hiv = pd.Series(data=hiv_dict)
hiv

Central 1       8.6
Central 2       7.6
Kampala         6.9
East-Central    4.7
Mid-Central     5.1
dtype: float64

In [32]:
# Dictionary-like access: look a value up by its index label, exactly as with a
# dict key. Bracket lookup works for every label, whereas attribute access
# (hiv.Kampala) only works for labels that are valid Python identifiers and
# that do not clash with an existing Series attribute or method.
hiv['Kampala']

np.float64(6.9)

In [33]:
# A label containing a space is not a valid identifier, so it is looked up with brackets
hiv['Central 1']

np.float64(8.6)

In [34]:
# Unlike dictionaries, a Series supports slicing by label.
# Note that the stop label ('Kampala') is included in the result.
hiv['Central 1': 'Kampala']

Central 1    8.6
Central 2    7.6
Kampala      6.9
dtype: float64

In [35]:
# Constructing Series objects
# Syntax: pd.Series(data, index=index)
# `data` can be a list or a NumPy array, in which case the index
# defaults to an integer sequence (0, 1, 2, ...).
# NOTE(review): 'Uoganda' looks like a typo for 'Uganda'; left untouched here
# because it is runtime data that is echoed in the saved output.
pd.Series(['Africa', 'King', 'Uoganda'])

0     Africa
1       King
2    Uoganda
dtype: object

In [36]:
# `data` can also be a scalar (here a string); the single value is repeated
# for every label given in `index`
pd.Series('Africa', index=[4, 9, 2, 1])

4    Africa
9    Africa
2    Africa
1    Africa
dtype: object

In [37]:
# `data` can also be a dictionary: the keys become the index labels,
# in the order in which they appear in the dictionary
pd.Series(
    {
    'Central 1': 8.6,
    'Central 2': 7.6,
    'Kampala': 6.9,
    'East-Central': 4.7,
    'Mid-Central': 5.1,
})

Central 1       8.6
Central 2       7.6
Kampala         6.9
East-Central    4.7
Mid-Central     5.1
dtype: float64

In [38]:
# Passing `index` explicitly controls which dictionary keys are used and the
# order in which they appear (here: all five keys, with 'Kampala' moved first)
pd.Series(
    {
    'Central 1': 8.6,
    'Central 2': 7.6,
    'Kampala': 6.9,
    'East-Central': 4.7,
    'Mid-Central': 5.1,
},
index=['Kampala', 'Central 1', 'Central 2', 'East-Central', 'Mid-Central'])

Kampala         6.9
Central 1       8.6
Central 2       7.6
East-Central    4.7
Mid-Central     5.1
dtype: float64

<h5>Pandas DataFrame Object</h5>
<p>
    The Pandas DataFrame can be thought of either as a generalization of a NumPy array or as a specialization of a Python dictionary.
</p>

In [39]:
# DataFrame as a generalized NumPy array
# If a Series is the analog of a one-dimensional array with explicit indices,
# a DataFrame can be thought of as a sequence of aligned Series objects.
# Here 'aligned' means that they share the same index.

# First Series: HIV prevalence per region
hiv_dict = {
    "Central 1": 8.6,
    "Central 2": 7.6,
    "Kampala": 6.9,
    "East-Central": 4.7,
    "Mid-Central": 5.1,
}
hiv = pd.Series(data=hiv_dict)

# Second Series: population per region, keyed by the same region labels
population_dict = {
    "Central 1": 900565,
    "Central 2": 12005674,
    "Kampala": 53459035,
    "East-Central": 9045000,
    "Mid-Central": 4975545,
}
population = pd.Series(data=population_dict)

# Combine the two Series into a DataFrame: one column per Series
regions = pd.DataFrame(
    {
        "HIV Prevalence (95% CI)": hiv,
        "Population": population,
    }
)
regions

Unnamed: 0,HIV Prevalence (95% CI),Population
Central 1,8.6,900565
Central 2,7.6,12005674
Kampala,6.9,53459035
East-Central,4.7,9045000
Mid-Central,5.1,4975545


In [40]:
# Attributes: index, columns
# Both the row index and the column index can carry a name; the names appear
# in the header of the rendered table.
# Note: these assignments modify `regions` in place.
regions.index.name = 'Regions'
regions.columns.name = 'Features'
regions

Features,HIV Prevalence (95% CI),Population
Regions,Unnamed: 1_level_1,Unnamed: 2_level_1
Central 1,8.6,900565
Central 2,7.6,12005674
Kampala,6.9,53459035
East-Central,4.7,9045000
Mid-Central,5.1,4975545


In [41]:
# A DataFrame as a specialized dictionary: it maps a column name to the Series
# holding that column's data, so a column is fetched with the same bracket
# syntax as a dict key. Bracket lookup is preferred over attribute access
# (regions.Population) because it also works for column names that are not
# valid identifiers, such as 'HIV Prevalence (95% CI)'.
regions['Population']

Regions
Central 1         900565
Central 2       12005674
Kampala         53459035
East-Central     9045000
Mid-Central      4975545
Name: Population, dtype: int64

In [42]:
# Select the first row by integer position. The row mixes a float column and an
# int column, so the resulting Series is upcast to float64 (900565 -> 900565.0).
regions.iloc[0]

Features
HIV Prevalence (95% CI)         8.6
Population                 900565.0
Name: Central 1, dtype: float64

In [43]:
# Scalar lookup by integer position: row 0, column 1 (Population).
# A single cell keeps its column's own dtype (int64 here).
regions.iat[0, 1]

np.int64(900565)

In [44]:
# Constructing DataFrame objects
# From a single Series: the Series index becomes the row index and
# `columns` supplies the name of the one resulting column
pd.DataFrame(
    population,
    columns=['Population']
)

Unnamed: 0,Population
Central 1,900565
Central 2,12005674
Kampala,53459035
East-Central,9045000
Mid-Central,4975545


In [45]:
# From a list of dictionaries: each dict is one row, its keys are the columns.
# The comprehension below expands to:
#   [{'a': 0, 'b': 0}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}, {'a': 3, 'b': 6}]
data = [dict(a=i, b=2 * i) for i in range(4)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4
3,3,6


In [46]:
# Each dictionary becomes one row and the union of all keys becomes the columns.
# Keys missing from a row are filled with NaN (Not-a-Number); the columns that
# receive a NaN are displayed as floats (45.0, 34.0).
pd.DataFrame([
    {'Kampala': 45,
     'Iganga': 90,
     'Fort Portal': 80,
    },
    {'Iganga': 100,
     'Lira': 34,
     'Fort Portal': 90
    }
])

Unnamed: 0,Kampala,Iganga,Fort Portal,Lira
0,45.0,90,80,
1,,100,90,34.0


In [47]:
# From a dictionary of Series objects: each key becomes a column name and the
# Series are combined on their shared index (`hiv` and `population` are the
# Series defined in earlier cells)
pd.DataFrame(
    {'HIV Prevalence': hiv,
     'Population': population
    }
)

Unnamed: 0,HIV Prevalence,Population
Central 1,8.6,900565
Central 2,7.6,12005674
Kampala,6.9,53459035
East-Central,4.7,9045000
Mid-Central,5.1,4975545


In [49]:
# From a two-dimensional NumPy array, with explicit column and row labels
# TODO: read about NumPy arrays
# NOTE(review): np.random.rand is called without a visible seed, so the numbers
# change on every run — consider seeding if reproducible output matters.
pd.DataFrame(
    np.random.rand(3, 2),
    columns=['foo', 'bar'],
    index=['a', 'b', 'c']
)

Unnamed: 0,foo,bar
a,0.571865,0.50601
b,0.073125,0.146914
c,0.529346,0.928949


In [54]:
# A NumPy structured array: every element is a record with an 8-byte integer
# field "A" and an 8-byte float field "B", all initialised to zero.
# TODO: read more about NumPy structured arrays
# NOTE(review): this cell only builds the array; presumably pd.DataFrame(A) was
# meant to follow to show DataFrame construction from it — confirm.
A = np.zeros(
    3,
    dtype=[("A", "i8"), ("B", "f8")],
)
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [56]:
# The pandas Index object
# Series and DataFrame objects both carry an explicit index that lets you
# reference and modify their data.
# An Index can also be constructed directly from a list of labels:
ind = pd.Index([1, 2, 3, 4, 5])
ind

Index([1, 2, 3, 4, 5], dtype='int64')

In [64]:
# Index as an immutable array
# An Index supports array-style indexing; a slice returns a new Index
# (here every second element)
ind[::2]

Index([1, 3, 5], dtype='int64')

In [65]:
# Index attributes, printed in order:
# size (number of elements), shape, ndim (number of dimensions) and dtype
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


<h3>From Here ... </h3>

In [53]:
# Literal 3x2 float array; it is displayed but not assigned to any name.
# NOTE(review): the execution count (53) is out of order relative to the cells
# above — this looks like a leftover scratch cell, possibly pasted from an
# earlier np.random.rand(3, 2) run. Confirm whether it can be removed.
np.array([[0.08526167, 0.36899345],
       [0.13622546, 0.74735849],
       [0.33634605, 0.23684853]])

array([[0.08526167, 0.36899345],
       [0.13622546, 0.74735849],
       [0.33634605, 0.23684853]])