# DataFrame

## 1. Creating a DataFrame
* Dict keys = column names, list of values = column values

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column-oriented input for the DataFrame constructor: every key is a column
# name and every list (all of the same length) holds that column's values.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    population=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)

In [3]:
# Build a DataFrame from the dict of equal-length lists. No index is passed,
# so the rows get the default integer labels 0..5 (see the output below).
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,population
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


### 1.1 EDA descriptors

In [4]:
# head() called without an argument shows only the first five rows
frame.head()

Unnamed: 0,state,year,population
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [5]:
# tail() called without an argument shows only the last five rows
frame.tail()

Unnamed: 0,state,year,population
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [6]:
# `columns` fixes the column order. 'debt' is not a key of `data`, so that
# column is created filled with missing values (NaN).
# `index` replaces the default integer row labels with custom string labels.
frame_2 = pd.DataFrame(data,
                       columns=['year', 'state', 'population', 'debt'],
                       index=['one', 'two', 'three', 'four', 'five', 'six'])

In [7]:
frame_2

Unnamed: 0,year,state,population,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [8]:
frame_2.columns

Index(['year', 'state', 'population', 'debt'], dtype='object')

In [9]:
frame_2.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

### 1.2 Selecting columns (dict key) and rows (loc)

In [10]:
# Dict-style access: the column comes back as a Series that keeps the
# DataFrame's row index and is named after the column
frame_2['population']

one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
six      3.2
Name: population, dtype: float64

In [11]:
# Alternatively, attribute-style access returns the same Series. This only
# works when the column name is a valid Python identifier that does not
# clash with an existing DataFrame attribute or method.
frame_2.population

one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
six      3.2
Name: population, dtype: float64

In [12]:
# Select a row by its index label with .loc; the row comes back as a Series
# whose index is the DataFrame's column names
frame_2.loc['three']

year          2002
state         Ohio
population     3.6
debt           NaN
Name: three, dtype: object

### 1.3 Assigning column values

In [13]:
# Assigning a scalar broadcasts it to every row of the column
frame_2['debt'] = 16
frame_2

Unnamed: 0,year,state,population,debt
one,2000,Ohio,1.5,16
two,2001,Ohio,1.7,16
three,2002,Ohio,3.6,16
four,2001,Nevada,2.4,16
five,2002,Nevada,2.9,16
six,2003,Nevada,3.2,16


In [14]:
# Assigning an array fills the column positionally, so its length must match
# the number of rows in the DataFrame (6 here)
frame_2['debt'] = np.arange(6)
frame_2

Unnamed: 0,year,state,population,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4
six,2003,Nevada,3.2,5


In [15]:
# Assigning a Series to the 'debt' column: values are aligned on the index
# labels, so rows whose label is missing from the Series ('one', 'four',
# 'six') end up as NaN
values = pd.Series([1.5, 1.7, 2.2], index=['two', 'three', 'five'])
frame_2['debt'] = values
frame_2

Unnamed: 0,year,state,population,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,1.5
three,2002,Ohio,3.6,1.7
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,2.2
six,2003,Nevada,3.2,


### 1.4 Creating a column

In [16]:
# Assigning to a column name that does not exist yet creates the column;
# here it is a boolean flag that is True for the Ohio rows
frame_2['eastern'] = frame_2['state'] == 'Ohio'
frame_2

Unnamed: 0,year,state,population,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,1.5,True
three,2002,Ohio,3.6,1.7,True
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,2.2,False
six,2003,Nevada,3.2,,False


In [17]:
# Deleting a column: `del` removes it from the DataFrame in place
del frame_2['eastern']

In [18]:
frame_2.columns

Index(['year', 'state', 'population', 'debt'], dtype='object')

### 1.5 Nested Dict of Dicts
* outer keys = DataFrame columns
* inner keys = DataFrame index names

In [19]:
# Nested mapping: the outer keys are states, the inner keys are years and the
# inner values are the population figures for that state and year.
population = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

In [20]:
# Outer dict keys become the columns and inner keys become the row index;
# combinations missing from the input (Nevada, 2000) are filled with NaN
frame_3 = pd.DataFrame(population)
frame_3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [21]:
# Transposing: .T swaps rows and columns (states become the row labels)
frame_3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [22]:
# Passing an existing DataFrame together with an explicit index keeps only
# the listed row labels: 2000 is dropped, and 2003 (absent from frame_3)
# becomes an all-NaN row
pd.DataFrame(frame_3, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


### 1.6 Naming the DataFrame columns and index

In [23]:
# Both axes can carry a name: `index.name` labels the row axis and
# `columns.name` labels the column axis; both names appear in the display
frame_3.index.name = 'year'
frame_3.columns.name = 'state'
frame_3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [24]:
# Converting the DataFrame's data to a two-dimensional NumPy array
# NOTE(review): the pandas documentation recommends DataFrame.to_numpy()
# over .values on recent versions - confirm the installed pandas version
frame_3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [25]:
# If the columns have heterogeneous dtypes, the resulting array uses a
# dtype that can hold all of them
# E.g.: dtype=object
frame_2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, 1.5],
       [2002, 'Ohio', 3.6, 1.7],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, 2.2],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

## 2. Index Objects

In [26]:
# A small Series with explicit string labels; its .index attribute exposes
# the Index object that holds those labels.
obj = pd.Series(np.arange(3), index=['a', 'b', 'c'])
obj.index

Index(['a', 'b', 'c'], dtype='object')

In [27]:
# An Index can be sliced like a sequence; the result is again an Index
obj.index[1:]

Index(['b', 'c'], dtype='object')

### 2.1 Creating custom indexes

In [28]:
# An Index can also be built directly from an array-like of labels
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [29]:
# Reuse the prebuilt Index as the row labels of a new Series
# (the integer 0 is shown as 0.0 because the values are stored as float64)
obj_2 = pd.Series([1.5, 2.5, 0], index=labels)
obj_2

0    1.5
1    2.5
2    0.0
dtype: float64

In [30]:
# Identity check (not equality): in the run shown here the Series refers to
# the very same Index object that was passed in, rather than a copy
obj_2.index is labels

True

In [31]:
frame_3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [32]:
frame_3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [33]:
# Index objects support membership tests with `in`
'Ohio' in frame_3.columns

True

In [34]:
# Indexes can contain duplicate labels ('foo' appears twice here)
dup_labels = pd.Index(['foo', 'foo', 'bar'])
series = pd.Series(np.arange(3), index=dup_labels)
series

foo    0
foo    1
bar    2
dtype: int32