# DataFrame

### Creating a sample DF

In [2]:
import pandas as pd
import numpy as np

# Column order is passed to the constructor explicitly, so the layout of the
# frame does not depend on how the dict happens to order its keys.
column_order = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore']

# Sample records stored column-wise: position i in every list is the same person.
raw_data = dict(
    first_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    age=[42, 52, 36, 24, 73],
    preTestScore=[4, 24, 31, 2, 3],
    postTestScore=[25, 94, 57, 62, 70],
)

df = pd.DataFrame(raw_data, columns=column_order)
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25
1,Molly,Jacobson,52,24,94
2,Tina,Ali,36,31,57
3,Jake,Milner,24,2,62
4,Amy,Cooze,73,3,70


### Some DF basics

In [6]:
type(df) # the Python class of the object; here a pandas DataFrame

pandas.core.frame.DataFrame

In [7]:
df.shape # tuple of (number of rows, number of columns)

(5, 5)

In [8]:
df.columns # column labels, returned as a pandas Index

Index(['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'], dtype='object')

In [9]:
df.index # row labels; no index was given at construction, so this is the default RangeIndex

RangeIndex(start=0, stop=5, step=1)

### Slicing

In [12]:
df.iloc[::2, :3] # positional slicing: every second row, first three columns

Unnamed: 0,first_name,last_name,age
0,Jason,Miller,42
2,Tina,Ali,36
4,Amy,Cooze,73


In [20]:
df.loc[:, ['first_name', 'last_name']] # label-based selection: all rows, the two columns picked by name

Unnamed: 0,first_name,last_name
0,Jason,Miller
1,Molly,Jacobson
2,Tina,Ali
3,Jake,Milner
4,Amy,Cooze


In [22]:
df.head(3) # first 3 rows

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25
1,Molly,Jacobson,52,24,94
2,Tina,Ali,36,31,57


In [23]:
df.tail(3) # last 3 rows

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
2,Tina,Ali,36,31,57
3,Jake,Milner,24,2,62
4,Amy,Cooze,73,3,70


In [24]:
df.info() # prints a summary: index range, per-column non-null counts and dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
first_name       5 non-null object
last_name        5 non-null object
age              5 non-null int64
preTestScore     5 non-null int64
postTestScore    5 non-null int64
dtypes: int64(3), object(2)
memory usage: 280.0+ bytes


Assigning NaN values to a DF

In [27]:
# NaN is a floating-point value, so it cannot be stored in an integer column.
# Writing it straight into the int64 column relies on pandas silently
# upcasting the column to float, which newer pandas releases deprecate.
# Cast the target column to float explicitly first; the displayed result is
# the same and the cell stays safe to re-run.
last_col = df.columns[-1]
df[last_col] = df[last_col].astype('float64')

# Positional write: last row, last column.
df.iloc[-1, -1] = np.nan
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25.0
1,Molly,Jacobson,52,24,94.0
2,Tina,Ali,36,31,57.0
3,Jake,Milner,24,2,62.0
4,Amy,Cooze,73,3,


Getting numeric values from a DF column and generating a NumPy array of it:

In [31]:
# Pull one column out as a NumPy array. The column holds a NaN at this point,
# so the array is floating point.
# NOTE(review): recent pandas documentation recommends Series.to_numpy() over
# .values; left unchanged in case the installed pandas predates that method.
values = df['postTestScore'].values
values

array([25., 94., 57., 62., nan])

#### Creating a DF from scratch

In [33]:
# Column labels (Estonian): first name, family name, age, height.
list_labels = ['eesnimi', 'perek_nimi', 'vanus', 'pikkus']

# One plain Python list per column; position i in every list is the same person.
eesnimi = ['Ardo', 'Indrek', 'Elari', 'Juta']
perek_nimi = ['Saks', 'Kaldma', 'Roes', 'Ilves']
vanus = [26, 24, 28, 26]
pikkus = [176, 188, 187, 169]

# Columns listed in the same order as their labels above.
list_cols = [eesnimi, perek_nimi, vanus, pikkus]

# Pair each label with its column, then turn the pairs into a
# {label: values} mapping that the DataFrame constructor understands.
zipped = list(zip(list_labels, list_cols))
data = {label: column for label, column in zipped}

dataframe = pd.DataFrame.from_dict(data)

dataframe

Unnamed: 0,eesnimi,perek_nimi,pikkus,vanus
0,Ardo,Saks,176,26
1,Indrek,Kaldma,188,24
2,Elari,Roes,187,28
3,Juta,Ilves,169,26


#### Broadcasting

In [35]:
# Assigning a single scalar to a new column broadcasts it: every row of the
# new 'palk' column receives the value 0.
dataframe['palk'] = 0

dataframe

Unnamed: 0,eesnimi,perek_nimi,pikkus,vanus,palk
0,Ardo,Saks,176,26,0
1,Indrek,Kaldma,188,24,0
2,Elari,Roes,187,28,0
3,Juta,Ilves,169,26,0


Changing column and row labels

In [37]:
# Replace the existing row labels with custom ones (one label per row, in order).
dataframe.index = ['essa', 'tessa', 'kossa', 'nessa']

# Rename by label rather than assigning a positional list of names.
# A positional list only lines up when the columns are in alphabetical order
# (eesnimi, perek_nimi, pikkus, vanus, palk). Where the dict used to build the
# frame keeps insertion order, 'vanus' comes before 'pikkus' and a positional
# assignment would silently swap those two labels. Only 'eesnimi' actually
# changes name, so map just that one label.
dataframe = dataframe.rename(columns={'eesnimi': 'ees_nimi'})

dataframe

Unnamed: 0,ees_nimi,perek_nimi,pikkus,vanus,palk
essa,Ardo,Saks,176,26,0
tessa,Indrek,Kaldma,188,24,0
kossa,Elari,Roes,187,28,0
nessa,Juta,Ilves,169,26,0
