# Data Manipulation with Pandas

Summary of basic data manipulation with Pandas

## Get started

In [1]:
# import packages
import pandas as pd
import seaborn as sb
import numpy as np
import os

In [2]:
# set working directory
# The default path only exists on the original author's machine, so it can be
# overridden through the NOTEBOOK_WORK_DIR environment variable. A missing
# directory is skipped instead of raising, which keeps the notebook runnable
# with "Restart Kernel -> Run All" on other machines.
WORK_DIR = os.environ.get('NOTEBOOK_WORK_DIR', 'C:\\Users\\Tobi\\Jupyiter Notebooks\\')
if os.path.isdir(WORK_DIR):
    os.chdir(WORK_DIR)
else:
    print(f'Working directory not found, staying in {os.getcwd()}')

In [3]:
# get data set: load the example data set named 'iris' through seaborn
iris = sb.load_dataset('iris')

# check if data is dataframe (the cell displays True when `iris` is a pandas DataFrame)
isinstance(iris, pd.DataFrame)

True

## Properties of the dataframe
Properties take no parentheses `()` and no arguments (as opposed to methods).

In [4]:
# data type of every column (`dtypes` is an attribute, so it is read without parentheses)
iris.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [5]:
# column labels of the dataframe, returned as a pandas Index
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

# display the first three rows of the data
iris.head(n=3)

## Accessing data

In [6]:
# access column by name (string): square brackets with the column label
# return that single column as a pandas Series
iris['sepal_length']

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal_length, Length: 150, dtype: float64

In [7]:
# access column by name (dot notation)
# this shortcut only works when the column name is a valid Python identifier
# that does not clash with an existing DataFrame attribute or method
iris.species

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object

## Describing data

In [8]:
# summary statistics for the numerical columns only
iris.describe(include=[np.number])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [9]:
# summary of the categorical (object dtype) columns: count, unique, top, freq
iris.describe(include=[object])

Unnamed: 0,species
count,150
unique,3
top,virginica
freq,50


## Statistical functions

In [10]:
# mean and median of the sepal length, printed one value per line
print(np.mean(iris.sepal_length), np.median(iris.sepal_length), sep='\n')

5.843333333333335
5.8


## Aggregations

In [11]:
# one sum per column, taken over all rows; for the text column `species`
# the "sum" is the concatenation of all its strings
iris.sum()

sepal_length                                                876.5
sepal_width                                                 458.6
petal_length                                                563.7
petal_width                                                 179.9
species         setosasetosasetosasetosasetosasetosasetosaseto...
dtype: object

In [12]:
# axis=0 spelled out explicitly: one sum per column, computed down the rows
iris.sum(axis=0)

sepal_length                                                876.5
sepal_width                                                 458.6
petal_length                                                563.7
petal_width                                                 179.9
species         setosasetosasetosasetosasetosasetosasetosaseto...
dtype: object

In [13]:
# row-wise sum (axis=1): add up the numeric measurements of each row.
# numeric_only=True keeps the text column `species` out of the sum; since
# pandas 2.0 non-numeric columns are no longer dropped silently, and adding
# floats to strings raises a TypeError.
iris.sum(axis=1, numeric_only=True)

0      10.2
1       9.5
2       9.4
3       9.4
4      10.2
       ... 
145    17.2
146    15.7
147    16.7
148    17.3
149    15.8
Length: 150, dtype: float64

## Grouped Aggregations

In [14]:
# per-species totals of every measurement column
iris.groupby('species').sum()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,250.3,171.4,73.1,12.3
versicolor,296.8,138.5,213.0,66.3
virginica,329.4,148.7,277.6,101.3


## Accessing data within a DataFrame
Use `.loc[]` to locate the requested data - `.loc[row reference, column reference]` (square brackets, not parentheses).      
Warning: Note that contrary to usual python slices, both the start and the stop are included.  

 ### Selecting rows 
Use single brackets.  
`.loc` selects by index label; the default index starts at 0, so `iris.loc[1]` is the second row.  
If only 1 row is selected it is returned as a Series.  

In [15]:
# select the row labelled 1; a single row comes back as a Series whose
# index holds the column names. Left as the last expression of the cell so
# Jupyter displays it, instead of wrapping it in print().
iris.loc[1]

sepal_length       4.9
sepal_width          3
petal_length       1.4
petal_width        0.2
species         setosa
Name: 1, dtype: object


In [16]:
# label-based slice: both end points (labels 1 and 2) are included.
# Left as the last expression of the cell so Jupyter renders the DataFrame
# as a table instead of the plain-text output of print().
iris.loc[1:2]

   sepal_length  sepal_width  petal_length  petal_width species
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa


Next up: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html

## Manipulating data

In [17]:
# remove column from dataframe
# drop() returns a new frame that is bound back to `iris`, and
# errors='ignore' makes the cell safe to re-run once the column is already
# gone (a second `del iris['sepal_length']` would raise a KeyError).
iris = iris.drop(columns=['sepal_length'], errors='ignore')
iris.columns

Index(['sepal_width', 'petal_length', 'petal_width', 'species'], dtype='object')