# Wes McKinney: Datenanalyse mit Python

## NumPy

In [15]:
import numpy as np
# Draw a 2x3 array of samples from the standard normal distribution.
# NOTE(review): no random seed is set, so the values differ on every run.
data = np.random.randn(2,3)
data

array([[ 0.83232759, -1.31390482,  0.6638562 ],
       [ 0.33750455, -0.44821506, -0.91589883]])

In [2]:
# Element-wise arithmetic: every entry of the array is multiplied by 10.
# NOTE(review): the recorded output is not 10x the array displayed in the
# previous cell (the execution counts are out of order) — re-run the
# notebook top to bottom to refresh it.
data * 10

array([[ 2.51565756,  2.94805486, -5.17739168],
       [ 3.40639329, -5.89368343, -5.91816395]])

In [9]:
data.shape

(2, 3)

In [7]:
data.dtype

dtype('float64')

In [16]:
# A list mixing ints and floats becomes a single float array (see the output).
my_list = [1,3.4,5,7]
arr = np.array(my_list)
arr

array([1. , 3.4, 5. , 7. ])

In [17]:
# Casting to int32 drops the fractional part (3.4 becomes 3); the converted
# values are bound to int_arr.
int_arr = arr.astype(np.int32)
int_arr

array([1, 3, 5, 7])

In [14]:
arr = np.arange(9)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

### Rechnen

In [18]:
arr = np.array([[1,2,3],[4,5,6]])
arr * arr

array([[ 1,  4,  9],
       [16, 25, 36]])

In [19]:
arr - arr

array([[0, 0, 0],
       [0, 0, 0]])

In [20]:
arr * 2

array([[ 2,  4,  6],
       [ 8, 10, 12]])

### Slicing

In [22]:
arr = np.arange(10)
arr[2:5]

array([2, 3, 4])

In [23]:
# Assigning a scalar to a slice writes it to every selected element,
# modifying arr in place (positions 2, 3 and 4 become 3).
arr[2:5] = 3
arr

array([0, 1, 3, 3, 3, 5, 6, 7, 8, 9])

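Achtung: Slices sind nur Views auf das ursprüngliche Array – Änderungen an einem Slice verändern auch das Original (für eine unabhängige Kopie `.copy()` verwenden)!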

### Transponieren und Achsen tauschen

In [24]:
# Build the integers 0..14 and arrange them as 3 rows by 5 columns.
arr = np.arange(15).reshape(3, 5)
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [25]:
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [33]:
# Swapping axes 1 and 0 of this 2-D array gives the same result as arr.T.
arr.swapaxes(1,0)

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

### Universelle Funktionen

In [34]:
arr = np.arange(10)
np.sqrt(arr)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [36]:
x = np.random.randn(8)
y = np.random.randn(8)
x

array([-0.44135588,  0.00845968,  0.60806886,  0.30242691, -0.32682074,
       -0.09348695, -0.5448053 , -0.96186682])

In [37]:
y

array([-1.20957537,  0.87480842,  0.65594657,  0.62626169,  0.41614379,
        0.14858748,  0.99802086,  0.2070704 ])

In [38]:
# Element-wise maximum: each result entry is the larger of x[i] and y[i].
np.maximum(x,y)

array([-0.44135588,  0.87480842,  0.65594657,  0.62626169,  0.41614379,
        0.14858748,  0.99802086,  0.2070704 ])

### Mathematische und Statistische Methoden

In [43]:
arr = np.random.randn(5,4)
arr

array([[ 0.4677534 ,  0.27811891,  0.35704938,  0.76113888],
       [ 1.75992058, -0.03345403, -0.75600059,  2.13301448],
       [ 0.03186892, -0.64392482, -0.47240514, -1.93440141],
       [ 0.90564606, -1.2627532 ,  0.40148696, -0.12370467],
       [-0.9546801 , -1.73621086, -1.06800743, -0.53758302]])

In [44]:
arr.mean()

-0.12135638564588606

In [45]:
arr.sum()

-2.4271277129177213

In [46]:
# Mean of each row: axis=1 averages across the columns (one value per row).
arr.mean(axis=1)

array([ 0.46601514,  0.77587011, -0.75471561, -0.01983121, -1.07412035])

In [47]:
# Sum of each column: axis=0 adds up down the rows (one value per column).
arr.sum(axis=0)

array([ 2.21050884, -3.398224  , -1.53787682,  0.29846426])

### Sortieren

In [48]:
arr = np.random.randn(10)
arr

array([-0.62655298, -1.13672454, -0.89436418, -1.06697554,  0.49504995,
        1.66092063,  0.21803015,  1.39112196, -1.26164087,  0.63970848])

In [50]:
# sort() orders the array in place (ascending); arr itself is changed.
arr.sort()
arr

array([-1.26164087, -1.13672454, -1.06697554, -0.89436418, -0.62655298,
        0.21803015,  0.49504995,  0.63970848,  1.39112196,  1.66092063])

## Pandas: Erste Schritte

In [3]:
import pandas as pd
from pandas import Series, DataFrame

### Series

In [2]:
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
obj2 = pd.Series([4,7,-5,3], index = ['a','b','c','d'])
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [6]:
obj2['c']

-5

In [7]:
obj2 * 2

a     8
b    14
c   -10
d     6
dtype: int64

In [8]:
# A dict becomes a Series: the keys form the index, the values the data.
sdata = {
    'Zürich': 450000,
    'Bern': 300000,
    'Genf': 400000,
    'Basel': 350000,
}
obj3 = pd.Series(sdata)
obj3

Zürich    450000
Bern      300000
Genf      400000
Basel     350000
dtype: int64

In [9]:
# Build a second Series from the same dict, but with an explicit index:
# 'Lausanne' is not a key of sdata and therefore becomes NaN, while 'Genf'
# is not requested and is left out. The index is passed by keyword so it
# cannot be mistaken for another positional argument.
obj4 = pd.Series(sdata, index=['Zürich', 'Bern', 'Lausanne', 'Basel'])
# Addition aligns on labels; a label missing on either side yields NaN.
obj3 + obj4

Basel       700000.0
Bern        600000.0
Genf             NaN
Lausanne         NaN
Zürich      900000.0
dtype: float64

In [10]:
obj3.name = 'population'
obj3.index.name = 'city'
obj3

city
Zürich    450000
Bern      300000
Genf      400000
Basel     350000
Name: population, dtype: int64

### DataFrame

In [12]:
# Column-oriented input: each dict key becomes one column of the DataFrame.
data = {
    'name': ['Anton', 'Barbara', 'Claudia'],
    'year': [1966, 1979, 1981],
    'height': [1.8, 1.74, 1.69],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,name,year,height
0,Anton,1966,1.8
1,Barbara,1979,1.74
2,Claudia,1981,1.69


In [14]:
frame['name']

0      Anton
1    Barbara
2    Claudia
Name: name, dtype: object

In [15]:
frame.year

0    1966
1    1979
2    1981
Name: year, dtype: int64

In [17]:
frame.loc[1]

name      Barbara
year         1979
height       1.74
Name: 1, dtype: object

In [20]:
# Assigning to a new column label adds a column; here a boolean column
# marking the rows whose year equals 1979.
frame['test'] = frame.year == 1979
frame

Unnamed: 0,name,year,height,test
0,Anton,1966,1.8,False
1,Barbara,1979,1.74,True
2,Claudia,1981,1.69,False


In [21]:
# del removes the column from the DataFrame in place.
del frame['test']
frame

Unnamed: 0,name,year,height
0,Anton,1966,1.8
1,Barbara,1979,1.74
2,Claudia,1981,1.69


### Neuindizierung

In [5]:
obj = pd.Series([4.5,7.2,-5.3,3.6], index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [6]:
# reindex arranges the values according to the new labels; 'e' has no
# value in obj, so it is filled with NaN.
obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

### Löschen

In [7]:
obj2.drop('e')

a   -5.3
b    7.2
c    3.6
d    4.5
dtype: float64

In [12]:
# Same columns as before, but the rows are now labelled by name.
data = {
    'year': [1966, 1979, 1981],
    'height': [1.8, 1.74, 1.69],
}
frame = pd.DataFrame(data, index=['Anton', 'Barbara', 'Claudia'])
frame

Unnamed: 0,year,height
Anton,1966,1.8
Barbara,1979,1.74
Claudia,1981,1.69


In [13]:
frame.drop('Anton')

Unnamed: 0,year,height
Barbara,1979,1.74
Claudia,1981,1.69


In [14]:
# Remove a column by label. columns= states the intent explicitly and is
# equivalent to drop('height', axis=1). The result is a new DataFrame;
# frame itself still contains 'height' afterwards.
frame.drop(columns='height')

Unnamed: 0,year
Anton,1966
Barbara,1979
Claudia,1981


### Indizierung, Auswahl, Filterung

In [16]:
obj = pd.Series(np.arange(4.), index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [17]:
obj['b']

1.0

In [23]:
# Positional slice (end point excluded). iloc makes it explicit that the
# integers are positions, not labels, regardless of the index type.
obj.iloc[1:3]

b    1.0
c    2.0
dtype: float64

In [24]:
# Caution: with label-based slicing the end point is included.
obj['a':'c']

a    0.0
b    1.0
c    2.0
dtype: float64

In [26]:
# Assigning through a label slice overwrites 'a' and 'b' (end point
# included) in place.
obj['a':'b'] = 4.0
obj

a    4.0
b    4.0
c    2.0
d    3.0
dtype: float64

In [29]:
frame['year']

Anton      1966
Barbara    1979
Claudia    1981
Name: year, dtype: int64

In [30]:
# Boolean indexing: keep only the rows whose year is greater than 1980.
frame[frame['year'] > 1980]

Unnamed: 0,year,height
Claudia,1981,1.69


In [34]:
frame[:1]

Unnamed: 0,year,height
Anton,1966,1.8


In [35]:
frame < 1970

Unnamed: 0,year,height
Anton,True,True
Barbara,False,True
Claudia,False,True


### loc/iloc

In [36]:
# 4x4 frame holding the integers 0..15 with explicit row and column labels.
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Zürich', 'Aargau', 'Solothurn', 'Bern'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Zürich,0,1,2,3
Aargau,4,5,6,7
Solothurn,8,9,10,11
Bern,12,13,14,15


In [37]:
# loc selects by label: row 'Zürich', columns 'two' and 'three'.
data.loc['Zürich',['two','three']]

two      1
three    2
Name: Zürich, dtype: int32

In [38]:
# iloc selects by integer position: the third row (position 2).
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Solothurn, dtype: int32

In [41]:
# First two rows (positions 0 and 1) of the third column (position 2).
data.iloc[:2,2]

Zürich    2
Aargau    6
Name: three, dtype: int32

### Arithmetik und Datenausrichtung

In [42]:
# The two Series share only the labels 'a' and 'c'. Addition aligns on
# labels and gives NaN wherever a label is missing from one side.
s1 = pd.Series([1.0,2.5,3],index=['a','b','c'])
s2 = pd.Series([1.0,2.5,3],index=['a','c','d'])
s1 + s2

a    2.0
b    NaN
c    5.5
d    NaN
dtype: float64

In [44]:
# fill_value=0 substitutes 0 for a label that is missing on one side, so
# 'b' and 'd' keep their single available value instead of becoming NaN.
s1.add(s2, fill_value= 0)

a    2.0
b    2.5
c    5.5
d    3.0
dtype: float64

In [46]:
# 4x4 frame of standard-normal samples with the same labels as before.
# NOTE(review): no random seed is set, so the values differ on every run.
data = pd.DataFrame(
    np.random.randn(4, 4),
    index=['Zürich', 'Aargau', 'Solothurn', 'Bern'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Zürich,-0.917816,0.095985,-0.513288,-0.280871
Aargau,-1.406109,0.597344,2.060929,-0.624888
Solothurn,-0.013426,-0.698863,0.025903,0.06957
Bern,0.568137,-0.52051,-1.327339,-2.445935


In [48]:
def f(x):
    """Return the spread (maximum minus minimum) of the values in x."""
    return x.max() - x.min()

# apply calls f once per column by default, giving one spread per column.
data.apply(f)

one      1.974246
two      1.296208
three    3.388268
four     2.515505
dtype: float64

In [49]:
def format_two_decimals(x):
    """Render a number as a string with two decimal places."""
    return '%.2f' % x

# The helper is deliberately not named `format`, which would shadow the
# builtin of the same name for the rest of the notebook.
# applymap applies the function to every single element of the frame.
# NOTE(review): pandas 2.1+ deprecates applymap in favour of DataFrame.map;
# switch once the environment's pandas version supports it.
data.applymap(format_two_decimals)

Unnamed: 0,one,two,three,four
Zürich,-0.92,0.1,-0.51,-0.28
Aargau,-1.41,0.6,2.06,-0.62
Solothurn,-0.01,-0.7,0.03,0.07
Bern,0.57,-0.52,-1.33,-2.45


### Sortieren

In [50]:
# Sort the rows by their index labels (ascending).
data.sort_index()

Unnamed: 0,one,two,three,four
Aargau,-1.406109,0.597344,2.060929,-0.624888
Bern,0.568137,-0.52051,-1.327339,-2.445935
Solothurn,-0.013426,-0.698863,0.025903,0.06957
Zürich,-0.917816,0.095985,-0.513288,-0.280871


In [53]:
# Sort the columns by their labels in descending order.
data.sort_index(axis=1, ascending=False)

Unnamed: 0,two,three,one,four
Zürich,0.095985,-0.513288,-0.917816,-0.280871
Aargau,0.597344,2.060929,-1.406109,-0.624888
Solothurn,-0.698863,0.025903,-0.013426,0.06957
Bern,-0.52051,-1.327339,0.568137,-2.445935


In [55]:
# Sort the rows by the values in column 'one' (ascending).
data.sort_values(by='one')

Unnamed: 0,one,two,three,four
Aargau,-1.406109,0.597344,2.060929,-0.624888
Zürich,-0.917816,0.095985,-0.513288,-0.280871
Solothurn,-0.013426,-0.698863,0.025903,0.06957
Bern,0.568137,-0.52051,-1.327339,-2.445935


### Duplizierte Labels

In [57]:
# Index labels may repeat: 'a' and 'd' each occur twice here.
s1 = pd.Series(range(7), index=['a', 'b', 'c', 'a', 'd', 'd', 'e'])
# is_unique reports whether every label occurs only once. (Only the last
# expression of a cell is displayed, so a bare `s1` before it had no effect.)
s1.index.is_unique

False

In [58]:
# A duplicated label selects all matching entries, so the result is a
# Series rather than a single value.
s1['a']

a    0
a    3
dtype: int64

### Deskriptive Statistik

In [59]:
# Small frame with one missing value per column to show NaN handling.
df = pd.DataFrame(
    [
        [2.0, np.nan],
        [3.0, 1.5],
        [np.nan, 3.5],
        [1.0, 2.0],
    ],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,2.0,
b,3.0,1.5
c,,3.5
d,1.0,2.0


In [60]:
# Column sums; NaN entries are skipped rather than propagated.
df.sum()

one    6.0
two    7.0
dtype: float64

In [61]:
# axis='columns' adds across the columns, giving one sum per row
# (NaN is skipped, so row 'a' yields 2.0).
df.sum(axis='columns')

a    2.0
b    4.5
c    3.5
d    3.0
dtype: float64

In [62]:
# For every column, the index label at which the maximum occurs.
df.idxmax()

one    b
two    c
dtype: object

In [63]:
# Running total per column; a NaN stays NaN at its position while the
# accumulation continues with the following rows.
df.cumsum()

Unnamed: 0,one,two
a,2.0,
b,5.0,1.5
c,,5.0
d,6.0,7.0


In [64]:
df.describe()

Unnamed: 0,one,two
count,3.0,3.0
mean,2.0,2.333333
std,1.0,1.040833
min,1.0,1.5
25%,1.5,1.75
50%,2.0,2.0
75%,2.5,2.75
max,3.0,3.5


### Eindeutigkeit, Werteanzahl, Mitgliedschaft

In [67]:
# unique() returns the distinct values in order of first appearance.
obj = pd.Series(['c','a','d','a','a','b','b','c','c'])
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [68]:
# Frequency of every distinct value, most frequent first.
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [71]:
# Boolean mask: True where the value is one of 'b' or 'c'.
mask = obj.isin(['b','c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [72]:
# Keep only the entries for which the mask is True.
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

## Daten bereinigen

In [74]:
# NA is a shorter alias for NumPy's nan; it is reused by the cells below.
from numpy import nan as NA
data = pd.Series([1,NA,3.5,NA,7])
# dropna returns only the entries that are not missing.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [75]:
# Same result as dropna(): index with a boolean mask that is True for the
# non-missing entries.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [77]:
# Without arguments, DataFrame.dropna removes every row that contains at
# least one NaN; only row 0 is complete here.
data = pd.DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [78]:
# how='all' removes only the rows in which every value is NaN (row 2).
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [79]:
# axis=1 applies the same rule to columns; no column is entirely NaN
# here, so nothing is removed.
data.dropna(how='all',axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [80]:
# thresh=2 keeps only the rows with at least two non-missing values.
data.dropna(thresh=2)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
3,,6.5,3.0


In [81]:
# Replace every missing value with 0.
data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [85]:
# A dict gives a separate fill value per column: 1.0 for column 0,
# 0.5 for column 1 and 0 for column 2.
data.fillna({0:1.,1:0.5,2:0})

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.5,0.0
2,1.0,0.5,0.0
3,1.0,6.5,3.0


In [87]:
# duplicated() flags rows that repeat an earlier row; row 3 is identical
# to row 0.
data = pd.DataFrame([[1.,6.5,3.],[1.,NA,NA],[5.,4.3,1.7],[1.,6.5,3.]])
data.duplicated()

0    False
1    False
2    False
3     True
dtype: bool

In [88]:
# Drop the rows flagged as duplicates, keeping the first occurrence.
data.drop_duplicates()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,5.0,4.3,1.7


In [89]:
# Judge duplicates by column 0 only: rows 1 and 3 repeat the value of
# row 0 in that column and are dropped. The column list is passed by
# keyword so it is clear that it is the subset to compare.
data.drop_duplicates(subset=[0])

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
2,5.0,4.3,1.7


### Transformation mit Funktionen und Mappings

In [92]:
# City names are deliberately written in inconsistent case; they are
# normalised in the following cells.
data = pd.DataFrame(
    {
        'cities': ['zürich', 'BERN', 'Basel', 'LAusanne'],
        'pop': [450000, 300000, 300000, 350000],
    }
)
data

Unnamed: 0,cities,pop
0,zürich,450000
1,BERN,300000
2,Basel,300000
3,LAusanne,350000


In [94]:
# The mapping keys are lower-case, so the differently-cased city names
# are normalised with str.lower() before the lookup.
mapping = {'zürich':'ZH','bern':'BE','basel':'BS','lausanne':'VD'}
lowercased = data['cities'].str.lower()
lowercased

0      zürich
1        bern
2       basel
3    lausanne
Name: cities, dtype: object

In [95]:
# Translate each lower-cased name through the mapping dict and store the
# result as a new column.
data['canton'] = lowercased.map(mapping)
data

Unnamed: 0,cities,pop,canton
0,zürich,450000,ZH
1,BERN,300000,BE
2,Basel,300000,BS
3,LAusanne,350000,VD


In [97]:
# Same result in one step: the lambda lower-cases each name and looks it
# up in mapping. A name missing from mapping would raise KeyError here.
data['cities'].map(lambda x: mapping[x.lower()])

0    ZH
1    BE
2    BS
3    VD
Name: cities, dtype: object

### Werte ersetzen

In [98]:
# NOTE(review): 999 presumably stands in as a placeholder for missing
# measurements — confirm against the data source.
data = pd.Series([1,999,3.5,999,7])
data

0      1.0
1    999.0
2      3.5
3    999.0
4      7.0
dtype: float64

In [99]:
# Show the Series with every 999 replaced by NaN.
data.replace(999,np.nan)

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64