# Wes McKinney: Datenanalyse mit Python

## NumPy

In [1]:
import numpy as np
# Draw a 2x3 array of standard-normal samples. No seed is set, so the
# values differ on every run.
data = np.random.randn(2,3)
data

array([[ 0.25156576,  0.29480549, -0.51773917],
       [ 0.34063933, -0.58936834, -0.59181639]])

In [2]:
# Multiplying by a scalar is applied element-wise to every entry.
data * 10

array([[ 2.51565756,  2.94805486, -5.17739168],
       [ 3.40639329, -5.89368343, -5.91816395]])

In [9]:
# Tuple with the size of each dimension: (rows, columns).
data.shape

(2, 3)

In [7]:
# Element type shared by all entries of the array.
data.dtype

dtype('float64')

In [16]:
# A list mixing ints and floats is converted to a single float array.
my_list = [1,3.4,5,7]
arr = np.array(my_list)
arr

array([1. , 3.4, 5. , 7. ])

In [17]:
# Casting to int32 drops the fractional part (3.4 becomes 3); the result
# is bound to a new name, arr itself is not reassigned.
int_arr = arr.astype(np.int32)
int_arr

array([1, 3, 5, 7])

In [14]:
# Integers 0 through 8 (the stop value 9 is exclusive). This rebinds arr.
arr = np.arange(9)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

### Rechnen

In [18]:
# 2x3 integer array; * multiplies element-wise (it is not a matrix product).
arr = np.array([[1,2,3],[4,5,6]])
arr * arr

array([[ 1,  4,  9],
       [16, 25, 36]])

In [19]:
# Element-wise subtraction of two equally shaped arrays.
arr - arr

array([[0, 0, 0],
       [0, 0, 0]])

In [20]:
# The scalar is applied to every element.
arr * 2

array([[ 2,  4,  6],
       [ 8, 10, 12]])

### Slicing

In [22]:
# The slice 2:5 selects indices 2, 3 and 4 (the end index is exclusive).
arr = np.arange(10)
arr[2:5]

array([2, 3, 4])

In [23]:
# Assigning a scalar to a slice writes it into every selected position of
# the original array (in place).
arr[2:5] = 3
arr

array([0, 1, 3, 3, 3, 5, 6, 7, 8, 9])

Achtung: Slices sind nur Views auf das ursprüngliche Array – Änderungen an einem Slice verändern daher auch das Original!

### Transponieren und Achsen tauschen

In [24]:
# Arrange the integers 0..14 into 3 rows and 5 columns.
arr = np.arange(15).reshape((3,5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [25]:
# Transpose: the 3x5 array becomes 5x3.
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [33]:
# Swapping axes 1 and 0 of a 2-D array gives the same result as arr.T.
arr.swapaxes(1,0)

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

### Universelle Funktionen

In [34]:
# np.sqrt is applied to each element; the integer input yields a float result.
arr = np.arange(10)
np.sqrt(arr)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [36]:
# Two separate vectors of 8 standard-normal samples each (no seed is set,
# so the values vary from run to run).
x = np.random.randn(8)
y = np.random.randn(8)
x

array([-0.44135588,  0.00845968,  0.60806886,  0.30242691, -0.32682074,
       -0.09348695, -0.5448053 , -0.96186682])

In [37]:
# Display the second sample vector for comparison with x.
y

array([-1.20957537,  0.87480842,  0.65594657,  0.62626169,  0.41614379,
        0.14858748,  0.99802086,  0.2070704 ])

In [38]:
# Element-wise maximum: each position holds the larger of x[i] and y[i].
np.maximum(x,y)

array([-0.44135588,  0.87480842,  0.65594657,  0.62626169,  0.41614379,
        0.14858748,  0.99802086,  0.2070704 ])

### Mathematische und statistische Methoden

In [43]:
# 5x4 array of standard-normal samples (no seed is set, values vary per run).
arr = np.random.randn(5,4)
arr

array([[ 0.4677534 ,  0.27811891,  0.35704938,  0.76113888],
       [ 1.75992058, -0.03345403, -0.75600059,  2.13301448],
       [ 0.03186892, -0.64392482, -0.47240514, -1.93440141],
       [ 0.90564606, -1.2627532 ,  0.40148696, -0.12370467],
       [-0.9546801 , -1.73621086, -1.06800743, -0.53758302]])

In [44]:
# Without an axis argument the mean is taken over all elements (a single scalar).
arr.mean()

-0.12135638564588606

In [45]:
# Without an axis argument the sum is taken over all elements (a single scalar).
arr.sum()

-2.4271277129177213

In [46]:
# axis=1 averages across the columns, giving one mean per row (5 values).
arr.mean(axis=1)

array([ 0.46601514,  0.77587011, -0.75471561, -0.01983121, -1.07412035])

In [47]:
# axis=0 sums down the rows, giving one sum per column (4 values).
arr.sum(axis=0)

array([ 2.21050884, -3.398224  , -1.53787682,  0.29846426])

### Sortieren

In [48]:
# 10 standard-normal samples in arbitrary order (no seed is set).
arr = np.random.randn(10)
arr

array([-0.62655298, -1.13672454, -0.89436418, -1.06697554,  0.49504995,
        1.66092063,  0.21803015,  1.39112196, -1.26164087,  0.63970848])

In [50]:
# ndarray.sort() sorts ascending in place and returns None, so arr is
# displayed on a separate line. Re-running the previous cell is needed to
# get an unsorted array back.
arr.sort()
arr

array([-1.26164087, -1.13672454, -1.06697554, -0.89436418, -0.62655298,
        0.21803015,  0.49504995,  0.63970848,  1.39112196,  1.66092063])

## Pandas: Erste Schritte

In [1]:
import pandas as pd
from pandas import Series, DataFrame

### Series

In [2]:
# Without an explicit index the Series gets the default integer labels 0..3.
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
# The default index is a RangeIndex.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Same values, but labelled with custom string index entries.
obj2 = pd.Series([4,7,-5,3], index = ['a','b','c','d'])
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [6]:
# Look up a single value by its index label.
obj2['c']

-5

In [7]:
# Arithmetic is element-wise and keeps the index labels.
obj2 * 2

a     8
b    14
c   -10
d     6
dtype: int64

In [8]:
# Population figures keyed by city name. Building a Series from a dict
# uses the keys as index labels and keeps the dict's insertion order.
sdata = {
    'Zürich': 450000,
    'Bern': 300000,
    'Genf': 400000,
    'Basel': 350000,
}
obj3 = pd.Series(sdata)
obj3

Zürich    450000
Bern      300000
Genf      400000
Basel     350000
dtype: int64

In [9]:
# Build a second Series from the same dict, but with an explicit index
# (passed by keyword, as in the obj2 cell). A label that is not a key of
# the dict ('Lausanne') gets NaN; a key that is not listed ('Genf') is
# left out.
obj4 = pd.Series(sdata, index=['Zürich','Bern','Lausanne','Basel'])
# Addition aligns the operands on their index labels. A label present in
# only one operand yields NaN, and the result dtype is float64 rather
# than int64.
obj3 + obj4

Basel       700000.0
Bern        600000.0
Genf             NaN
Lausanne         NaN
Zürich      900000.0
dtype: float64

In [10]:
# Name both the Series and its index; both names show up in the repr.
obj3.name = 'population'
obj3.index.name = 'city'
obj3

city
Zürich    450000
Bern      300000
Genf      400000
Basel     350000
Name: population, dtype: int64

### DataFrame

In [12]:
# Column-oriented input: each key becomes a column, each list holds that
# column's values, and the rows get the default integer index.
# Note: this rebinds `data`, which held a NumPy array earlier in the notebook.
data = {
    'name': ['Anton', 'Barbara', 'Claudia'],
    'year': [1966, 1979, 1981],
    'height': [1.8, 1.74, 1.69],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,name,year,height
0,Anton,1966,1.8
1,Barbara,1979,1.74
2,Claudia,1981,1.69


In [14]:
# Bracket access with a column name returns that column as a Series.
frame['name']

0      Anton
1    Barbara
2    Claudia
Name: name, dtype: object

In [15]:
# Attribute access also returns the column as a Series.
frame.year

0    1966
1    1979
2    1981
Name: year, dtype: int64

In [17]:
# .loc[1] selects the row whose index label is 1 and returns it as a
# Series indexed by the column names.
frame.loc[1]

name      Barbara
year         1979
height       1.74
Name: 1, dtype: object

In [20]:
# Add a boolean column that is True where year equals 1979.
# This modifies frame in place.
frame['test'] = frame.year == 1979
frame

Unnamed: 0,name,year,height,test
0,Anton,1966,1.8,False
1,Barbara,1979,1.74,True
2,Claudia,1981,1.69,False


In [21]:
# del removes the column from frame in place; running this cell a second
# time would fail because the column no longer exists.
del frame['test']
frame

Unnamed: 0,name,year,height
0,Anton,1966,1.8
1,Barbara,1979,1.74
2,Claudia,1981,1.69
