# Uaktualnianie indeksu

In [2]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# 5. Rozpoczynamy pracę z biblioteką pandas 129
## 5.1. Wprowadzenie do struktur danych biblioteki pandas 130 
### Obiekt Series 130 
### Obiekt DataFrame 134 
### Obiekty index 141

## 5.2. Podstawowe funkcjonalności 142

In [3]:
# Series with an explicit, deliberately unsorted label index.
obj = pd.Series(
    data=[4.5, 7.2, -5.3, 3.6],
    index=list("dbac"),
)
obj


d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

### Uaktualnianie indeksu 143

Reindexing rearranges the data to match a new index and inserts NaN for labels that were not present before (zmiana kolejności danych i przystosowanie do nowego indeksu).

In [4]:
# Reorder to the new label order; 'e' is not a label of obj, so it comes back as NaN.
obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [5]:
# String-valued Series whose integer labels leave gaps (1 and 3 are absent).
obj3 = pd.Series(
    data=["blue", "purple", "yellow"],
    index=[0, 2, 4],
)
obj3

0      blue
2    purple
4    yellow
dtype: object

In [6]:
# For ordered data such as time series, gaps may need to be filled while reindexing.
# method='ffill' forward-fills: each new label takes the value of the preceding existing label.

obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [7]:
# With a DataFrame, reindex can rearrange the row index, the columns, or both.
# np.arange(9) yields the integers 0..8, which reshape lays out as 3 rows x 3 columns.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list("abc"),
    columns=["Ohio", "Texas", "California"],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [8]:
# A larger 4x4 variant of the same construction (values 0..15).
frame_t = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=list("abcd"),
    columns=["Ohio", "Texas", "California", "aaa"],
)
frame_t

Unnamed: 0,Ohio,Texas,California,aaa
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [9]:
# Reindex the rows: 'd' does not exist in frame, so that row is filled with NaN
# (which is why the values are displayed as floats).
frame2 = frame.reindex(['a','d','b','c'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
d,,,
b,3.0,4.0,5.0
c,6.0,7.0,8.0


In [10]:
# Reindex the columns: 'Utah' is new and is filled with NaN; 'Ohio' is not listed and is left out.
states = ['Texas','Utah','California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,4,,5
c,7,,8


In [11]:
# Equivalent spelling: pass the new labels positionally and name the target axis with the axis keyword.
frame.reindex(states, axis="columns")


Unnamed: 0,Texas,Utah,California
a,1,,2
b,4,,5
c,7,,8


In [12]:
# Note: reindex fills the entries for any newly introduced labels with NaN.

### Dropping entries from an axis

- Entries can be removed with reindex or loc, but the drop method is simpler: it returns a new object with the given labels removed from an axis.

**np.arange**

NumPy `arange()` is one of the array-creation routines based on numerical ranges. It creates an ndarray of evenly spaced values and returns a reference to it.
![image.png](attachment:image.png)

![image-2.png](attachment:image-2.png)

In [13]:
# Float Series 0.0..4.0 labelled 'a'..'e'.
# Alternative kept for reference: np.arange(1, 10, 2) (start, stop, step) would give 1, 3, 5, 7, 9.
obj = pd.Series(np.arange(5.0), index=list("abcde"))
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [14]:
# drop returns a new Series without the label 'c'; obj itself is left untouched.
dropObjC = obj.drop('c')
dropObjC

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [15]:
# A list of labels removes several entries at once.
obj.drop(['d','c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [16]:
# drop works on a DataFrame as well; build an example frame first:
# np.arange(16) reshaped to 4x4, with named rows and columns.
# NOTE(review): 'Utha' looks like a misspelling of 'Utah'; kept as-is because the
# saved outputs in this section were produced with this label.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utha", "New York"],
    columns=["one", "two", "three", "four"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utha,8,9,10,11
New York,12,13,14,15


In [17]:
# Labels passed positionally are dropped from the row index (the default axis).
data.drop(['Ohio','Colorado'])

Unnamed: 0,one,two,three,four
Utha,8,9,10,11
New York,12,13,14,15


In [18]:
# The columns keyword drops column labels instead of row labels.
data.drop(columns=['two'])

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utha,8,10,11
New York,12,14,15


In [19]:
# drop returned a new object each time; the original frame still has every row and column.
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utha,8,9,10,11
New York,12,13,14,15


In [20]:
# Alternative spelling of a column drop: name the axis explicitly.
data.drop('two',axis=1) # axis=1 refers to the columns

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utha,8,10,11
New York,12,14,15


In [21]:
# axis also accepts the name 'columns'; several labels can be dropped at once.
data.drop(['two','four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utha,8,10
New York,12,14


### Indexing, selection, and filtering

In [22]:
# Float Series 0.0..3.0 labelled 'a'..'d', used for the indexing examples.
obj = pd.Series(np.arange(4.0), index=list("abcd"))
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [23]:
# A single label returns the scalar stored under it.
obj['b']

1.0

In [24]:
# Select by integer position. iloc is used explicitly: treating an integer key as a
# position with plain [] on a non-integer index is a deprecated fallback (pandas 2.1+).
obj.iloc[1]

1.0

In [25]:
# An integer slice with [] is positional: positions 2 and 3 (the end position 4 is excluded).
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [26]:
# The selections above only read from obj; the Series itself is unchanged.
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [27]:
 obj[['a','d','b']] # a list of labels needs the inner brackets; obj['a','d','b'] would be read as one tuple key (as for a MultiIndex) and fail

a    0.0
d    3.0
b    1.0
dtype: float64

In [28]:
# Select several entries by integer position. iloc is used because a list of integers
# passed to plain [] on a non-integer index is a deprecated positional fallback (pandas 2.1+).
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [29]:
# Boolean mask: keep the entries whose value is <= 2.
obj[obj<=2]

a    0.0
b    1.0
c    2.0
dtype: float64

In [30]:
# Strict comparison: the entry equal to 2.0 ('c') is now excluded.
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [31]:
# Labels can be selected with plain [] as shown above, but the preferred way is the loc operator.

In [32]:
# loc selects by label; the result follows the order of the requested labels.
obj.loc[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [33]:
# Why prefer loc: plain [] indexing treats integer keys differently depending on the type of the index.

In [34]:
# Series whose index holds integer labels in a non-sequential order.
obj1 = pd.Series(data=[1, 2, 3], index=[2, 0, 1])
obj1


2    1
0    2
1    3
dtype: int64

In [35]:
# Same values, but with string labels instead of integers.
obj2 = pd.Series(data=[1, 2, 3], index=list("abc"))
obj2

a    1
b    2
c    3
dtype: int64

In [36]:
# With an integer index, [] treats the integers as labels: the entries labelled 0, 1, 2 are returned in that order.
obj1[[0,1,2]]

0    2
1    3
2    1
dtype: int64

In [37]:
# With a non-integer index the same expression falls back to integer positions instead.
# NOTE(review): this positional fallback is deprecated since pandas 2.1 - confirm it still runs on the installed version.
obj2[[0,1,2]]

a    1
b    2
c    3
dtype: int64

In [38]:
# loc is strictly label-based: if the index contains no integer labels, the expression obj.loc[[0,1,2]] fails with a KeyError

In [39]:
#obj.loc[[0,1,2]]

In [40]:
# 20230709
# In contrast to loc, the iloc operator indexes exclusively by integer position, so it behaves the same whether or not the index contains integers.

In [41]:
# iloc is positional: the rows come back in obj1's own order, regardless of its integer labels.
obj1.iloc[[0,1,2]]

2    1
0    2
1    3
dtype: int64

In [42]:
# The identical positional call works on the string-labelled Series.
obj2.iloc[[0,1,2]]

a    1
b    2
c    3
dtype: int64

In [43]:
# A label slice with loc includes its end label ('c').
obj2.loc['b':'c']

b    2
c    3
dtype: int64

In [44]:
# Assigning through a label slice overwrites that section of obj2 in place.
obj2.loc['b':'c'] = 5
obj2

a    1
b    5
c    5
dtype: int64

In [45]:
# Indexing a DataFrame with a label or a list of labels retrieves one or more columns.
# Example frame: values 0..15 in a 4x4 layout with named rows and columns.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [46]:
# A list of column labels selects those columns, in the order given.
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [47]:
data[:2] # slicing with [] selects rows (here the first two); no inner list brackets are used

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [48]:
data[data['three']>5] # data['three'] (a single label) is a column; the comparison turns it into a boolean mask
# indexing with that mask keeps only the rows whose value in column 'three' is greater than 5

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [49]:
# comparing the whole DataFrame with a scalar

In [50]:
# Element-wise comparison produces a DataFrame of booleans with the same shape.
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [51]:
# Such a boolean DataFrame can also be the target of an assignment: every value below 5 is set to 0.
# Note that this modifies data in place, so every later cell sees the changed frame.
data[data < 5] = 0 
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#### wybieranie za pomocą operatorów loc i iloc
These operators select a subset of the rows and columns of a DataFrame, using a notation similar to NumPy:
- <b>loc</b> - selects by axis labels (lokalizacja po etykietach)
- <b>iloc</b> - selects by integer position ("integer loc")

![image-7.png](attachment:image-7.png)


![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)

![image-3.png](attachment:image-3.png)

![image-4.png](attachment:image-4.png)

![image-5.png](attachment:image-5.png)

![image-6.png](attachment:image-6.png)



In [52]:
# Current state of data (the values below 5 were set to 0 earlier).
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [53]:
data.loc["Colorado"] # a single row label returns that row as a Series indexed by the column names
# positional equivalent: data.iloc[1]

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int32

In [54]:
data.loc[["Colorado","New York"]] # a list of labels returns a DataFrame with those rows
# positional equivalent: data.iloc[[1,3]]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
New York,12,13,14,15


In [55]:
# Row and column selections can be combined by separating them with a comma,
# which yields a rectangular subset of the data.
data.loc[["Colorado","New York"],["two","three"]]
# positional equivalent: data.iloc[[1,3],[1,2]]

Unnamed: 0,two,three
Colorado,5,6
New York,13,14


In [56]:
# The same kinds of selection, now by integer position with iloc.
# Position 2 is the third row (Utah).
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [57]:
# A list of positions: rows 1 and 3 (Colorado and New York).
data.iloc[[1,3]]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
New York,12,13,14,15


In [58]:
# The result follows the order of the requested positions (Utah before Colorado).
data.iloc[[2,1]]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,0,5,6,7


In [59]:
data.iloc[2,[3,0,1]]  # third row (position 2, Utah); columns 3=four, 0=one, 1=two

four    11
one      8
two      9
Name: Utah, dtype: int32

In [60]:
# Rows at positions 1 and 3, columns at positions 2 and 1 (three, then two).
data.iloc[[1,3],[2,1]]

Unnamed: 0,three,two
Colorado,6,5
New York,14,13


In [61]:
# Both operators also accept slices; a label slice with loc includes its end label ('Utah').
data.loc[:'Utah','two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [62]:
# First three columns, restricted to the rows whose 'three' value exceeds 5.
data.iloc[:, :3].loc[data["three"] > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


slice notation 
![image.png](attachment:image.png)

In [66]:
# The selections above only read from data; the frame itself is unchanged.
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [69]:
# 20230710
# A boolean Series such as the one below can be used as a row mask with loc, but it cannot be passed to iloc.

data.loc[data.three >= 7] # rows whose value in column 'three' is >= 7

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


Indexing options with DataFrame (summary table)

![image.png](attachment:image.png)

#### Pitfalls of integer indexing - Pułapki indeksowania całkowitoliczbowego

![image.png](attachment:image.png)


### Działania arytmetyczne i wyrównywanie danych 156 
### Funkcje apply i map 161 
### Sortowanie i tworzenie rankingów 163 
### Indeksy osi ze zduplikowanymi etykietami 167
## 5.3. Podsumowywanie i generowanie statystyk opisowych 168 
### Współczynnik korelacji i kowariancja 171 
### Unikalne wartości, ich liczba i przynależność 173
##  5.4. Podsumowanie 176