# Библиотека Pandas

In [1]:
import pandas as pd
import numpy as np

 ### Основные структуры данных

In [2]:
from pandas import Series, DataFrame

#### Series
 
одномерный объект, содержащий последовательность данных


In [3]:
list_a = [-3, 4, 5, 19.0]
ser_a = Series(list_a) # создание Series из списка, индексы от 0 до 3
ser_a

0    -3.0
1     4.0
2     5.0
3    19.0
dtype: float64

In [4]:
ser_b = Series(list_a, index=['a', 'b', 'c', 'd']) #можно задать индексы вручную, втч строки
ser_b

a    -3.0
b     4.0
c     5.0
d    19.0
dtype: float64

In [8]:
# можно обратить по буквенному ключу или по индексу
ser_b['c']
#и взять срез
print(ser_b['b':'d'])
print(ser_b[0:2])


b     4.0
c     5.0
d    19.0
dtype: float64
a   -3.0
b    4.0
dtype: float64


In [6]:
#как и в Numpy работают операции над массивом данных целиком: арифметика, выборка

ser_a * 4.0 # меняется тип

0   -12.0
1    16.0
2    20.0
3    76.0
dtype: float64

In [21]:
ser_a % 2 == 0

0    False
1     True
2    False
3    False
dtype: bool

In [23]:
ser_a[ser_a % 2==1], ser_b[ser_b > 4]

(0    -3.0
 2     5.0
 3    19.0
 dtype: float64,
 c     5.0
 d    19.0
 dtype: float64)

### Преобразование словаря в Series

In [31]:
D = {'Athens': 1896, 'Paris':1900, 'Amsterdam': 1928, 'Berlin':1936, 'Rome':1960, 'London':2012, 'Sochi':2014, 'Beijing':2022}
PD = Series(D)
print(PD)

PD[PD > 2000]


Athens       1896
Paris        1900
Amsterdam    1928
Berlin       1936
Rome         1960
London       2012
Sochi        2014
Beijing      2022
dtype: int64


London     2012
Sochi      2014
Beijing    2022
dtype: int64

In [32]:
# можно сопоставить словарь с каким-то списком
cities = ['Athens', 'Paris', 'Amsterdam', 'Berlin', 'Rostov-on-Don']
PD2 = Series(D, index=cities, dtype = np.int64)
PD2

Athens           1896.0
Paris            1900.0
Amsterdam        1928.0
Berlin           1936.0
Rostov-on-Don       NaN
dtype: float64

In [29]:
PD2['Rostov-on-Don'] = 3028
pd.isnull(PD2)
PD2

Athens           1896.0
Paris            1900.0
Amsterdam        1928.0
Berlin           1936.0
Rostov-on-Don    3028.0
dtype: float64

In [37]:
PD2.name = 'year'
PD2.index.name = 'city'
PD2['Tokyo'] = 2020
PD2['fff'] = 3050
PD2['Karaganda'] = 5891
PD2

city
Athens           1896.0
Paris            1900.0
Amsterdam        1928.0
Berlin           1936.0
Rostov-on-Don       NaN
Tokyo            2020.0
fff              3050.0
Karaganda        5891.0
Name: year, dtype: float64

In [40]:
PD2 = PD2.drop('fff') #нужно присваивать, чтобы результат изменил PD


In [41]:
PD2

city
Athens           1896.0
Paris            1900.0
Amsterdam        1928.0
Berlin           1936.0
Rostov-on-Don       NaN
Tokyo            2020.0
Karaganda        5891.0
Name: year, dtype: float64

In [49]:
midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon','çat'],
                             ['speed', 'weight', 'length']],
                     codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3],
                            [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]])
s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3, 4, 14, 2],
              index=midx)
print(s['lama':'cow'])
s

lama  speed      45.0
      weight    200.0
      length      1.2
cow   speed      30.0
      weight    250.0
      length      1.5
dtype: float64


lama    speed      45.0
        weight    200.0
        length      1.2
cow     speed      30.0
        weight    250.0
        length      1.5
falcon  speed     320.0
        weight      1.0
        length      0.3
çat     speed       4.0
        weight     14.0
        length      2.0
dtype: float64

In [54]:
cars = pd.MultiIndex(levels=[['vw', 'audi', 'renault', 'pegeout'],
                            ['model', 'color']],
                     codes=[[0,1, 1,0, 2,2, 3,3], [0,1,0,1,0,1,0,1]]
                    )
cd = pd.Series([1,2,3,4,5,6,7,8],
              index=cars)
cd

vw       model    1
audi     color    2
         model    3
vw       color    4
renault  model    5
         color    6
pegeout  model    7
         color    8
dtype: int64

### DataFrame

табличная структура данных, состоящая из упорядоченной последовательности столбцов (в разных столбцах могут быть данные разного типа)
 



два индекса: по строкам и по столбцам

#### Преобразование в DataFrame из словаря

In [58]:
print(D)
city = list(D.keys())
year = list(D.values())
parts = np.random.random(len(city)) * 10
print(city, year, parts.astype('int32'), sep='\n')

data = {'city': city,
        'year': year,
        'parts': parts.astype('int32'),    
}
data

{'Athens': 1896, 'Paris': 1900, 'Amsterdam': 1928, 'Berlin': 1936, 'Rome': 1960, 'London': 2012, 'Sochi': 2014, 'Beijing': 2022}
['Athens', 'Paris', 'Amsterdam', 'Berlin', 'Rome', 'London', 'Sochi', 'Beijing']
[1896, 1900, 1928, 1936, 1960, 2012, 2014, 2022]
[4 4 9 4 8 4 5 8]


{'city': ['Athens',
  'Paris',
  'Amsterdam',
  'Berlin',
  'Rome',
  'London',
  'Sochi',
  'Beijing'],
 'year': [1896, 1900, 1928, 1936, 1960, 2012, 2014, 2022],
 'parts': array([4, 4, 9, 4, 8, 4, 5, 8], dtype=int32)}

In [57]:
type(city), year

(list, [1896, 1900, 1928, 1936, 1960, 2012, 2014, 2022])

In [60]:
DF = pd.DataFrame(data)
DF

Unnamed: 0,city,year,parts
0,Athens,1896,4
1,Paris,1900,4
2,Amsterdam,1928,9
3,Berlin,1936,4
4,Rome,1960,8
5,London,2012,4
6,Sochi,2014,5
7,Beijing,2022,8


In [62]:
DF.head(10)

Unnamed: 0,city,year,parts
0,Athens,1896,4
1,Paris,1900,4
2,Amsterdam,1928,9
3,Berlin,1936,4
4,Rome,1960,8
5,London,2012,4
6,Sochi,2014,5
7,Beijing,2022,8


In [63]:
DF = pd.DataFrame(data, columns=['year','parts', 'city','medals'])# можно регулировать порядок колонок
DF # будет NaN Если в словаре нет такого ключа

Unnamed: 0,year,parts,city,medals
0,1896,4,Athens,
1,1900,4,Paris,
2,1928,9,Amsterdam,
3,1936,4,Berlin,
4,1960,8,Rome,
5,2012,4,London,
6,2014,5,Sochi,
7,2022,8,Beijing,


In [67]:
# можно получить столбец DataFrame как объект типа Series
DF['year'], DF.year #каждый столбец == атрибут
DF.city


0       Athens
1        Paris
2    Amsterdam
3       Berlin
4         Rome
5       London
6        Sochi
7      Beijing
Name: city, dtype: object

In [70]:
DF.loc[0] #индекс записи

year        1896
parts          4
city      Athens
medals       NaN
Name: 0, dtype: object

In [71]:
DF.columns, DF.index

(Index(['year', 'parts', 'city', 'medals'], dtype='object'),
 RangeIndex(start=0, stop=8, step=1))

In [74]:
DF.index = [str(i**2) for i in range(8)]

In [75]:
DF.index


Index(['0', '1', '4', '9', '16', '25', '36', '49'], dtype='object')

In [76]:
DF

Unnamed: 0,year,parts,city,medals
0,1896,4,Athens,
1,1900,4,Paris,
4,1928,9,Amsterdam,
9,1936,4,Berlin,
16,1960,8,Rome,
25,2012,4,London,
36,2014,5,Sochi,
49,2022,8,Beijing,


In [78]:
del DF['medals'] # удаление столбца по ключу
DF

KeyError: 'medals'

In [81]:
DFT = DF.T # "транспонирование DataFrame"
DF

Unnamed: 0,year,parts,city
0,1896,4,Athens
1,1900,4,Paris
4,1928,9,Amsterdam
9,1936,4,Berlin
16,1960,8,Rome
25,2012,4,London
36,2014,5,Sochi
49,2022,8,Beijing


In [80]:
DFT

Unnamed: 0,0,1,4,9,16,25,36,49
year,1896,1900,1928,1936,1960,2012,2014,2022
parts,4,4,9,4,8,4,5,8
city,Athens,Paris,Amsterdam,Berlin,Rome,London,Sochi,Beijing


In [83]:
DFT.index.name = 'field'
DFT.columns.name = 'index'
#DFT.index = [1, 0, 0]
DFT.columns


Index(['0', '1', '4', '9', '16', '25', '36', '49'], dtype='object', name='index')

In [86]:
indices = DF.index # структура типа Index (собств объект) 
print(indices, indices[3:])
indices[0] = 0 # неизменяемы!


Index(['0', '1', '4', '9', '16', '25', '36', '49'], dtype='object', name='index') Index(['9', '16', '25', '36', '49'], dtype='object', name='index')


TypeError: Index does not support mutable operations

In [88]:
print(DF.index)
'10' in DF.index, 'year' in DF.columns # можно проверять имя индекса и колонки на in


Index(['0', '1', '4', '9', '16', '25', '36', '49'], dtype='object', name='index')


(False, True)

In [90]:
from random import shuffle
newI = list(DF.index)
print(newI)
shuffle(newI)
print(newI)


['0', '1', '4', '9', '16', '25', '36', '49']
['9', '49', '4', '25', '36', '0', '1', '16']


In [92]:
DF.reindex(newI.sort())# можно пересортировать, используя reindex


field,year,parts,city
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1896,4,Athens
1,1900,4,Paris
4,1928,9,Amsterdam
9,1936,4,Berlin
16,1960,8,Rome
25,2012,4,London
36,2014,5,Sochi
49,2022,8,Beijing


In [94]:
DF.drop('parts', axis=1, inplace=True) 
# возвращает новый объект, если не указано inplace=True!


KeyError: "['parts'] not found in axis"

In [95]:
DF

field,year,city
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1896,Athens
1,1900,Paris
4,1928,Amsterdam
9,1936,Berlin
16,1960,Rome
25,2012,London
36,2014,Sochi
49,2022,Beijing


### Выборка данных из DataFrame

In [96]:
DF

field,year,city
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1896,Athens
1,1900,Paris
4,1928,Amsterdam
9,1936,Berlin
16,1960,Rome
25,2012,London
36,2014,Sochi
49,2022,Beijing


In [109]:
DF.loc['36',['city', 'year']] #выборка по именам)


field
city    Sochi
year     2014
Name: 36, dtype: object

In [112]:
DF.iloc[[7,0,7], [1,0,1]] #выборка по индексам


field,city,year,city
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
49,Beijing,2022,Beijing
0,Athens,1896,Athens
49,Beijing,2022,Beijing


In [113]:
DF[DF['year']>1950]

field,year,city
index,Unnamed: 1_level_1,Unnamed: 2_level_1
16,1960,Rome
25,2012,London
36,2014,Sochi
49,2022,Beijing


In [120]:
shuffle(newI)
DF = DF.reindex(newI)
newI, DF

(['49', '36', '4', '25', '0', '16', '9', '1'],
 field  year       city
 index                 
 49     2022    Beijing
 36     2014      Sochi
 4      1928  Amsterdam
 25     2012     London
 0      1896     Athens
 16     1960       Rome
 9      1936     Berlin
 1      1900      Paris)

In [122]:
DF.index

(field  year       city
 index                 
 0      1896     Athens
 1      1900      Paris
 16     1960       Rome
 25     2012     London
 36     2014      Sochi
 4      1928  Amsterdam
 49     2022    Beijing
 9      1936     Berlin,
 Index(['49', '36', '4', '25', '0', '16', '9', '1'], dtype='object', name='index'))

In [123]:
DF.sort_index()

field,year,city
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1896,Athens
1,1900,Paris
16,1960,Rome
25,2012,London
36,2014,Sochi
4,1928,Amsterdam
49,2022,Beijing
9,1936,Berlin


In [124]:
DF.sort_index(axis=1, ascending=False)

field,year,city
index,Unnamed: 1_level_1,Unnamed: 2_level_1
49,2022,Beijing
36,2014,Sochi
4,1928,Amsterdam
25,2012,London
0,1896,Athens
16,1960,Rome
9,1936,Berlin
1,1900,Paris


In [128]:
DF.iloc[:,1].sort_values()

index
4     Amsterdam
0        Athens
49      Beijing
9        Berlin
25       London
1         Paris
16         Rome
36        Sochi
Name: city, dtype: object

In [126]:
DF

field,year,city
index,Unnamed: 1_level_1,Unnamed: 2_level_1
49,2022,Beijing
36,2014,Sochi
4,1928,Amsterdam
25,2012,London
0,1896,Athens
16,1960,Rome
9,1936,Berlin
1,1900,Paris
