# Python и форматы данных

В данном ноутбуке мы разберем различные форматы данных и библиотеки Python для чтения и обработки этих данных.

## Сериализация и десериализация

Основным механизмом сохранения и передачи данных является механизм сериализации и десериализации данных. Делать это можно "в лоб", не задумываясь о конечном формате данных, если эти данные будут открываться только одной системой или средой. Обычно принято сохранять данные в бинарном формате, например, в файл. 

## Напомним про виды данных в Python

В основном встречаются следующие типы данных

- Числа (целые, с плавающей точкой)
- Строки
- bool
- None

Данные типы данных могут агрегироваться в следующие структуры данных

- списки (массивы)
- кортежи
- словари (впоследствии именно с ними мы и будем работать при описании сложных структур данных)

### Числа

In [1]:
number = 1
double_number = 0.3
very_smal_number = 1e-10
print(number, double_number, very_smal_number)

1 0.3 1e-10


### Поддерживаемые операции

In [2]:
1 + 2

3

In [3]:
3 - 4

-1

In [4]:
1 * 2

2

In [5]:
2 ** 3

8

In [6]:
2 / 3

0.6666666666666666

In [7]:
5 % 2

1

### Строки

In [8]:
str1 = 'abc'
str2 = "def"
print(str1, str2)

abc def


### Сложение строк

In [9]:
"abc" + "def"

'abcdef'

### Умножение строк

In [10]:
"abc" * 3

'abcabcabc'

### Подстановка значений в строку

In [11]:
"abc" + str(number) + "def"

'abc1def'

In [12]:
f'abc {number} abc'

'abc 1 abc'

In [13]:
'abc {0}, abc {1}'.format(number, double_number)

'abc 1, abc 0.3'

### Приведение типов

In [20]:
one = 1

In [22]:
id(one)

140194299496752

In [25]:
stringa = "a"

In [27]:
id(stringa)

140194298514800

In [28]:
stringa += "aa"

In [29]:
id(stringa)

140194243762480

In [14]:
str(1)

'1'

In [15]:
int('1')

1

## Структуры данных

### Мутабельность

Характеристика объекта в Python, описываемая как "изменяемость объекта".

- Мутабельные объекты можно изменять на месте: адрес объекта в памяти (`id`) при этом остаётся прежним.
- Иммутабельные объекты изменить нельзя: любая «модификация» создаёт новый объект по другому адресу.

### Кортежи

- Иммутабельны
- Могут содержать любой тип данных

In [33]:
s = ('s')
s

's'

In [32]:
s = ('s',)
s

('s',)

In [37]:
a = (1, 'a', {"a": 1}, [1, 2, 3])
a

(1, 'a', {'a': 1}, [1, 2, 3])

In [45]:
a[3] = [0]

TypeError: 'tuple' object does not support item assignment

### Списки

- Мутабельны
- Могут содержать любой тип данных (в том числе список списков)

Имеют следующие операции

- Добавление
- Удаление
- Конкатенация
- Умножение
- Функциональная магия

In [51]:
list1 = [1, 2, 3, 4, 5]

In [52]:
list1.append(6)
list1

[1, 2, 3, 4, 5, 6]

In [53]:
list1.remove(5)
list1

[1, 2, 3, 4, 6]

In [54]:
list1.pop()

6

In [55]:
list1 * 3

[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]

In [56]:
list2 = [3, 2, 1]
list1 + list2
list1

[1, 2, 3, 4]

In [57]:
list1.extend(list2)
list1

[1, 2, 3, 4, 3, 2, 1]

## Очевидный способ обойти массив и предобработать данные

In [58]:
for n in list1:
    m = n + 1
    print(m)

2
3
4
5
4
3
2


In [59]:
list1

[1, 2, 3, 4, 3, 2, 1]

In [60]:
for n in list1:
    n += 1
list1

[1, 2, 3, 4, 3, 2, 1]

In [62]:
list(enumerate(list1))

[(0, 1), (1, 2), (2, 3), (3, 4), (4, 3), (5, 2), (6, 1)]

In [63]:
# Increment every element in place: enumerate supplies both the slot index
# and the current value, so the value is written back without re-reading it.
for idx, current in enumerate(list1):
    list1[idx] = current + 1
list1

[2, 3, 4, 5, 4, 3, 2]

In [65]:
list3 = [x ** 2 for x in list1]
list3

[4, 9, 16, 25, 16, 9, 4]

In [66]:
list1

[2, 3, 4, 5, 4, 3, 2]

Кортежи поддерживают те же операции, что и списки, за исключением изменяющих: кортеж иммутабелен.

## Lambda-выражения

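Lambda-выражение — это анонимная функция, записанная в одну строку. В сочетании с `filter`, `map` и `reduce` оно позволяет обработать каждый элемент списка одной операцией. Такой функциональный подход удобен при обработке больших массивов данных (`filter` и `map` возвращают ленивые итераторы и не создают промежуточных списков) и позволяет грамотно и лаконично задать процесс обработки в одну строку.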

In [71]:
def more3(l):
    """Return the elements of ``l`` that are strictly greater than 3.

    Args:
        l: Iterable of values that can be compared with an ``int``.

    Returns:
        A new list holding the qualifying elements in their original
        order; the input itself is not modified.
    """
    # Filtering comprehension: keeps the elements themselves, not the
    # boolean results of the comparison.
    return [n for n in l if n > 3]

In [72]:
more3(list1)

[4, 5, 4]

In [76]:
def isMoreThan3(n):
    """Predicate for ``filter``: report whether ``n`` is strictly greater than 3."""
    exceeds_three = n > 3
    return exceeds_three

In [77]:
list(filter(isMoreThan3, list1))

[4, 5, 4]

In [73]:
list4 = filter(lambda x: x > 3, list1)
list(list4)

[4, 5, 4]

In [74]:
%timeit more3(list1)

750 ns ± 24.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [79]:
%timeit filter(lambda x: x > 3, list1)

306 ns ± 8.95 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [80]:
%timeit filter(isMoreThan3, list1)

224 ns ± 2.36 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [81]:
list5 = map(lambda x: x * 2, list1)
list(list5)

[4, 6, 8, 10, 8, 6, 4]

In [85]:
strings = ['a', 'b', 'c']
numbers = [1, 2, 3, 4, 5]
list(zip(strings, numbers))

[('a', 1), ('b', 2), ('c', 3)]

In [95]:
from functools import reduce
list6 = [1, 2, 3, 4, 5]
reduce(lambda x, y: x * y, list6)

120

In [96]:
reduce(lambda x, y: x + y, map(lambda x: x * 2, list6))

30

In [97]:
sum([1, 2])

3

In [98]:
import numpy as np

In [111]:
np.ndarray((3, 4))

array([[6.92651904e-310, 4.66432981e-310, 0.00000000e+000,
        0.00000000e+000],
       [2.90379506e-057, 8.61659126e-043, 4.08659378e+179,
        2.98711027e-032],
       [2.87873039e+180, 4.91320478e-062, 1.27883586e+161,
        8.02833005e-042]])

In [114]:
np.zeros((5, 5))

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [119]:
np.ones((3, 3)) + np.ones((3, 3))

array([[2., 2., 2.],
       [2., 2., 2.],
       [2., 2., 2.]])

In [125]:
shaped = np.array([1, 2, 3, 4], dtype='int')

In [156]:
shaped

array([[1, 2],
       [3, 4]])

In [127]:
shaped.reshape(2, 2)

array([[1, 2],
       [3, 4]])

In [123]:
shaped.reshape(4, 1)

array([[1],
       [2],
       [3],
       [4]])

In [130]:
shaped.reshape(2, 2).shape

(2, 2)

In [134]:
shaped.reshape(2, 2).flatten()

array([1, 2, 3, 4])

In [171]:
shaped2 = np.ndarray(4, dtype='int16')
shaped2

array([    0,     0,     0, 16404], dtype=int16)

In [137]:
shaped = shaped.reshape(2, 2)
shaped2 = shaped2.reshape(2, 2)

In [138]:
shaped

array([[1, 2],
       [3, 4]])

In [168]:
shaped2.shape

(2, 2)

In [144]:
np.concatenate((shaped, shaped2), axis=1).shape

(2, 4)

In [184]:
# np.ndarray(...) only allocates the buffer and never fills it, so the values
# displayed are whatever happened to be in memory and differ between runs.
# NOTE(review): if reproducible numbers are wanted for the min/max/sort demos
# that follow, build the array from explicit data instead.
list7 = np.ndarray(20, dtype='int')
list7

array([     94392664617276,                   0,     140191747881520,
           140194299684464,                 784,                 128,
               23048605692, 2891359550276660520, 8241879964965809201,
       3556536777818333554, 2314861394126905388, 3484414059051229216,
       2308706538487947308, 6582890602682130464, 2308706537849236533,
       6566283579056201760, 2980641183310556212, 7599935853211230208,
       8030516744137238382, 2338053659250470263])

In [149]:
list7[5:10]

array([     140194266064352, -6378000811680512050,      140194266923440,
            140194266064496, -4628296560581731449])

In [150]:
list7[10:]

array([140194299384240, 140194266626224,               0,               0,
                     0,               0,               0,               0,
                     0,               0])

In [151]:
np.max(list7)

140194306149120

In [152]:
np.min(list7)

-6378000811680512050

In [154]:
%timeit np.sort(list7)

4.23 µs ± 84.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [155]:
np.cos(shaped)

array([[ 0.54030231, -0.41614684],
       [-0.9899925 , -0.65364362]])

In [157]:
np.sin(shaped)

array([[ 0.84147098,  0.90929743],
       [ 0.14112001, -0.7568025 ]])

In [158]:
np.arccos([0.5, 0.7])

array([1.04719755, 0.79539883])

In [159]:
np.corrcoef(shaped)

array([[1., 1.],
       [1., 1.]])

In [164]:
np.linspace(1, 5, num=3)

array([1., 3., 5.])

### Словари

- Ключ-значение
- Мутабельны
- Ключи и значения доступны по отдельности (методы `keys()` и `values()`)
- Также известны как хэш-таблицы (хэшмап, hash map)

In [186]:
# Start from a one-entry mapping written as a plain literal.
dictionary = {'a': 1}
dictionary

{'a': 1}

In [188]:
dictionary['b'] = 3
dictionary

{'a': 1, 'b': 3}

In [189]:
dictionary.keys()

dict_keys(['a', 'b'])

In [190]:
dictionary.values()

dict_values([1, 3])

In [198]:
# Iterate over the values directly rather than looking every key up again.
for value in dictionary.values():
    print(value)

1
3


In [200]:
dictionary['a']

1

## Попробуем сохранить один из списков

In [203]:
list1

[2, 3, 4, 5, 4, 3, 2]

In [202]:
# Write one number per line; the context manager closes the file even if a
# write fails part-way through.
with open('list1.txt', 'w') as textfile:
    for n in list1:
        textfile.write(f'{n}\n')

## Попробуем загрузить

In [204]:
# Read the file back as a list of raw lines (each still ends with '\n');
# the context manager guarantees the handle is released.
with open('list1.txt', 'r') as newfile:
    list1 = newfile.readlines()
list1

['2\n', '3\n', '4\n', '5\n', '4\n', '3\n', '2\n']

In [205]:
# int() ignores surrounding whitespace, so the trailing '\n' of every line
# needs no explicit stripping.
list1 = [int(line) for line in list1]
list1

[2, 3, 4, 5, 4, 3, 2]

In [206]:
from sklearn.datasets import load_wine

In [207]:
wine = load_wine()

In [208]:
wine

{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

### JSON

Формат хранения данных в виде словарей из простейших объектов. Является стандартом практически для всех языков программирования. Пришел из JavaScript.

In [225]:
import json
from numpyencoder import NumpyEncoder
import datetime

In [222]:
json_str = json.dumps(wine, indent=4, sort_keys=True,
              separators=(',\n', ':\t'), ensure_ascii=False,
              cls=NumpyEncoder)

In [231]:
obj = {
    "title": "Service",
    "content": "this is a recommendation",
    "date": str(datetime.datetime.now())
}

In [232]:
json_str = json.dumps(obj)

In [233]:
# Persist the serialized JSON text; the context manager closes the file even
# if the write raises.
with open('datetimeexample.json', 'w') as file:
    file.write(json_str)

In [234]:
# Read the JSON text back; the original cell never closed this handle.
with open('datetimeexample.json', 'r') as file:
    json_str = file.read()
# NOTE: this rebinds `wine`, so the dataset loaded by load_wine() is no longer
# reachable under that name after this cell.
wine = json.loads(json_str)
wine

{'title': 'Service',
 'content': 'this is a recommendation',
 'date': '2021-09-15 20:23:53.160856'}

## Датафреймы

Для обработки данных в питоне обычно используют датафреймы. В датафреймах удобно работать с различного рода таблицами, проводить над ними операции, использовать некоторые операции из баз данных. 

In [235]:
import pandas as pd

In [240]:
ser = pd.Series([1, 2, 3], index=[3, 2, 1])

In [251]:
ser

3    1
2    2
1    3
dtype: int64

In [248]:
ser.axes

[Int64Index([3, 2, 1], dtype='int64')]

In [249]:
df = pd.DataFrame({'a': [1], 'b': [2]})
df

Unnamed: 0,a,b
0,1,2


In [254]:
type(df)

pandas.core.frame.DataFrame

In [258]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with a one-row frame is the supported way to add a row.
df = pd.concat([df, pd.DataFrame([{'a': 2, 'b': 3}])], ignore_index=True)

In [259]:
df

Unnamed: 0,a,b
0,1,2
1,2,3


In [351]:
df = pd.read_csv('TSLA.csv')

In [261]:
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,4.778000,93831500
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,4.766000,85935500
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,4.392000,41094000
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,3.840000,25699000
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,3.222000,34334500
...,...,...,...,...,...,...,...
2808,2021-08-24,710.679993,715.219971,702.640015,708.489990,708.489990,13083100
2809,2021-08-25,707.030029,716.969971,704.000000,711.200012,711.200012,12645600
2810,2021-08-26,708.309998,715.400024,697.619995,701.159973,701.159973,13214300
2811,2021-08-27,705.000000,715.000000,702.099976,711.919983,711.919983,13762100


In [266]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,3.8,5.0,3.508,4.778,4.778,93831500
1,2010-06-30,5.158,6.084,4.66,4.766,4.766,85935500
2,2010-07-01,5.0,5.184,4.054,4.392,4.392,41094000
3,2010-07-02,4.6,4.62,3.742,3.84,3.84,25699000
4,2010-07-06,4.0,4.0,3.166,3.222,3.222,34334500


In [267]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2813 entries, 0 to 2812
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2813 non-null   object 
 1   Open       2813 non-null   float64
 2   High       2813 non-null   float64
 3   Low        2813 non-null   float64
 4   Close      2813 non-null   float64
 5   Adj Close  2813 non-null   float64
 6   Volume     2813 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 154.0+ KB


In [268]:
df['Date']

0       2010-06-29
1       2010-06-30
2       2010-07-01
3       2010-07-02
4       2010-07-06
           ...    
2808    2021-08-24
2809    2021-08-25
2810    2021-08-26
2811    2021-08-27
2812    2021-08-30
Name: Date, Length: 2813, dtype: object

In [279]:
df[['Date', 'Volume']]

Unnamed: 0,Date,Volume
0,2010-06-29,93831500
1,2010-06-30,85935500
2,2010-07-01,41094000
3,2010-07-02,25699000
4,2010-07-06,34334500
...,...,...
2808,2021-08-24,13083100
2809,2021-08-25,12645600
2810,2021-08-26,13214300
2811,2021-08-27,13762100


In [276]:
df.loc[2808]

Date         2021-08-24
Open         710.679993
High         715.219971
Low          702.640015
Close        708.489990
Adj Close    708.489990
Volume         13083100
Name: 2808, dtype: object

In [280]:
df.loc[[2808, 2812]]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
2808,2021-08-24,710.679993,715.219971,702.640015,708.48999,708.48999,13083100
2812,2021-08-30,714.719971,731.0,712.72998,730.909973,730.909973,18502400


In [281]:
df.iloc[5]

Date         2010-07-07
Open              3.280
High              3.326
Low               2.996
Close             3.160
Adj Close         3.160
Volume         34608500
Name: 5, dtype: object

In [282]:
df.iloc[[5, 7]]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
5,2010-07-07,3.28,3.326,2.996,3.16,3.16,34608500
7,2010-07-09,3.516,3.58,3.31,3.48,3.48,20253000


In [283]:
df.iloc[:20]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,3.8,5.0,3.508,4.778,4.778,93831500
1,2010-06-30,5.158,6.084,4.66,4.766,4.766,85935500
2,2010-07-01,5.0,5.184,4.054,4.392,4.392,41094000
3,2010-07-02,4.6,4.62,3.742,3.84,3.84,25699000
4,2010-07-06,4.0,4.0,3.166,3.222,3.222,34334500
5,2010-07-07,3.28,3.326,2.996,3.16,3.16,34608500
6,2010-07-08,3.228,3.504,3.114,3.492,3.492,38557000
7,2010-07-09,3.516,3.58,3.31,3.48,3.48,20253000
8,2010-07-12,3.59,3.614,3.4,3.41,3.41,11012500
9,2010-07-13,3.478,3.728,3.38,3.628,3.628,13400500


In [284]:
df[50:100]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
50,2010-09-09,4.2,4.21,4.138,4.142,4.142,1881000
51,2010-09-10,4.15,4.186,3.952,4.034,4.034,1933000
52,2010-09-13,4.178,4.18,4.1,4.144,4.144,1804000
53,2010-09-14,4.108,4.32,4.106,4.224,4.224,3273500
54,2010-09-15,4.196,4.4,4.158,4.396,4.396,3423000
55,2010-09-16,4.43,4.632,4.168,4.188,4.188,13422500
56,2010-09-17,4.204,4.264,3.96,4.046,4.046,5992500
57,2010-09-20,4.134,4.27,4.032,4.212,4.212,4737500
58,2010-09-21,4.178,4.31,4.134,4.154,4.154,3980000
59,2010-09-22,4.174,4.19,3.96,3.974,3.974,4814500


In [285]:
df.iloc[lambda x: x.index % 5 == 0]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,4.778000,93831500
5,2010-07-07,3.280000,3.326000,2.996000,3.160000,3.160000,34608500
10,2010-07-14,3.588000,4.030000,3.552000,3.968000,3.968000,20976000
15,2010-07-21,4.132000,4.180000,3.900000,4.044000,4.044000,6262500
20,2010-07-28,4.110000,4.180000,4.102000,4.144000,4.144000,2336000
...,...,...,...,...,...,...,...
2790,2021-07-29,649.789978,683.690002,648.799988,677.349976,677.349976,30394600
2795,2021-08-05,716.000000,720.950012,711.409973,714.630005,714.630005,12919600
2800,2021-08-12,706.340027,722.799988,699.400024,722.250000,722.250000,17459100
2805,2021-08-19,678.210022,686.549988,667.590027,673.469971,673.469971,14313500


In [287]:
df[295:305]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
295,2011-08-29,4.844,4.97,4.804,4.942,4.942,4017000
296,2011-08-30,4.9,4.954,4.818,4.926,4.926,1831000
297,2011-08-31,4.96,5.1,4.856,4.948,4.948,4119000
298,2011-09-01,4.932,4.974,4.768,4.8,4.8,4240500
299,2011-09-02,4.732,4.798,4.536,4.614,4.614,3849500
300,2011-09-06,4.5,4.64,4.458,4.588,4.588,4049000
301,2011-09-07,4.678,4.8,4.656,4.768,4.768,2296000
302,2011-09-08,4.716,4.806,4.656,4.722,4.722,2528500
303,2011-09-09,4.674,4.714,4.51,4.594,4.594,3346500
304,2011-09-12,4.5,4.662,4.49,4.576,4.576,2833000


In [289]:
df.iloc[300, 3]

4.458

In [292]:
df.iloc[[100, 300], [1, 4]]

Unnamed: 0,Open,Close
100,6.134,5.978
300,4.5,4.588


In [293]:
df['Volume'].max()

304694000

In [294]:
df['Close'].median()

45.664001

In [295]:
df['Close'].mean()

98.34987687984356

In [309]:
df = df.dropna()

In [297]:
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,4.778000,93831500
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,4.766000,85935500
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,4.392000,41094000
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,3.840000,25699000
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,3.222000,34334500
...,...,...,...,...,...,...,...
2808,2021-08-24,710.679993,715.219971,702.640015,708.489990,708.489990,13083100
2809,2021-08-25,707.030029,716.969971,704.000000,711.200012,711.200012,12645600
2810,2021-08-26,708.309998,715.400024,697.619995,701.159973,701.159973,13214300
2811,2021-08-27,705.000000,715.000000,702.099976,711.919983,711.919983,13762100


In [298]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2813 entries, 0 to 2812
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2813 non-null   object 
 1   Open       2813 non-null   float64
 2   High       2813 non-null   float64
 3   Low        2813 non-null   float64
 4   Close      2813 non-null   float64
 5   Adj Close  2813 non-null   float64
 6   Volume     2813 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 175.8+ KB


In [307]:
# Select the opening prices above 5.5 with a boolean mask.
df.loc[df['Open'] > 5.5, 'Open']

95        5.720000
96        5.650000
97        6.044000
98        6.200000
99        6.040000
           ...    
2808    710.679993
2809    707.030029
2810    708.309998
2811    705.000000
2812    714.719971
Name: Open, Length: 2555, dtype: float64

In [310]:
df['Open'].apply(lambda x: x * 10)

0         38.00000
1         51.58000
2         50.00000
3         46.00000
4         40.00000
           ...    
2808    7106.79993
2809    7070.30029
2810    7083.09998
2811    7050.00000
2812    7147.19971
Name: Open, Length: 2813, dtype: float64

In [322]:
# NOTE(review): `average` is not defined or imported in any visible cell —
# presumably numpy's `average` from a cell that was removed; confirm, otherwise
# this raises NameError on a clean run. Applied to one scalar at a time, so
# the displayed result equals the input column.
df['Open'].apply(lambda x: average(x))

0         3.800000
1         5.158000
2         5.000000
3         4.600000
4         4.000000
           ...    
2808    710.679993
2809    707.030029
2810    708.309998
2811    705.000000
2812    714.719971
Name: Open, Length: 2813, dtype: float64

In [344]:
df.rolling(2, win_type='triang').mean().dropna()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
1,4.479000,5.542000,4.084000,4.772000,4.772000,89883500.0
2,5.079000,5.634000,4.357000,4.579000,4.579000,63514750.0
3,4.800000,4.902000,3.898000,4.116000,4.116000,33396500.0
4,4.300000,4.310000,3.454000,3.531000,3.531000,30016750.0
5,3.640000,3.663000,3.081000,3.191000,3.191000,34471500.0
...,...,...,...,...,...,...
2808,698.059998,713.674988,691.695007,707.394989,707.394989,16674000.0
2809,708.855011,716.094971,703.320007,709.845001,709.845001,12864350.0
2810,707.670013,716.184998,700.809998,706.179993,706.179993,12929950.0
2811,706.654999,715.200012,699.859985,706.539978,706.539978,13488200.0


In [343]:
np.mean([5, 4.6])

4.8

In [346]:
df = pd.DataFrame({"a": ["red", "blue", "red", "yellow", "red"], "b":[1, 6, 3, 5, 0]})
df

Unnamed: 0,a,b
0,red,1
1,blue,6
2,red,3
3,yellow,5
4,red,0


In [350]:
df.groupby('a').min()

Unnamed: 0_level_0,b
a,Unnamed: 1_level_1
blue,6
red,0
yellow,5


In [352]:
# `df` was rebound to the small colour example a few cells earlier, so reload
# the TSLA quotes first; otherwise a top-to-bottom run fails with KeyError on
# the 'Open' and 'Low' columns.
df = pd.read_csv('TSLA.csv')
df[['Open', 'Low']].div(2)

Unnamed: 0,Open,Low
0,1.900000,1.754000
1,2.579000,2.330000
2,2.500000,2.027000
3,2.300000,1.871000
4,2.000000,1.583000
...,...,...
2808,355.339996,351.320007
2809,353.515015,352.000000
2810,354.154999,348.809998
2811,352.500000,351.049988


In [356]:
df['High'] = np.sin(df.loc[:, 'High'])
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,3.800000,-0.544021,3.508000,4.778000,4.778000,93831500
1,2010-06-30,5.158000,-0.387917,4.660000,4.766000,4.766000,85935500
2,2010-07-01,5.000000,-0.809454,4.054000,4.392000,4.392000,41094000
3,2010-07-02,4.600000,0.183728,3.742000,3.840000,3.840000,25699000
4,2010-07-06,4.000000,0.989358,3.166000,3.222000,3.222000,34334500
...,...,...,...,...,...,...,...
2808,2021-08-24,710.679993,-0.849631,702.640015,708.489990,708.489990,13083100
2809,2021-08-25,707.030029,0.980638,704.000000,711.200012,711.200012,12645600
2810,2021-08-26,708.309998,-0.980969,697.619995,701.159973,701.159973,13214300
2811,2021-08-27,705.000000,-0.544122,702.099976,711.919983,711.919983,13762100


In [357]:
df.loc[:, 'High']

0      -0.544021
1      -0.387917
2      -0.809454
3       0.183728
4       0.989358
          ...   
2808   -0.849631
2809    0.980638
2810   -0.980969
2811   -0.544122
2812   -0.916570
Name: High, Length: 2813, dtype: float64

In [355]:
%timeit df.loc[:, 'High'] * 2

240 µs ± 3.51 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [354]:
%timeit df['High'].apply(lambda x: x * 2)

1.37 ms ± 39 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [358]:
ser2 = pd.Series(['abs', 'bsa', 'asb'])
ser2

0    abs
1    bsa
2    asb
dtype: object

In [359]:
import re

In [369]:
# Remove every "b" from each string of the series; re.subn returns the new
# string together with the number of replacements made.
[re.subn("b", "", text) for text in ser2]

[('as', 1), ('sa', 1), ('as', 1)]