In [3]:
### Setup
# Import libraries
import pandas as pd
import numpy as np
# Load the data, keeping only the columns used by the examples and indexing
# the rows by ticker symbol.
# The CSV is parsed exactly once: a bare pd.read_csv(...) call in the middle
# of a cell is neither displayed nor stored, so it only costs a second parse.
sp500 = pd.read_csv(filepath_or_buffer = "../../source/sp500.csv",
                    sep = ',',
                    usecols = ['Symbol', 'GICS Sub-Industry', 'CIK', 'Founded'],
                    index_col = 'Symbol')
# Fix the global NumPy seed so the random sample below is reproducible.
np.random.seed(123)
# Ten standard-normal draws labelled 25..34.
numbers = pd.Series(np.random.normal(loc=0.0, scale=1.0, size=10),
                    index=np.arange(25, 35))
# Small labelled integer series used in the deletion examples.
Simpsons = pd.Series(data=[120, 60, 35, 30, 7],
                     index=['Homer', 'Marge', 'BArt', 'Lisa', 'Maggie'])

### Удаление

In [4]:
# Display the full series before anything is removed
Simpsons

Homer     120
Marge      60
BArt       35
Lisa       30
Maggie      7
dtype: int64

In [6]:
# Work on a copy so that the Simpsons series itself is left untouched
Simpson_copy = Simpsons.copy()
# del removes the entry with the given label from the copy in place
del Simpson_copy['Maggie']
Simpson_copy

Homer    120
Marge     60
BArt      35
Lisa      30
dtype: int64

In [7]:
# Preview the first rows of sp500
sp500.head()

Unnamed: 0_level_0,GICS Sub-Industry,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrial Conglomerates,66740,1902
AOS,Building Products,4343243243432434,1916
ABT,Health Care Equipment,1800,1888
ABBV,Pharmaceuticals,1551152,2013 (1888)
ACN,IT Consulting & Other Services,1467373,1989


In [9]:
# Copy the frame, then delete the CIK column from the copy in place
sp500_copy = sp500.copy()
del sp500_copy['CIK']

In [12]:
# Show the first 5 rows
sp500_copy.head(5)

Unnamed: 0_level_0,GICS Sub-Industry,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,Industrial Conglomerates,1902
AOS,Building Products,1916
ABT,Health Care Equipment,1888
ABBV,Pharmaceuticals,2013 (1888)
ACN,IT Consulting & Other Services,1989


### pop удаляет элемент (для DataFrame — столбец) из самого объекта и возвращает удалённую часть
### для Series применение pop идентично

In [13]:
# Start from a fresh copy of sp500 for the pop example
sp500_copy = sp500.copy()
sp500_copy.head(3)

Unnamed: 0_level_0,GICS Sub-Industry,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrial Conglomerates,66740,1902
AOS,Building Products,4343243243432434,1916
ABT,Health Care Equipment,1800,1888


In [14]:
# Remove the GICS Sub-Industry column from sp500_copy and get it back as a Series
popped_column = sp500_copy.pop('GICS Sub-Industry')

In [15]:
# sp500_copy no longer contains the popped column
sp500_copy.head(3)

Unnamed: 0_level_0,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,66740,1902
AOS,4343243243432434,1916
ABT,1800,1888


In [17]:
# The popped column itself, indexed by Symbol
popped_column.head(3)

Symbol
MMM    Industrial Conglomerates
AOS           Building Products
ABT       Health Care Equipment
Name: GICS Sub-Industry, dtype: object

### drop возвращает копию данных без удалённых строк или столбцов (исходный объект не изменяется)
### для Series применение идентично

In [19]:
# Start from a fresh copy of sp500 for the drop example
sp500_copy = sp500.copy()
sp500_copy.head(3)

Unnamed: 0_level_0,GICS Sub-Industry,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrial Conglomerates,66740,1902
AOS,Building Products,4343243243432434,1916
ABT,Health Care Equipment,1800,1888


 - следующая строка вернёт новый датафрейм без столбца GICS Sub-Industry
 - сам датафрейм sp500_copy при этом не изменится (т.е. объект, к которому применён drop, не затрагивается)

In [22]:
# columns=[...] is the keyword spelling of drop(labels, axis=1): it removes columns
sp500_copy_after_drop = sp500_copy.drop(columns=['GICS Sub-Industry'])
sp500_copy_after_drop.head(3)

Unnamed: 0_level_0,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,66740,1902
AOS,4343243243432434,1916
ABT,1800,1888


In [23]:
# Take an independent copy of the first 5 rows
sp500_part_copy = sp500.head(5).copy()
sp500_part_copy

Unnamed: 0_level_0,GICS Sub-Industry,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrial Conglomerates,66740,1902
AOS,Building Products,4343243243432434,1916
ABT,Health Care Equipment,1800,1888
ABBV,Pharmaceuticals,1551152,2013 (1888)
ACN,IT Consulting & Other Services,1467373,1989


In [24]:
# Drop the rows labelled ABT and ACN.
# axis=0 targets row labels (axis=1 would target columns).
# errors='ignore' keeps the cell re-runnable: the result is assigned back to
# sp500_part_copy, so on a second execution the labels are already gone and
# the default errors='raise' would fail with a KeyError.
sp500_part_copy = sp500_part_copy.drop(['ABT', 'ACN'], axis=0, errors='ignore')
sp500_part_copy.head(5)

Unnamed: 0_level_0,GICS Sub-Industry,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrial Conglomerates,66740,1902
AOS,Building Products,4343243243432434,1916
ABBV,Pharmaceuticals,1551152,2013 (1888)


In [26]:
# Filtering by condition
# Series
numbers

25   -1.085631
26    0.997345
27    0.282978
28   -1.506295
29   -0.578600
30    1.651437
31   -2.426679
32   -0.428913
33    1.265936
34   -0.866740
dtype: float64

In [27]:
# Which entries are greater than 0 and less than 1?
# The result holds one True/False value per entry.
logical_results = numbers.gt(0) & numbers.lt(1)
logical_results

25    False
26     True
27     True
28    False
29    False
30    False
31    False
32    False
33    False
34    False
dtype: bool

In [28]:
# The mask is itself a pandas Series (of booleans)
type(logical_results)

pandas.core.series.Series

In [29]:
# Keep only the entries whose mask value is True
numbers.loc[logical_results]

26    0.997345
27    0.282978
dtype: float64

In [32]:
# where keeps the index of the original Series:
# entries that fail the condition become NaN instead of being removed
numbers.where(numbers.gt(0) & numbers.lt(1))

25         NaN
26    0.997345
27    0.282978
28         NaN
29         NaN
30         NaN
31         NaN
32         NaN
33         NaN
34         NaN
dtype: float64

In [33]:
# other= supplies the replacement for entries that fail the condition (instead of NaN)
numbers.where(numbers.gt(0) & numbers.lt(1), other=-1)

25   -1.000000
26    0.997345
27    0.282978
28   -1.000000
29   -1.000000
30   -1.000000
31   -1.000000
32   -1.000000
33   -1.000000
34   -1.000000
dtype: float64

In [34]:
# Are all elements >= 0?
numbers.ge(0).all()

False

In [35]:
# Is there at least one element less than 2?
numbers.lt(2).any()

True

In [37]:
# How many values are < 1? (True counts as 1 when summed)
numbers.lt(1).sum()

8

In [38]:
# DataFrames: the same boolean filtering applied to a DataFrame

In [40]:
# Which rows have CIK < 100000?
sp500['CIK'].lt(100000)

Symbol
MMM      True
AOS     False
ABT      True
ABBV    False
ACN     False
        ...  
YUM     False
ZBRA    False
ZBH     False
ZION    False
ZTS     False
Name: CIK, Length: 503, dtype: bool

In [41]:
# Select the rows where CIK < 100000

In [42]:
# Boolean mask passed to .loc keeps only the rows where it is True
sp500.loc[sp500['CIK'] < 100000]

Unnamed: 0_level_0,GICS Sub-Industry,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrial Conglomerates,66740,1902
ABT,Health Care Equipment,1800,1888
ADM,Agricultural Products & Services,7084,1902
ADP,Human Resource & Employment Services,8670,1949
AFL,Life & Health Insurance,4977,1955
...,...,...,...
TFC,Regional Banks,92230,1872
USB,Diversified Banks,36104,1968
UDR,Residential REITs,74208,1972
WFC,Diversified Banks,72971,1852


In [43]:
# Keep only the rows where 5000 < CIK < 100000

In [44]:
# Rows whose CIK lies strictly between 5000 and 100000
r = sp500.loc[sp500['CIK'].gt(5000) & sp500['CIK'].lt(100000)]
r

Unnamed: 0_level_0,GICS Sub-Industry,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrial Conglomerates,66740,1902
ADM,Agricultural Products & Services,7084,1902
ADP,Human Resource & Employment Services,8670,1949
AAL,Passenger Airlines,6201,1934
AIG,Property & Casualty Insurance,5272,1919
...,...,...,...
TFC,Regional Banks,92230,1872
USB,Diversified Banks,36104,1968
UDR,Residential REITs,74208,1972
WFC,Diversified Banks,72971,1852


In [45]:
# Same row filter, but select the CIK column in the same .loc call
# instead of chaining a second [] lookup on the filtered frame
r = sp500.loc[(sp500['CIK'] > 5000) & (sp500['CIK'] < 100000), 'CIK']
r

Symbol
MMM    66740
ADM     7084
ADP     8670
AAL     6201
AIG     5272
       ...  
TFC    92230
USB    36104
UDR    74208
WFC    72971
XEL    72903
Name: CIK, Length: 116, dtype: int64

In [47]:
# Select the rows where GICS Sub-Industry equals 'Regional Banks' and CIK >= 1000

In [56]:
# Combine a string equality test and a numeric comparison into one row mask
r = sp500.loc[sp500['GICS Sub-Industry'].eq('Regional Banks') & sp500['CIK'].ge(1000)]
r

Unnamed: 0_level_0,GICS Sub-Industry,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CFG,Regional Banks,759944,1828
FITB,Regional Banks,35527,1858
HBAN,Regional Banks,49196,1866
KEY,Regional Banks,91576,1825
MTB,Regional Banks,36270,1856
PNC,Regional Banks,713676,1845
RF,Regional Banks,1281761,1971
TFC,Regional Banks,92230,1872
ZION,Regional Banks,109380,1873


In [57]:
# Filter the rows and pick the two columns of interest in a single .loc call
r = sp500.loc[
    (sp500['GICS Sub-Industry'] == 'Regional Banks') & (sp500['CIK'] >= 1000),
    ['GICS Sub-Industry', 'CIK'],
]
r

Unnamed: 0_level_0,GICS Sub-Industry,CIK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
CFG,Regional Banks,759944
FITB,Regional Banks,35527
HBAN,Regional Banks,49196
KEY,Regional Banks,91576
MTB,Regional Banks,36270
PNC,Regional Banks,713676
RF,Regional Banks,1281761
TFC,Regional Banks,92230
ZION,Regional Banks,109380


In [46]:
# The isin method

In [64]:
# True for the rows whose CIK is one of the listed values
s_tmp = sp500['CIK'].isin([91576])
s_tmp

Symbol
MMM     False
AOS     False
ABT     False
ABBV    False
ACN     False
        ...  
YUM     False
ZBRA    False
ZBH     False
ZION    False
ZTS     False
Name: CIK, Length: 503, dtype: bool

In [65]:
# Use the isin mask to pull the matching rows
sp500.loc[s_tmp].head()

Unnamed: 0_level_0,GICS Sub-Industry,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KEY,Regional Banks,91576,1825


In [66]:
# Using the .query method

In [67]:
# Boolean-mask version of the filter
# NOTE(review): presumably repeated here as the baseline to compare with .query() below — confirm
r = sp500.loc[
    (sp500['GICS Sub-Industry'] == 'Regional Banks') & (sp500['CIK'] >= 1000),
    ['GICS Sub-Industry', 'CIK'],
]
r

Unnamed: 0_level_0,GICS Sub-Industry,CIK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
CFG,Regional Banks,759944
FITB,Regional Banks,35527
HBAN,Regional Banks,49196
KEY,Regional Banks,91576
MTB,Regional Banks,36270
PNC,Regional Banks,713676
RF,Regional Banks,1281761
TFC,Regional Banks,92230
ZION,Regional Banks,109380


In [81]:
# query takes the filter as a string; Symbol here is the name of the index,
# which the expression can reference just like a column
q = sp500.query("CIK>=1000 & Symbol=='KEY'").loc[:, ['CIK', 'Founded']]
q

Unnamed: 0_level_0,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
KEY,91576,1825
