In [2]:
import pandas as pd
import numpy as np

# Series

In [3]:
a = pd.Series([1,2,3,4, 0.5])
b = pd.Series([1, 1, 2, 3], dtype=float)
c = pd.Series({'a':1, 'b':2, 'd':9})
# a = a.astype(np.int64)
a, b, c

(0    1.0
 1    2.0
 2    3.0
 3    4.0
 4    0.5
 dtype: float64,
 0    1.0
 1    1.0
 2    2.0
 3    3.0
 dtype: float64,
 a    1
 b    2
 d    9
 dtype: int64)

In [8]:
def get_type(ingredients: int) -> str:
    def pr(num):
        if num <= 5:
            return "easy"
        if 5 < num <= 15:
            return "medium"
        else:
            return "hard"
    return ingredients.map(pr)

In [9]:
a

0    1.0
1    2.0
2    3.0
3    4.0
4    0.5
dtype: float64

In [10]:
get_type(a)

0    easy
1    easy
2    easy
3    easy
4    easy
dtype: object

In [9]:
a.values

array([1. , 2. , 3. , 4. , 0.5])

In [11]:
c.index

Index(['a', 'b', 'd'], dtype='object')

# Пропуски

In [22]:
s = pd.Series([1,2,2,np.nan,9,9,np.nan, 10])
s.isna().sum() # кол-во пропусков #.isnull(), .notnull()

2

In [24]:
s[s.isnull()] = 100
s

0      1.0
1      2.0
2      2.0
3    100.0
4      9.0
5      9.0
6    100.0
7     10.0
dtype: float64

# Описательная статистика 

In [26]:
s.describe()

count      8.000000
mean      29.125000
std       43.890896
min        1.000000
25%        2.000000
50%        9.000000
75%       32.500000
max      100.000000
dtype: float64

# Доступ к данным

In [29]:
s[:1]

0    1.0
dtype: float64

In [30]:
s[:1]

0    1.0
dtype: float64

In [31]:
s[::2]

0      1.0
2      2.0
4      9.0
6    100.0
dtype: float64

In [32]:
s[s > 3]

3    100.0
4      9.0
5      9.0
6    100.0
7     10.0
dtype: float64

In [35]:
s.head(5)

0      1.0
1      2.0
2      2.0
3    100.0
4      9.0
dtype: float64

In [37]:
s.tail(3)

5      9.0
6    100.0
7     10.0
dtype: float64

In [41]:
s.nunique(), s.unique()

(5, array([  1.,   2., 100.,   9.,  10.]))

In [42]:
s.value_counts()

2.0      2
100.0    2
9.0      2
1.0      1
10.0     1
dtype: int64

In [43]:
s.isin([2,3,4])

0    False
1     True
2     True
3    False
4    False
5    False
6    False
7    False
dtype: bool

In [44]:
np.exp(s)

0    2.718282e+00
1    7.389056e+00
2    7.389056e+00
3    2.688117e+43
4    8.103084e+03
5    8.103084e+03
6    2.688117e+43
7    2.202647e+04
dtype: float64

# Добавление и удаление данных в серии

In [47]:
f = pd.Series({'1st':1, '2nd':8, '3th':7})
g = pd.concat([s,f], axis=0) #axis=1 добавление стобца
g

0        1.0
1        2.0
2        2.0
3      100.0
4        9.0
5        9.0
6      100.0
7       10.0
1st      1.0
2nd      8.0
3th      7.0
dtype: float64

In [49]:
h = g.drop([0,4,'1st'])
h

1        2.0
2        2.0
3      100.0
5        9.0
6      100.0
7       10.0
2nd      8.0
3th      7.0
dtype: float64

# DataFrame

In [2]:
dic = {
    'col1':[1,2,3,4],
    'col2':['a','b','c','d']
}
df1 = pd.DataFrame(dic)
df1

Unnamed: 0,col1,col2
0,1,a
1,2,b
2,3,c
3,4,d


In [56]:
df1.shape

(4, 2)

In [57]:
df1.columns

Index(['col1', 'col2'], dtype='object')

In [58]:
df1.describe()

Unnamed: 0,col1
count,4.0
mean,2.5
std,1.290994
min,1.0
25%,1.75
50%,2.5
75%,3.25
max,4.0


# Выбор данных

In [59]:
df1['col1']

0    1
1    2
2    3
3    4
Name: col1, dtype: int64

In [60]:
df1.loc[1]

col1    2
col2    b
Name: 1, dtype: object

In [66]:
df1.loc[[0,1,3], 'col1']

0    1
1    2
3    4
Name: col1, dtype: int64

In [75]:
df1.loc[1:3, 'col1']

1    2
2    3
3    4
Name: col1, dtype: int64

In [78]:
df1.loc[df1['col1']>2]

Unnamed: 0,col1,col2
2,3,c
3,4,d


In [79]:
df1[df1['col1'].between(1,3)]

Unnamed: 0,col1,col2
0,1,a
1,2,b
2,3,c


# Пропуски

In [5]:
df2 = pd.DataFrame({
    'col1':[np.nan,2,3,np.nan],
    'col2':['b', np.nan, 'b', np.nan],
    'col3':['ldld', 'dkdk','sgg','f']
})
df2

Unnamed: 0,col1,col2,col3
0,,b,ldld
1,2.0,,dkdk
2,3.0,b,sgg
3,,,f


In [88]:
# .isna()

In [89]:
df2.isna().any()

col1     True
col2     True
col3    False
dtype: bool

In [7]:
df2.dropna(subset=['col2'])

Unnamed: 0,col1,col2,col3
0,,b,ldld
2,3.0,b,sgg


In [90]:
df2.isna().sum()

col1    1
col2    2
col3    0
dtype: int64

In [92]:
df2['col1'] = df2['col1'].fillna(value=df2['col1'].mean())
df2

Unnamed: 0,col1,col2,col3
0,1.0,b,ldld
1,2.0,,dkdk
2,3.0,b,sgg
3,2.0,,aa


In [93]:
df2['col2'].value_counts()

b    2
Name: col2, dtype: int64

In [96]:
df2['col2'] = df2['col2'].fillna('b')
df2

Unnamed: 0,col1,col2,col3
0,1.0,b,ldld
1,2.0,b,dkdk
2,3.0,b,sgg
3,2.0,b,aa


In [98]:
df2['col1'].replace(1, 7)

0    7.0
1    2.0
2    3.0
3    2.0
Name: col1, dtype: float64

In [13]:
df2.rename(columns={'col1':'numbers'})

Unnamed: 0,numbers,col2,col3
0,1.0,b,ldld
1,2.0,,dkdk
2,3.0,b,sgg
3,,,aa


In [14]:
df2

Unnamed: 0,col1,col2,col3
0,1.0,b,ldld
1,2.0,,dkdk
2,3.0,b,sgg
3,,,aa


In [120]:
df3 = pd.DataFrame({
    'col1':[1,2,3,np.nan],
    'col2':['b', np.nan, 'b', np.nan],
    'col3':['ldld', 'dkdk','sgg','aa']
})
df3

Unnamed: 0,col1,col2,col3
0,1.0,b,ldld
1,2.0,,dkdk
2,3.0,b,sgg
3,,,aa


In [121]:
df3 = df3.drop('col2', axis=1)
df3

Unnamed: 0,col1,col3
0,1.0,ldld
1,2.0,dkdk
2,3.0,sgg
3,,aa


In [122]:
df3 = df3[df3.index!=1]
df3

Unnamed: 0,col1,col3
0,1.0,ldld
2,3.0,sgg
3,,aa


In [123]:
df3.insert(1,'col4',[2,3,4])
df3

Unnamed: 0,col1,col4,col3
0,1.0,2,ldld
2,3.0,3,sgg
3,,4,aa


# Объединение по ключевому полю

In [18]:
dfa = pd.DataFrame({
    'col1':[1,3,4,6,7],
    'col2':['a','v','d','d','p'],
    'col3':['ss','csv','json', 'xml', 'doc']
})
dfb = pd.DataFrame({
    'col1':[2,3,4,6,8],
    'col4':['a','v','d','d','p']
    #'col3':['dima','katya','sonya', 'valya', 'liza']
})

In [16]:
pd.merge(dfa, dfb, on='col1')

Unnamed: 0,col1,col2_x,col3,col2_y
0,3,v,csv,v
1,4,d,json,d
2,6,d,xml,d


In [23]:
pd.concat([dfa, dfb]).rename(columns={'col4':'col3'}).isna().sum()

col1    0
col2    5
col3    5
col3    5
dtype: int64

# Дата

In [140]:
import datetime as dt

In [143]:
df = pd.read_csv('insurance_miptstats.csv', parse_dates=[0])
df.head()

Unnamed: 0,birthday,sex,bmi,children,smoker,region,charges
0,2001-12-20,female,27.9,0,yes,southwest,16884.924
1,2003-03-18,male,33.77,1,no,southeast,1725.5523
2,1992-11-02,male,33.0,3,no,southeast,4449.462
3,1987-07-27,male,22.705,0,no,northwest,21984.47061
4,1988-11-04,male,28.88,0,no,northwest,3866.8552


In [137]:
# df.to_csv(path, sep, header, index, ...)
# df.to_excel(...)

In [144]:
df['age'] = 2023 - df['birthday'].dt.year

In [12]:
dic = pd.DataFrame({
    'col1':[1,2,3,4],
    'col2':['a fk dd','b ff','c dd','d fk fk']
})

In [13]:
dic

Unnamed: 0,col1,col2
0,1,a fk dd
1,2,b ff
2,3,c dd
3,4,d fk fk


In [18]:
dic['col2'].str.split()

0    [a, fk, dd]
1        [b, ff]
2        [c, dd]
3    [d, fk, fk]
Name: col2, dtype: object

In [26]:
''.join(dic['col2']).split()

['a', 'fk', 'ddb', 'ffc', 'ddd', 'fk', 'fk']