# Anaconda 
- 1. Перейти на сайт Anaconda: https://www.anaconda.com/download/
- 2. Выбрать операционную систему, которая стоит у вас на компьютере (Windows, Linux или MacOS)
- 3. Скачать соответствующий разрядности Вашего процессора (32 бит или 64 бит) вариант Anaconda
- 4. Для установки библиотеки jupyter: conda install jupyter
- 5. Запустить ноутбук: jupyter notebook

- Создать новое окружение:
`conda create --name test_env python=3.6`
- Посмотреть уже созданные окружения:
`conda info --envs`
- Активировать окружение:
`conda activate test_env`
- Выйти из окружения в изначальное (базовое):
`conda activate`
- Установить новый пакет:
`conda install beautifulsoup4`
- Посмотреть все установленные пакеты:
`conda list`
- Обновить пакет:
`conda update beautifulsoup4`


# Jupyter Notebook, JupyterLab
https://jupyter-notebook.readthedocs.io/en/latest/

https://jupyterlab.readthedocs.io/en/stable/

# Numpy

https://numpy.org/doc/stable/

## Создание массива

In [1]:
import numpy as np

In [2]:
np.__version__

'1.18.5'

In [3]:
python_list = [0, 1, 2, 3]

In [4]:
# Так создавать numpy-массив нельзя
wrong_numpy_array = np.array(0, 1, 2, 3)

ValueError: only 2 non-keyword arguments accepted

* из списков или кортежей Python

In [None]:
a = np.array([1,2,3,4])
a

In [None]:
type(a)

In [None]:
python_list = [0, 1, 2, 3]
a = np.array(python_list)
a

In [None]:
python_tuple = (0, 1, 2, 3)
a = np.array(python_tuple)
a

* многомерные массивы

In [None]:
b = np.array([[0,1,2],[3,4,5]])
b

In [None]:
a.ndim

In [None]:
b.ndim

In [None]:
a.shape

In [None]:
b.shape

* нулевой массив

In [None]:
a = np.zeros((1,10))
a

In [None]:
a = np.zeros((4,3))
a

* единичный массив

In [None]:
a = np.ones((1,10))
a

In [None]:
a = np.ones((4,3))
a

* пустой (неинициализированный) массив: значения не случайные, а произвольные — то, что оказалось в выделенной памяти

In [None]:
a = np.empty((5,4))
a

* массив такой же формы

In [None]:
b = np.zeros_like(a)
b

* массив из последовательности

In [None]:
a = np.arange(0,10,2)
a

In [None]:
a = np.arange(0,10,0.5)
a

In [None]:
a = np.linspace(0,99,num=15)
a

In [None]:
a = np.logspace(1,3,10,base=2)
a

#### сравним скорость вычислений типов list и numpy.ndarray

In [None]:
%timeit [i**2 for i in range(1000)]

In [None]:
%timeit np.arange(1000)**2

## Типы данных

https://numpy.org/doc/stable/user/basics.types.html

* тип данных по умолчанию

In [None]:
a = np.array([0, 1, 2, 3])

In [None]:
a.dtype

In [None]:
a = np.array([1.3, 0.5, 4])
a.dtype

In [None]:
a = np.array([1+2j, 3+4j, 5+6*1j])
a.dtype

In [None]:
a = np.array([True, False, False, True])
a.dtype

In [None]:
a = np.array(['dgf', 0.5, 4])
a.dtype

In [None]:
a = np.array(['Bonjo12345werwer67', 'Hello', 'Hallo',])
a.dtype

In [None]:
a1 = 12313241545646848648645645645645412786873132156465456465464

In [None]:
a = np.array(a1)
a.dtype

* явное указание типа данных

In [None]:
a = np.array([0, 1, 2, 3], dtype=np.int8)
a.dtype

In [None]:
# Use the builtin `bool` as the dtype: the `np.bool` alias is deprecated since
# NumPy 1.20 and missing in 1.24-1.26, while `bool` works on every version.
# Non-zero integers become True, zero becomes False.
a = np.array([0, 1, 2, 3], dtype=bool)
a

In [None]:
a = np.array([1.3, 0.5, 4], dtype=np.int8)
a

In [None]:
a = np.array(['dgf', 0.5, 4], dtype=np.int8)

* None и np.nan

In [None]:
none = None

In [None]:
none is None

In [None]:
None is None # Замена для `id(None) == id(None)`

In [None]:
None == None, None is None

In [None]:
np.nan == np.nan, np.nan is np.nan

## Индексирование

* одномерный массив

In [None]:
a = np.arange(10)
a

In [None]:
a[0]

In [None]:
a[5]

In [None]:
a[9]

In [None]:
# [start:end:step]
a[1:6]

In [None]:
a[1:6:2]

In [None]:
a[:6]

In [None]:
a[6:]

In [None]:
a[:]

In [None]:
a[-1]

In [None]:
a[-2]

In [None]:
a[-10]

In [None]:
a[::-1]

In [None]:
a[::-2]

In [None]:
a[-1:-5:-1]

* многомерный массив

In [None]:
a = np.array([[0,1,2], [3,4,5], [6,7,8], [9,10,11]])
a

In [None]:
a.shape

In [None]:
a[2,1]

In [None]:
a[2]

In [5]:
a[:,1]

NameError: name 'a' is not defined

In [None]:
a[1:, 2]

In [None]:
a[1:3,1:3]

In [None]:
a[::-1]

## Изменение размерности

In [None]:
a = np.array([[0,1,2], [3,4,5], [6,7,8], [9,10,11]])
a

In [None]:
a.shape

In [None]:
b = a.flatten()
b

In [None]:
a

In [None]:
a = np.array([[0,1,2], [3,4,5], [6,7,8], [9,10,11]])
a

In [None]:
a.reshape((6,2))

In [None]:
a.reshape((3,4))

In [None]:
a

In [None]:
a.resize((3,4))
a

In [None]:
a.reshape(-1,6)

In [None]:
a

In [None]:
a.T

In [None]:
a

## Рандом

In [None]:
# uniform in [0, 5)
a = 5 * np.random.rand(4, 4)
a

In [None]:
# Gaussian
b =  5 * np.random.randn(4,4) + 1 
b  

* фиксация рандома

In [None]:
np.random.seed(123)

In [None]:
np.random.randn(3)

In [None]:
np.random.randn(4)

In [None]:
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence

In [None]:
rs = RandomState(MT19937(SeedSequence(123456789)))

In [None]:
rs.randn(3)

In [None]:
rs.randn(4)

## Выбор по условию (маски)

In [None]:
np.random.seed(3)
a = np.random.randint(0, 20, 15)
a

In [None]:
a[a > 10]

In [None]:
mask = a>10
mask

In [None]:
a[mask]

In [None]:
mask = (a>5) & (a<10)
a[mask]

In [None]:
mask = (a % 3 == 0)
print(mask)
extract_from_a = a[mask]
extract_from_a

In [None]:
a[a % 3 == 0] = -1
a

## Математические операции

* одномерные массивы

In [None]:
a = np.array([10,20,30,40])
b = np.array([1,2,3,4])

In [None]:
a

In [None]:
b

In [6]:
a + 5

NameError: name 'a' is not defined

In [None]:
a - b

In [None]:
a + b

In [None]:
a * b

In [None]:
a / b

In [None]:
a ** b

* многомерные массивы

In [None]:
A = np.array([[10,20],[30,40]])
B = np.array([[1,2],[3,4]])

In [None]:
A

In [None]:
B

In [None]:
A - B

In [None]:
A + B

In [None]:
A * B

In [None]:
A / B

In [None]:
A ** B

In [None]:
A.dot(B)

* использование функций

In [None]:
a

In [None]:
a.sum(), np.sum(a)

In [None]:
a.prod(), np.prod(a)

In [None]:
np.sqrt(a)

In [None]:
np.log(a)

In [None]:
a.min(), a.max()

In [None]:
a.mean(), np.mean(a)

In [None]:
np.median(a)

In [None]:
a.std(), np.std(a)

In [None]:
a.argmin(), a.argmax()

## Другие полезные функции

In [None]:
a

* проверка вхождения в массив

In [None]:
2 in a

In [None]:
20 in a

* преобразование в список

In [7]:
a.tolist()

NameError: name 'a' is not defined

In [None]:
%config IPCompleter.greedy=True


* сортировка массива

In [None]:
a = np.array([40,10,20,5])
a

In [None]:
a.sort()

In [None]:
a

* заполнение массива одинаковым значением

In [None]:
a.fill(7)

In [None]:
a

## Копирование

In [None]:
a = np.array([[ 0,  1,  2,  3],
              [ 4,  5,  6,  7],
              [ 8,  9, 10, 11]])
a

In [None]:
b = a
b

In [None]:
a[1,1] = 555

In [None]:
a

In [None]:
b

In [None]:
a is b

In [None]:
id(a), id(b)

In [None]:
b = a.copy()

In [None]:
a is b

In [None]:
id(a), id(b)

In [None]:
a

In [8]:
b

NameError: name 'b' is not defined

In [None]:
a[1,1] = 5
a

In [None]:
b

# Pandas

https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html

https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html#min

In [None]:
import pandas as pd

In [None]:
pd.__version__

## Series

![](https://pandas.pydata.org/docs/_images/01_table_series.svg)

### Создание

* из списка

In [None]:
salaries = pd.Series(data=[400, 300, 200, 250, 100]) 
salaries

In [None]:
salaries = pd.Series(data=[400, 300, 200, 250, 100], 
                     index=['Max', 'Ann', 'Charles', 'John', 'Mike']) 
salaries                                                               

In [None]:
salaries.index

In [None]:
salaries.values

* из numpy-массива

In [None]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

* из словаря

In [None]:
d = {'b': 1, 'a': 0, 'c': 2}

In [None]:
s = pd.Series(d)
s

* из скалярного значения

In [9]:
s =  pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])
s

NameError: name 'pd' is not defined

### Доступ к элементам и операции над ними

In [None]:
salaries

In [None]:
# Positional access: on a Series with string labels, `salaries[0]` relies on a
# deprecated integer-key fallback, so use `.iloc` for "element number 0".
salaries.iloc[0]

In [None]:
# Positional access via `.iloc`; a bare integer key on a label-indexed Series
# is a deprecated fallback to positional lookup.
salaries.iloc[3]

In [None]:
salaries['Max']

In [10]:
salaries['John']

NameError: name 'salaries' is not defined

In [None]:
salaries[2:]

In [None]:
salaries[2:].to_numpy()

In [None]:
salaries[2:].values

In [None]:
salaries['Ann'] = 350

In [None]:
salaries

In [None]:
salaries[['Max', 'John']]

In [None]:
salaries['Ivan']

In [None]:
val = salaries.get('Ivan')
val

In [None]:
val is None

In [None]:
val = salaries.get('Ivan', default='undefined')
val

### Атрибуты 

* тип данных

In [11]:
salaries

NameError: name 'salaries' is not defined

In [None]:
salaries.dtype

In [None]:
salaries.astype('int32')

In [None]:
salaries = salaries.astype('int32')
salaries

* имя

In [None]:
salaries.name = 'salaries'

In [None]:
salaries

* размер

In [None]:
salaries.shape

In [None]:
len(salaries)

### Проверка на пропущенные значения

In [None]:
np.isnan(np.nan), np.isnan(pd.NA)

In [None]:
np.isnan(None)

In [None]:
pd.isna(None), pd.isna(np.nan), pd.isna(pd.NA)

In [None]:
pd.NA

In [None]:
salaries

In [None]:
salaries['Ivan'] = None

In [None]:
salaries

In [None]:
salaries.isna()

In [None]:
mask = salaries.isna()
salaries[mask]

## DataFrame

![](https://pandas.pydata.org/docs/_images/01_table_dataframe1.svg)

### Создание

* явное указание параметров

In [12]:
df = pd.DataFrame(data=np.random.randn(5, 3), 
                   index=['o1', 'o2', 'o3', 'o4', 'o5'], 
                   columns=['f1', 'f2', 'f3'])
df

NameError: name 'pd' is not defined

* из словаря

In [None]:
d = {'A': np.random.random(5), 
     'B': [1,2,3,4,5], 
     'C': np.arange(5) > 2}
df2 = pd.DataFrame(d)
df2

* из списка словарей

In [None]:
df3 = pd.DataFrame([{'A': 1, 'B': 2}, {'A': 2, 'C': 3}])
df3

* из Series

In [None]:
df4 = pd.DataFrame(salaries)
df4

### Доступ к элементам

![title](indexing.png)

In [None]:
df

In [None]:
df['f2']

In [None]:
df[['f2', 'f3']]

In [None]:
df[1:4]

In [None]:
df[1:]

In [None]:
df.loc['o2']

In [None]:
df.loc[['o2', 'o3']]

In [None]:
df.loc[['o2', 'o3'],'f2']

In [None]:
df.iloc[1,1:]

In [None]:
df.iloc[1:3,1:]

### Изменение, удаление, вставка новых значений

In [None]:
df

In [None]:
df.loc['o3', 'f2']

In [None]:
df.loc['o3', 'f2'] = 100

In [None]:
df

In [None]:
df.loc['o2'] = [1,2,3]

In [None]:
df

In [None]:
df.loc['o6'] = [4,5,6]
df

In [13]:
df['f4'] = df['f1'] * df['f2']
df

NameError: name 'df' is not defined

In [None]:
df.drop('f2', axis=1)

In [None]:
df

In [None]:
df.drop('f4', axis=1, inplace=True)

In [None]:
df

### Объединение датафреймов

In [None]:
df2.columns = ['f1', 'f2', 'f3']
df2

In [None]:
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0.
# `pd.concat` stacks the rows of both frames and, like `append`, keeps each
# frame's original index labels.
df3 = pd.concat([df, df2])
df3

In [None]:
pd.concat([df, df2], axis=0)

In [None]:
df4 = df.copy()

In [None]:
df4

In [None]:
df4['f4'] = df4['f2']**2
df4

In [None]:
df4 + 10

In [None]:
df

In [None]:
df4

In [None]:
df.join(df4['f4'])

Merge, join, concatenate:
https://pandas.pydata.org/docs/user_guide/merging.html

## Пропущенные значения

In [None]:
df3.loc['o4','f2'] = None
df3.loc[3,'f1'] = None
df3

In [None]:
df3.isna()

In [None]:
df3.dropna()

In [None]:
df3.mean()

In [None]:
df3

In [None]:
df3.fillna(df3.mean())

## Сохранение

In [None]:
df3.to_csv('dataframe.csv')

## Первичный анализ

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/Yorko/mlcourse.ai/master/data/beauty.csv', sep=';')

In [None]:
df.head()

In [None]:
df.tail(3)

In [None]:
df.info()

In [None]:
print(df.shape)
print(df.index.values)

In [None]:
df['wage'].to_numpy()

In [None]:
df.describe()

In [None]:
df.sort_values(by='union', ascending=False).head()

In [None]:
df.sort_values(by=['female', 'wage'], ascending=[True, False]).head()

In [None]:
df['goodhlth'].mean()

In [14]:
df[df['female'] == 1].head()

NameError: name 'df' is not defined

In [None]:
df[(df['goodhlth'] == 1) | (df['female'] == 1)].median()

## Группировка

* зависимость зарплаты от пола

In [None]:
df.groupby('female')['wage'].mean()

* зависимость зарплаты от пола и семейного статуса

In [None]:
df.groupby(['female', 'married'])['wage'].mean()

In [None]:
df.groupby(['female', 'married'])['wage'].mean().unstack()

In [None]:
pvt = df.pivot_table(index=['female'], columns=['married'], values='wage', aggfunc='mean')
pvt

## Построение графиков

In [None]:
df['wage'].hist()

In [None]:
pvt.plot.bar()

## Применение функций

In [None]:
df.head()

* Apply

In [None]:
df.apply(np.max)

In [None]:
df.apply(lambda x: x.max() - x.min())

In [None]:
df['female'].apply(lambda x: 'Male' if x == 0 else 'Female')

* Map

In [None]:
d = {0:'Male', 1:'Female'}
df['female'].map(d)

In [None]:
df['female'].map(lambda x: 'Male' if x == 0 else 'Female')

# sklearn

https://scikit-learn.org/stable/user_guide.html

In [None]:
from sklearn.linear_model import Ridge

In [None]:
model = Ridge(random_state=42)

In [None]:
x = df.drop('looks', axis=1)
y = df['looks']

In [None]:
model.fit(x, y)

In [None]:
# Round to the nearest integer before casting: a bare `astype(int)` truncates
# toward zero (3.9 -> 3), which biases the predicted labels downward.
# NOTE(review): assumes `looks` holds integer labels - confirm against the data.
preds = model.predict(x).round().astype(int)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(y, preds)

In [None]:
y.value_counts()

In [None]:
pd.Series(preds).value_counts()

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
dists = euclidean_distances(x)

In [None]:
# Print the mean and maximum pairwise Euclidean distance within each `looks`
# group (values 1..5). A dedicated loop variable is used because reusing `x`
# would overwrite the feature matrix `x` assigned earlier in the notebook.
for looks_value in range(1, 6):
    dists = euclidean_distances(df[df.looks == looks_value])
    print(looks_value, dists.mean(), dists.max())

In [None]:
euclidean_distances(df[(df.looks == 2) | (df.looks == 5)]).mean()

# SciPy

https://docs.scipy.org/doc/scipy/reference/

In [15]:
from scipy import stats

In [16]:
pearson_coef, p_value = stats.pearsonr(df['wage'], df['looks'])
print(f'PearsonR = {pearson_coef:.3f}; P_value = {p_value:.3f}')

NameError: name 'df' is not defined

In [None]:
import scipy.linalg as linalg

In [None]:
# Solve the linear system a @ v = b for the unknowns v = (x, y, z):
#   3x + 2y      =  2
#    x -  y      =  4
#        5y + z  = -1
# Each row of `a` holds the coefficients of one equation.
a = np.array([[3, 2, 0],
              [1, -1, 0],
              [0, 5, 1]])
b = np.array([2, 4, -1])
x = linalg.solve(a, b)
# Verify with a tolerance: the solver works in floating point, so an exact
# element-wise `==` can report False for a correct solution.
print(f"x={x}, check: {np.allclose(np.dot(a, x), b)}")