### Очистка и подготовка данных


In [None]:
# Raw-file URL of the Titanic CSV; the middle segment is a percent-encoded directory name
url = (
    "https://raw.githubusercontent.com/dm-fedorov/pandas_basic/master/"
    "%D0%B1%D1%8B%D1%81%D1%82%D1%80%D0%BE%D0%B5%20%D0%B2%D0%B2%D0%B5%D0%B4%D0%B5%D0%BD%D0%B8%D0%B5%20%D0%B2%20pandas"
    "/data/titanic.csv"
)

In [None]:
import pandas as pd
# Read the CSV located at `url` into a DataFrame
df = pd.read_csv(url)
# Preview the first three rows
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [None]:
# Summary of the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Drop the columns that carry no useful information for this analysis.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps the cell safe to re-run: a second execution is a no-op rather than
# a KeyError for the already-removed columns.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'],
             errors='ignore')

In [None]:
# Preview the first three rows
df.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S


In [None]:
# Round the ticket fare to two decimal places
df['Fare'] = df['Fare'].round(2)

In [None]:
# Preview the first three rows
df.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.28,C85,C
2,1,3,female,26.0,0,0,7.92,,S


In [None]:
# Number of missing values in each column
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [None]:
# способ удаления данных можно настраивать, например с помощью параметра thresh=2,
# который оставит только строки, содержащие не менее 2 непустых значений
# df.dropna()

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html

In [None]:
#Способ 1: Заменить на константу

#df['Age'].fillna(25)

In [None]:
# Method 2: fill missing ages with the arithmetic mean of the column
# (the filled Series is only displayed; it is not assigned back to df)
df['Age'].fillna(df['Age'].mean())

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [None]:
# Method 3: arithmetic means that depend on the passenger class (Pclass);
# shown here for first-class passengers only
df.loc[df['Pclass'] == 1, 'Age'].mean()

38.233440860215055

In [None]:
def fill_age(row, class_means=None):
    """Return the row's age, substituting the mean age of its class when missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the 'Age' and 'Pclass' entries.
    class_means : dict or pandas.Series, optional
        Precomputed mean age keyed by Pclass value. When omitted, the mean is
        computed from the notebook-level ``df`` for the row's own class.

    Returns
    -------
    The original age when it is present; otherwise the mean age of the row's
    class. When no mean is available for that class the result stays missing.
    """
    age = row['Age']
    if not pd.isnull(age):
        return age

    pclass = row['Pclass']
    if class_means is not None:
        # .get exists on both dict and Series; fall back to the missing age
        return class_means.get(pclass, age)

    # A single lookup for whatever class the row belongs to, instead of one
    # hard-coded branch per class value.
    return df.loc[df['Pclass'] == pclass, 'Age'].mean()

In [None]:
# Run fill_age on every row (axis=1 is the numeric spelling of "columns")
df.apply(fill_age, axis=1)

0      22.00000
1      38.00000
2      26.00000
3      35.00000
4      35.00000
         ...   
886    27.00000
887    19.00000
888    25.14062
889    26.00000
890    32.00000
Length: 891, dtype: float64

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html

# https://stackoverflow.com/questions/19966018/pandas-filling-missing-values-by-mean-in-each-group

In [None]:
# Fill missing ages with the mean age of the passenger's class.
# transform (unlike apply) returns a result indexed like the original frame
# on every pandas version; groupby.apply prepends the group key to the index
# on pandas >= 2.0, which breaks alignment with df.
df.groupby('Pclass')['Age'].transform(lambda x: x.fillna(x.mean()))

0      22.00000
1      38.00000
2      26.00000
3      35.00000
4      35.00000
         ...   
886    27.00000
887    19.00000
888    25.14062
889    26.00000
890    32.00000
Name: Age, Length: 891, dtype: float64

In [None]:
# Cross-check: the row-wise fill_age result must equal the group-wise fill.
# transform keeps the original index, so the comparison is meaningful on any
# pandas version (groupby.apply would yield a MultiIndex on pandas >= 2.0).
(df.apply(fill_age, axis=1)).equals(
    df.groupby('Pclass')['Age'].transform(lambda x: x.fillna(x.mean()))
)

True

### Создаем новый столбец с информацией о том, был ли пассажир на борту один или с родственниками
Столбец должен содержать значение "alone", если пассажир был на борту один (без супруга/супруги, братьев, сестер, детей и родителей), и значение "not_alone", если пассажир путешествовал с кем-то из родственников.

- `SibSp` - Количество братьев и сестер / супругов на борту
- `Parch` - число родителей / детей на борту

In [None]:
# Method 1: a named function used together with apply
def alone_check(row):
    """Return 'not_alone' when the row has a positive SibSp or Parch, else 'alone'."""
    has_relatives = row['SibSp'] > 0 or row['Parch'] > 0
    return 'not_alone' if has_relatives else 'alone'

# Label every row with alone_check and store the labels in the 'Alone' column
df['Alone'] = df.apply(alone_check, axis="columns")
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Alone
0,0,3,male,22.0,1,0,7.25,,S,not_alone
1,1,1,female,38.0,1,0,71.28,C85,C,not_alone
2,1,3,female,26.0,0,0,7.92,,S,alone
3,1,1,female,35.0,1,0,53.10,C123,S,not_alone
4,0,3,male,35.0,0,0,8.05,,S,alone
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.00,,S,alone
887,1,1,female,19.0,0,0,30.00,B42,S,alone
888,0,3,female,,1,2,23.45,,S,not_alone
889,1,1,male,26.0,0,0,30.00,C148,C,alone


In [None]:
# Method 2: the same rule written as a lambda.
# Both counters are compared with 0 explicitly, so the condition matches
# alone_check exactly instead of relying on the truthiness of SibSp
# (which would treat NaN or negative values as "has relatives").
df['Alone'] = df.apply(
    lambda x: 'not_alone' if x['SibSp'] > 0 or x['Parch'] > 0 else 'alone',
    axis=1,
)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Alone
0,0,3,male,22.0,1,0,7.25,,S,not_alone
1,1,1,female,38.0,1,0,71.28,C85,C,not_alone
2,1,3,female,26.0,0,0,7.92,,S,alone
3,1,1,female,35.0,1,0,53.10,C123,S,not_alone
4,0,3,male,35.0,0,0,8.05,,S,alone
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.00,,S,alone
887,1,1,female,19.0,0,0,30.00,B42,S,alone
888,0,3,female,,1,2,23.45,,S,not_alone
889,1,1,male,26.0,0,0,30.00,C148,C,alone
