In [29]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

#### Подготовка данных

In [44]:
def load_titanic(
    train_path=r'C:\Users\o1lorvek\Downloads\train.csv',
    test_path=r'C:\Users\o1lorvek\Downloads\test.csv',
):
    """Read the Titanic train and test CSV files into one combined DataFrame.

    Args:
        train_path: Location of the training CSV (anything ``pd.read_csv``
            accepts). Defaults to the path that used to be hard-coded here.
        test_path: Location of the test CSV. Defaults likewise.

    Returns:
        A DataFrame holding the train rows followed by the test rows on a
        fresh 0..n-1 index. ``Survived`` is missing for every test row, and
        the ``Survived``, ``Pclass`` and ``Sex`` columns are converted to the
        ``category`` dtype.
    """
    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)

    # Give the test rows an explicit, empty Survived column so both frames
    # carry the same column before they are stacked.
    test_frame['Survived'] = None
    dataframe = pd.concat([train_frame, test_frame], ignore_index=True, sort=False)

    for column in ('Survived', 'Pclass', 'Sex'):
        dataframe[column] = dataframe[column].astype('category')

    return dataframe
dataframe = load_titanic()

#### Анализ таблицы

In [14]:
def maximum_passengers(dataframe : pd.DataFrame):
    """Print the passenger count per class and name the most populated class.

    Args:
        dataframe: Passenger table that contains a ``Pclass`` column.
    """
    passengers_per_class = dataframe['Pclass'].value_counts()

    print("Количество пассажиров по классам:")
    print(passengers_per_class)

    # Label and size of the biggest class are looked up separately so the
    # summary line can mention both.
    biggest_class, biggest_size = passengers_per_class.idxmax(), passengers_per_class.max()
    print(f"Больше всего пассажиров было в {biggest_class} классе: {biggest_size} человек")

maximum_passengers(load_titanic())

Количество пассажиров по классам:
Pclass
3    709
1    323
2    277
Name: count, dtype: int64
Больше всего пассажиров было в 3 классе: 709 человек


In [None]:
def group_by_sex(dataframe: pd.DataFrame):
    """Print the mean age per (class, sex) group and compare the extremes.

    The table of mean ages is printed first, followed by the youngest group,
    the oldest group, and the gap in years between their mean ages.

    Args:
        dataframe: Passenger table with ``Pclass``, ``Sex`` and ``Age`` columns.
    """
    mean_age = dataframe.groupby(['Pclass', 'Sex'], observed=True)['Age'].mean()
    print("\nСредний возраст по классу и полу:")
    print(mean_age)

    # Each index label of the grouped result is a (class, sex) pair.
    (young_class, young_sex), (old_class, old_sex) = mean_age.idxmin(), mean_age.idxmax()
    lowest_mean, highest_mean = mean_age.min(), mean_age.max()
    age_gap = highest_mean - lowest_mean

    print(f"\nСамый юный: {young_class} класс, {young_sex} - {lowest_mean:.1f} лет")
    print(f"Самый взрослый: {old_class} класс, {old_sex} - {highest_mean:.1f} лет")
    print(f"Разница: {age_gap:.1f} лет")

group_by_sex(load_titanic())


Средний возраст по классу и полу:
Pclass  Sex   
1       female    37.037594
        male      41.029272
2       female    27.499223
        male      30.815380
3       female    22.185329
        male      25.962264
Name: Age, dtype: float64

Самый юный: 3 класс, female - 22.2 лет
Самый взрослый: 1 класс, male - 41.0 лет

Разница: 18.8 лет


In [27]:
def group_by_name(dataframe: pd.DataFrame):
    """Report survivors whose surname starts with 'K' and their fare extremes.

    The surname is the part of ``Name`` before the first comma and is matched
    against the Latin letter 'K'. The number of matching survivors is always
    printed; the most and least expensive tickets are printed only when at
    least one of those survivors has a known fare.

    Args:
        dataframe: Passenger table with ``Survived``, ``Name`` and ``Fare``
            columns.
    """
    surname = dataframe['Name'].str.split(',').str[0]
    # na=False: a missing name counts as "no match" instead of leaking NaN
    # into the boolean mask.
    is_k_survivor = (dataframe['Survived'] == 1) & surname.str.startswith('K', na=False)

    survived_k_sorted = dataframe[is_k_survivor].sort_values('Fare', ascending=False)

    print(f"\nВыжившие с фамилией на К: {len(survived_k_sorted)} человек")

    # sort_values places missing fares last, so without this filter a
    # passenger with an unknown fare would be reported as the cheapest ticket.
    with_known_fare = survived_k_sorted.dropna(subset=['Fare'])
    if not with_known_fare.empty:
        max_fare = with_known_fare.iloc[0]
        min_fare = with_known_fare.iloc[-1]
        print(f"Самый дорогой билет: {max_fare['Name']} - ${max_fare['Fare']:.2f}")
        print(f"Самый дешевый билет: {min_fare['Name']} - ${min_fare['Fare']:.2f}")

group_by_name(load_titanic())


Выжившие с фамилией на К: 9 человек
Самый дорогой билет: Kimball, Mr. Edwin Nelson Jr - $52.55
Самый дешевый билет: Kelly, Miss. Mary - $7.75


#### Визуализация

In [37]:
# Select the renderer Plotly uses by default for every later `.show()` call.
# NOTE(review): 'vscode' presumably targets the VS Code notebook UI; figures
# may not display in other front ends - confirm before sharing the notebook.
import plotly.io as pio
pio.renderers.default = 'vscode'

In [None]:
def scatter_plot(dataframe: pd.DataFrame):
    """Show a scatter plot of fare against age, coloured by passenger class.

    The passenger name is added to the hover tooltip of each point.

    Args:
        dataframe: Passenger table with ``Age``, ``Fare``, ``Pclass`` and
            ``Name`` columns.
    """
    scatter_options = {
        'x': 'Age',
        'y': 'Fare',
        'color': 'Pclass',
        'hover_data': ['Name'],
        'title': "Fares by passenger age",
    }
    px.scatter(dataframe, **scatter_options).show()

scatter_plot(load_titanic())

In [None]:
def linear_plot(dataframe: pd.DataFrame):
    """Show a line chart of the mean survival rate for each name length.

    The helper values (name length and numeric survival flag) are kept in
    local Series, so the frame passed in is left untouched.

    Args:
        dataframe: Passenger table with ``Name`` and ``Survived`` columns.
            Rows with a missing ``Survived`` value are ignored by the mean.
    """
    name_length = dataframe['Name'].str.len().rename('Name_Length')
    # Survived may be stored as a category; cast it to float so it can be averaged.
    survived_numeric = dataframe['Survived'].astype(float).rename('Survived_Numeric')

    survival_by_name_length = survived_numeric.groupby(name_length).mean().reset_index()

    graph = px.line(survival_by_name_length, x='Name_Length', y='Survived_Numeric',
                title='Dependence of survival on the length of the name',
                labels={'Name_Length': 'Name length', 'Survived_Numeric': 'Survival chance'})

    graph.show()

linear_plot(dataframe)

In [52]:
def histogram(dataframe: pd.DataFrame):
    """Show a histogram of the character length of the ``Ticket`` values.

    The lengths are computed into a separate one-column frame, so the frame
    passed in is left untouched.

    Args:
        dataframe: Passenger table with a ``Ticket`` column.
    """
    ticket_lengths = dataframe['Ticket'].str.len().rename('Ticket length').to_frame()
    graph = px.histogram(ticket_lengths, x='Ticket length', nbins=30,
                title='Distribution of ticket length')
    graph.show()

histogram(dataframe)

In [63]:
def bar_chart(dataframe: pd.DataFrame):
    """Show a bar chart of the mean fare for each passenger-name length.

    The name lengths are kept in a local Series used as the grouping key, so
    the frame passed in is left untouched.

    Args:
        dataframe: Passenger table with ``Name`` and ``Fare`` columns.
    """
    name_length = dataframe['Name'].str.len().rename('Name_length')
    sorted_by_name_length = dataframe['Fare'].groupby(name_length).mean().reset_index()
    graph = px.bar(sorted_by_name_length, x='Name_length', y='Fare',
                title='Average fare by name length',
                labels={'Name_length': 'Name length', 'Fare': 'Average fare'})
    graph.show()

bar_chart(dataframe)

In [None]:
def horizontal_bar_chart(dataframe: pd.DataFrame):
    """Show a horizontal bar chart of the mean fare for each ticket length.

    Rows without a fare are excluded before averaging. The ticket lengths are
    kept in a local Series used as the grouping key, so the frame passed in
    is left untouched.

    Args:
        dataframe: Passenger table with ``Ticket`` and ``Fare`` columns.
    """
    clean_data = dataframe.dropna(subset=['Fare'])
    ticket_length = clean_data['Ticket'].str.len().rename('Ticket_name_length')

    sorted_by_name_length = clean_data['Fare'].groupby(ticket_length).mean().reset_index()

    graph = px.bar(sorted_by_name_length, x='Fare', y='Ticket_name_length', orientation='h',
                title='Average ticket price by ticket name length',
                labels={'Ticket_name_length': 'Ticket name length', 'Fare': 'Average fare'})
    graph.show()

horizontal_bar_chart(dataframe)

In [None]:
def pie_chart(dataframe: pd.DataFrame):
    """Show a pie chart of the total fare collected in each passenger class.

    Args:
        dataframe: Passenger table with ``Pclass`` and ``Fare`` columns.
    """
    total_fare = dataframe.groupby('Pclass', observed=True)['Fare'].sum()
    # Turn the class index back into a regular column so it can name the slices.
    fare_table = total_fare.reset_index()

    pie_options = {
        'names': 'Pclass',
        'values': 'Fare',
        'title': 'Distribution of total ticket costs by class',
    }
    px.pie(fare_table, **pie_options).show()

pie_chart(dataframe)

In [None]:
def box_chart(dataframe: pd.DataFrame):
    """Show box plots of passenger age per class, split by sex.

    Args:
        dataframe: Passenger table with ``Pclass``, ``Age`` and ``Sex`` columns.
    """
    box_options = {
        'x': 'Pclass',
        'y': 'Age',
        'color': 'Sex',
        'title': 'Distribution of age by class and gender',
    }
    px.box(dataframe, **box_options).show()

box_chart(dataframe)