In [None]:
# For the v3 will try to do a better explanatory data analysis
# Now I will try to use everything given in the dataset by encoding non numerical data

In [2]:
# Importing the libraries for the data analysis
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [3]:
train_data = pd.read_csv('train.csv')

In [4]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
train_data[train_data['Embarked'].isnull()]



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [6]:
train_data['Embarked'].fillna('S', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna('S', inplace=True)


In [7]:
label_encoder = LabelEncoder()
train_data['Sex'] = label_encoder.fit_transform(train_data['Sex'])

In [8]:
num_train_data = train_data[['Age', 'Pclass', 'SibSp', 'Fare', 'Survived', 'Parch', 'PassengerId', 'Sex']]
# Calculate the correlation matrix
df_all_corr = num_train_data.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()

# Rename columns for better readability
df_all_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)

# Filter the results for correlations involving the 'Age' feature
age_corr = df_all_corr[df_all_corr['Feature 1'] == 'Age']

# Display the filtered correlations
print(age_corr)

   Feature 1    Feature 2  Correlation Coefficient
0        Age          Age                 1.000000
15       Age       Pclass                 0.369226
18       Age        SibSp                 0.308247
27       Age        Parch                 0.189119
37       Age         Fare                 0.096067
39       Age          Sex                 0.093254
44       Age     Survived                 0.077221
51       Age  PassengerId                 0.036847


In [9]:
age_by_pclass_sex = num_train_data.groupby(['Sex', 'Pclass']).median()['Age']

# Display the median ages for each group
for pclass in range(1, 4):
    for sex in [0, 1]:
        print('Median age of Pclass {} {}s: {}'.format(pclass, 'male' if sex == 1 else 'female', age_by_pclass_sex[sex][pclass]))

# Calculate and print the overall median age
print('Median age of all passengers: {}'.format(num_train_data['Age'].median()))

Median age of Pclass 1 females: 35.0
Median age of Pclass 1 males: 40.0
Median age of Pclass 2 females: 28.0
Median age of Pclass 2 males: 30.0
Median age of Pclass 3 females: 21.5
Median age of Pclass 3 males: 25.0
Median age of all passengers: 28.0


In [10]:
train_data['Age'] = train_data.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))

In [11]:
train_data['Deck'] = train_data['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')

df_all_decks = train_data.groupby(['Deck', 'Pclass']).count().drop(columns=['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 
                                                                        'Fare', 'Embarked', 'Cabin', 'PassengerId', 'Ticket']).rename(columns={'Name': 'Count'}).transpose()

In [13]:
def get_pclass_dist(df):
    # Initialize dictionaries for counts and percentages
    decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'M', 'T']
    deck_counts = {deck: {pclass: 0 for pclass in range(1, 4)} for deck in decks}
    deck_percentages = {}

    # Populate deck_counts with passenger class counts
    for deck in decks:
        for pclass in range(1, 4):
            try:
                count = df[deck][pclass][0]
                deck_counts[deck][pclass] = count
            except KeyError:
                deck_counts[deck][pclass] = 0

    # Convert counts to DataFrame for easier manipulation
    df_decks = pd.DataFrame(deck_counts)

    # Calculate percentages
    for deck in df_decks.columns:
        total = df_decks[deck].sum()
        deck_percentages[deck] = [(count / total) * 100 if total > 0 else 0 for count in df_decks[deck]]

    return deck_counts, deck_percentages

In [14]:
all_deck_count, all_deck_per = get_pclass_dist(df_all_decks)

print("Deck Counts:")
print(all_deck_count)

print("\nDeck Percentages:")
print(all_deck_per)

Deck Counts:
{'A': {1: 15, 2: 0, 3: 0}, 'B': {1: 47, 2: 0, 3: 0}, 'C': {1: 59, 2: 0, 3: 0}, 'D': {1: 29, 2: 4, 3: 0}, 'E': {1: 25, 2: 4, 3: 3}, 'F': {1: 0, 2: 8, 3: 5}, 'G': {1: 0, 2: 0, 3: 4}, 'M': {1: 40, 2: 168, 3: 479}, 'T': {1: 1, 2: 0, 3: 0}}

Deck Percentages:
{'A': [100.0, 0.0, 0.0], 'B': [100.0, 0.0, 0.0], 'C': [100.0, 0.0, 0.0], 'D': [87.87878787878788, 12.121212121212121, 0.0], 'E': [78.125, 12.5, 9.375], 'F': [0.0, 61.53846153846154, 38.46153846153847], 'G': [0.0, 0.0, 100.0], 'M': [5.822416302765648, 24.45414847161572, 69.72343522561863], 'T': [100.0, 0.0, 0.0]}


  count = df[deck][pclass][0]


In [15]:
idx = train_data[train_data['Deck'] == 'T'].index
train_data.loc[idx, 'Deck'] = 'A'

In [16]:
train_data ['Deck'] = train_data['Deck'].replace(['A', 'B', 'C'], 'ABC')
train_data['Deck'] = train_data['Deck'].replace(['D', 'E'], 'DE')
train_data['Deck'] = train_data['Deck'].replace(['F', 'G'], 'FG')

train_data['Deck'].value_counts()

Deck
M      687
ABC    122
DE      65
FG      17
Name: count, dtype: int64

In [17]:
test_data = pd.read_csv('test.csv')

In [18]:
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [19]:
label_encoder = LabelEncoder()
test_data['Sex'] = label_encoder.fit_transform(test_data['Sex'])

In [20]:
num_test_data = test_data[['Age', 'Pclass', 'SibSp', 'Fare','Parch', 'PassengerId', 'Sex']]
age_by_pclass_sex = num_test_data.groupby(['Sex', 'Pclass']).median()['Age']
test_data['Age'] = test_data.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))

In [21]:
test_data['Deck'] = test_data['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')

df_all_test_decks = test_data.groupby(['Deck', 'Pclass']).count().drop(columns=['Sex', 'Age', 'SibSp', 'Parch', 
                                                                        'Fare', 'Embarked', 'Cabin', 'PassengerId', 'Ticket']).rename(columns={'Name': 'Count'}).transpose()

In [22]:
all_test_deck_count, all_test_deck_per = get_pclass_dist(df_all_test_decks)

print("Deck Counts:")
print(all_test_deck_count)


Deck Counts:
{'A': {1: 7, 2: 0, 3: 0}, 'B': {1: 18, 2: 0, 3: 0}, 'C': {1: 35, 2: 0, 3: 0}, 'D': {1: 11, 2: 2, 3: 0}, 'E': {1: 9, 2: 0, 3: 0}, 'F': {1: 0, 2: 5, 3: 3}, 'G': {1: 0, 2: 0, 3: 1}, 'M': {1: 27, 2: 86, 3: 214}, 'T': {1: 0, 2: 0, 3: 0}}


  count = df[deck][pclass][0]


In [23]:
test_data ['Deck'] = test_data['Deck'].replace(['A', 'B', 'C'], 'ABC')
test_data['Deck'] = test_data['Deck'].replace(['D', 'E'], 'DE')
test_data['Deck'] = test_data['Deck'].replace(['F', 'G'], 'FG')

test_data['Deck'].value_counts()

Deck
M      327
ABC     60
DE      22
FG       9
Name: count, dtype: int64

In [24]:
med_fare = test_data.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
# Filling the missing value in Fare with the median Fare of 3rd class alone passenger
test_data['Fare'] = test_data['Fare'].fillna(med_fare)

In [25]:
df_train_corr = train_data.drop(['PassengerId'], axis=1).corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
df_train_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)
df_train_corr.drop(df_train_corr.iloc[1::2].index, inplace=True)
df_train_corr_nd = df_train_corr.drop(df_train_corr[df_train_corr['Correlation Coefficient'] == 1.0].index)

df_test_corr = test_data.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
df_test_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)
df_test_corr.drop(df_test_corr.iloc[1::2].index, inplace=True)
df_test_corr_nd = df_test_corr.drop(df_test_corr[df_test_corr['Correlation Coefficient'] == 1.0].index)

ValueError: could not convert string to float: 'Braund, Mr. Owen Harris'