In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# --- Load the raw datasets ---------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs on a machine where they exist. Consider a configurable data directory.
TRAIN_CSV = r"C:\Users\arjun\Downloads\internship tasks\train.csv"
TEST_CSV = r"C:\Users\arjun\Downloads\internship tasks\test.csv"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

# --- Combine train and test so both get identical preprocessing --------------
# 'source' remembers which file each row came from; it drives the split at the
# end of this cell.
train['source'] = 'train'
test['source'] = 'test'
test['Survived'] = np.nan  # placeholder so both frames share the same columns

full_data = pd.concat([train, test], ignore_index=True)

# Drop columns that are not useful or have too many missing values.
full_data = full_data.drop(columns=['Cabin', 'Ticket'])

# --- Fill missing values ------------------------------------------------------
# Assign the filled column back instead of calling
# `full_data[col].fillna(..., inplace=True)`: that chained in-place form acts on
# a temporary object, emits a FutureWarning on recent pandas, and leaves the
# frame unchanged when Copy-on-Write is enabled.
# NOTE(review): the median/mode are computed on train + test combined, so test
# rows influence the values imputed into the training set.
full_data['Age'] = full_data['Age'].fillna(full_data['Age'].median())
full_data['Embarked'] = full_data['Embarked'].fillna(full_data['Embarked'].mode()[0])
full_data['Fare'] = full_data['Fare'].fillna(full_data['Fare'].median())

# --- Encode categorical variables ---------------------------------------------
# Each fitted encoder is kept so the integer codes can be mapped back to labels.
label_encoders = {}
for column in ['Sex', 'Embarked']:
    le = LabelEncoder()
    full_data[column] = le.fit_transform(full_data[column])
    label_encoders[column] = le

# --- Derive a Title feature from Name -----------------------------------------
# Capture the run of letters that follows a space and precedes a period.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot rather than being treated as an (invalid) Python string escape.
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

full_data['Title'] = (
    full_data['Name']
    .str.extract(r' ([A-Za-z]+)\.', expand=False)
    .replace(RARE_TITLES, 'Rare')      # pool the listed titles into one bucket
    .replace(TITLE_SYNONYMS)           # fold spelling variants together
)
# Keep the fitted encoder (previously discarded) so Title codes stay decodable.
title_encoder = LabelEncoder()
full_data['Title'] = title_encoder.fit_transform(full_data['Title'])

# --- Separate back into train and test ----------------------------------------
# Record row origin from the explicit 'source' flag *before* dropping it, rather
# than inferring origin from whether 'Survived' is null: a training row with a
# missing label would otherwise be routed into the test set.
is_train = full_data['source'] == 'train'

# Drop columns not needed for model input.
full_data = full_data.drop(columns=['Name', 'PassengerId', 'source'])

cleaned_train = full_data[is_train].copy()
cleaned_test = full_data[~is_train].drop(columns='Survived').copy()


In [7]:
from IPython.display import display

# Preview the first ten rows of each cleaned frame under its own heading.
for heading, frame in (
    ("Cleaned Training Data:", cleaned_train),
    ("Cleaned Test Data:", cleaned_test),
):
    print(heading)
    display(frame.head(10))


Cleaned Training Data:


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0.0,3,1,22.0,1,0,7.25,2,2
1,1.0,1,0,38.0,1,0,71.2833,0,3
2,1.0,3,0,26.0,0,0,7.925,2,1
3,1.0,1,0,35.0,1,0,53.1,2,3
4,0.0,3,1,35.0,0,0,8.05,2,2
5,0.0,3,1,28.0,0,0,8.4583,1,2
6,0.0,1,1,54.0,0,0,51.8625,2,2
7,0.0,3,1,2.0,3,1,21.075,2,0
8,1.0,3,0,27.0,0,2,11.1333,2,3
9,1.0,2,0,14.0,1,0,30.0708,0,3


Cleaned Test Data:


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
891,3,1,34.5,0,0,7.8292,1,2
892,3,0,47.0,1,0,7.0,2,3
893,2,1,62.0,0,0,9.6875,1,2
894,3,1,27.0,0,0,8.6625,2,2
895,3,0,22.0,1,1,12.2875,2,3
896,3,1,14.0,0,0,9.225,2,2
897,3,0,30.0,0,0,7.6292,1,1
898,2,1,26.0,1,1,29.0,2,2
899,3,0,18.0,0,0,7.2292,0,3
900,3,1,21.0,2,0,24.15,2,2


In [8]:
# Write both cleaned frames to CSV in the current working directory.
# index=False keeps the DataFrame index out of the files.
for csv_name, frame in (
    ("cleaned_train.csv", cleaned_train),
    ("cleaned_test.csv", cleaned_test),
):
    frame.to_csv(csv_name, index=False)

print("Cleaned datasets saved successfully as 'cleaned_train.csv' and 'cleaned_test.csv'")


Cleaned datasets saved successfully as 'cleaned_train.csv' and 'cleaned_test.csv'


In [9]:
# Central-tendency summaries over the numeric columns of the cleaned training set.
mean_values = cleaned_train.mean(numeric_only=True)
median_values = cleaned_train.median(numeric_only=True)
# mode() returns a DataFrame (a column can have several modes); keep the first row.
mode_values = cleaned_train.mode(numeric_only=True).iloc[0]

# Print every summary beneath its banner; banners after the first start with a
# newline so the sections are separated by a blank line.
for banner, summary in (
    ("=== Mean Values ===", mean_values),
    ("\n=== Median Values ===", median_values),
    ("\n=== Mode Values ===", mode_values),
):
    print(banner)
    print(summary)


=== Mean Values ===
Survived     0.383838
Pclass       2.308642
Sex          0.647587
Age         29.361582
SibSp        0.523008
Parch        0.381594
Fare        32.204208
Embarked     1.536476
Title        1.895623
dtype: float64

=== Median Values ===
Survived     0.0000
Pclass       3.0000
Sex          1.0000
Age         28.0000
SibSp        0.0000
Parch        0.0000
Fare        14.4542
Embarked     2.0000
Title        2.0000
dtype: float64

=== Mode Values ===
Survived     0.00
Pclass       3.00
Sex          1.00
Age         28.00
SibSp        0.00
Parch        0.00
Fare         8.05
Embarked     2.00
Title        2.00
Name: 0, dtype: float64


In [None]:
# Spread (standard deviation) of each numeric column in the cleaned training set.
std_values = cleaned_train.std(numeric_only=True)

# Emit the banner and the per-column values, one per line.
print("=== Standard Deviation of Numerical Columns ===", std_values, sep="\n")


=== Standard Deviation of Numerical Columns ===
Survived     0.486592
Pclass       0.836071
Sex          0.477990
Age         13.019697
SibSp        1.102743
Parch        0.806057
Fare        49.693429
Embarked     0.791503
Title        0.788465
dtype: float64
