In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib as plt

In [None]:
# Read the Titanic passenger spreadsheet into a DataFrame, then echo it as the cell output.
D = pd.read_excel(io="/content/titanic3.xls")

D

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [None]:
# Tally the missing entries in each column of the freshly loaded frame.
D.isna().sum()

Unnamed: 0,0
pclass,0
survived,0
name,0
sex,0
age,263
sibsp,0
parch,0
ticket,0
fare,1
cabin,1014


In [None]:
# Missing entries per column, expressed as a percentage of the total row count.
D.isna().sum().div(len(D)).mul(100)

Unnamed: 0,0
pclass,0.0
survived,0.0
name,0.0
sex,0.0
age,20.091673
sibsp,0.0
parch,0.0
ticket,0.0
fare,0.076394
cabin,77.463713


In [None]:
# Remove the columns that the remaining cells of this notebook do not use.
# A single drop() with a list replaces nine separate in-place calls, and the
# result is reassigned rather than mutated in place.
# errors="ignore" keeps the cell re-runnable: on a second execution the columns
# are already gone, and without it drop() would raise KeyError.
D = D.drop(
    columns=[
        'home.dest',
        'embarked',
        'cabin',
        'body',
        'fare',
        'boat',
        'ticket',
        'sibsp',
        'name',
    ],
    errors="ignore",
)

D

Unnamed: 0,pclass,survived,sex,age,parch
0,1,1,female,29.0000,0
1,1,1,male,0.9167,2
2,1,0,female,2.0000,2
3,1,0,male,30.0000,2
4,1,0,female,25.0000,2
...,...,...,...,...,...
1304,3,0,female,14.5000,0
1305,3,0,female,,0
1306,3,0,male,26.5000,0
1307,3,0,male,27.0000,0


In [None]:
# Re-count the missing entries per column for the columns that are still in D.
D.isna().sum()

Unnamed: 0,0
pclass,0
survived,0
sex,0
age,263
parch,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of D.
D.describe()

Unnamed: 0,pclass,survived,age,parch
count,1309.0,1309.0,1046.0,1309.0
mean,2.294882,0.381971,29.881135,0.385027
std,0.837836,0.486055,14.4135,0.86556
min,1.0,0.0,0.1667,0.0
25%,2.0,0.0,21.0,0.0
50%,3.0,0.0,28.0,0.0
75%,3.0,1.0,39.0,0.0
max,3.0,1.0,80.0,9.0


In [None]:
# Fill missing ages with the median age.
# The result is assigned back to the column explicitly: calling
# fillna(..., inplace=True) on D['age'] operates on an intermediate object,
# triggers a pandas FutureWarning, and will not update D at all in pandas 3.0.
D['age'] = D['age'].fillna(D['age'].median())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  D['age'].fillna(D['age'].median(), inplace=True)


In [None]:
# Confirm how many missing entries remain in each column after filling 'age'.
D.isna().sum()

Unnamed: 0,0
pclass,0
survived,0
sex,0
age,0
parch,0


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical 'sex' and 'pclass' columns and attach the
# resulting indicator columns to the rest of D.
encoder = OneHotEncoder()

# Each indicator frame is built on D's own index. pd.concat(axis=1) aligns rows
# by index label, so a frame carrying a fresh 0..n-1 index would be silently
# misaligned (rows padded with NaN) whenever D's index is not the default range.
encoded_sex = encoder.fit_transform(D[['sex']]).toarray()
encoded_sex_df = pd.DataFrame(
    encoded_sex,
    columns=encoder.get_feature_names_out(['sex']),
    index=D.index,
)

# NOTE: the same encoder object is refit here, so after this cell `encoder`
# only holds the categories learned from 'pclass'.
encoded_pclass = encoder.fit_transform(D[['pclass']]).toarray()
encoded_pclass_df = pd.DataFrame(
    encoded_pclass,
    columns=encoder.get_feature_names_out(['pclass']),
    index=D.index,
)

# Append the indicator columns, then remove the original categorical columns.
D_encoded = pd.concat([D, encoded_sex_df, encoded_pclass_df], axis=1).drop(columns=['sex', 'pclass'])

D_encoded

Unnamed: 0,survived,age,parch,sex_female,sex_male,pclass_1,pclass_2,pclass_3
0,1,29.0000,0,1.0,0.0,1.0,0.0,0.0
1,1,0.9167,2,0.0,1.0,1.0,0.0,0.0
2,0,2.0000,2,1.0,0.0,1.0,0.0,0.0
3,0,30.0000,2,0.0,1.0,1.0,0.0,0.0
4,0,25.0000,2,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1304,0,14.5000,0,1.0,0.0,0.0,0.0,1.0
1305,0,28.0000,0,1.0,0.0,0.0,0.0,1.0
1306,0,26.5000,0,0.0,1.0,0.0,0.0,1.0
1307,0,27.0000,0,0.0,1.0,0.0,0.0,1.0


In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'survived' column; the features are every other column of the encoded frame.
y = D_encoded['survived']
X = D_encoded.drop(columns='survived')

# Hold out 20% of the rows as a test set, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Build a 100-tree random forest with a fixed seed and fit it on the training split
# (fit() returns the estimator itself, so construction and training can be chained).
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the true test labels and report the result.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")


Random Forest Accuracy: 0.8130
