#### Decision Trees

#### Random Forests

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Read the Titanic passenger table from a CSV path relative to the working directory.
df = pd.read_csv('Titanic-Dataset.csv')
# Keep the preview as the cell's last expression so the first rows are rendered.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Example passenger to score later: Pclass 3, Sex male, Age 45, SibSp 2, Parch 0, Fare 10.00

In [4]:
# Basic preprocessing: keep the modelling columns, drop incomplete rows,
# encode the categorical column, then separate features from the target.
df = (
    df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]  # target + features
    .dropna()  # drops every row with a missing value in any selected column
    .copy()    # own the data so the column assignment below cannot raise SettingWithCopyWarning
)

# Check for null values: after the drop every per-column count should be 0.
null_counts = df.isnull().sum()
print(null_counts)

# Encode categorical variables. LabelEncoder assigns integer codes in sorted
# class order, so for 'female'/'male' labels: female=0, male=1.
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

# Split data into features and target.
X = df.drop(columns='Survived')
y = df['Survived']


Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64


In [3]:
# Train-test split: hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest model: 100 trees, each capped at depth 5, seeded for repeatability.
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, y_train)

# Predictions on the held-out rows.
y_pred = rf.predict(X_test)

# Evaluation. The classification report is a multi-line table, so it is printed
# under its own heading; printing it after the label on the same line pushes
# the header row out of alignment with the columns below it.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7902097902097902
Classification Report:               precision    recall  f1-score   support

           0       0.81      0.85      0.83        87
           1       0.75      0.70      0.72        56

    accuracy                           0.79       143
   macro avg       0.78      0.77      0.78       143
weighted avg       0.79      0.79      0.79       143



In [6]:
# Score one hypothetical passenger. A one-row DataFrame with named columns
# (instead of a bare nested list) ties each value to its feature and avoids
# scikit-learn's "X does not have valid feature names" warning, which is raised
# when a model fitted on a DataFrame is asked to predict on unnamed input.
# Parch is 0 to match the scenario written down earlier in the notebook; the
# previous positional list passed 1 in that slot.
passenger = pd.DataFrame([{
    'Pclass': 3,
    'Sex': 1,      # male, per the LabelEncoder mapping (female=0, male=1)
    'Age': 45,
    'SibSp': 2,
    'Parch': 0,
    'Fare': 10.00,
}])[list(X.columns)]  # enforce the same column order the model was trained on

predicted_category = rf.predict(passenger)
print("Predicted Category:", predicted_category[0])

Predicted Category: 0




In [7]:
import pickle

# Persist the fitted random forest to disk. The context manager closes the
# file handle even if serialization raises.
with open('titanic_dataset.pkl', 'wb') as fh:
    pickle.dump(rf, fh)

