In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline


## 1. Load the dataset into the Python environment

In [18]:
# Location of the Titanic CSV. Despite the variable name this is a local
# filesystem path, and the space before ".csv" is part of the path as written.
# NOTE(review): confirm the file on disk really has that space in its name.
url = "/content/titanic_dataset .csv"

# Parse the CSV into the DataFrame that the pre-processing cells below build on.
titanic_df = pd.read_csv(filepath_or_buffer=url)


## 2. Do all the necessary pre-processing steps

### Set 'PassengerId' as the index

In [19]:
# Promote PassengerId to the row index; it is removed from the regular columns,
# so it is not part of the feature matrix built later.
titanic_df = titanic_df.set_index('PassengerId')

### Drop unnecessary columns

In [4]:
# Remove the Name, Ticket and Cabin columns so they do not reach the models.
unused_columns = ['Name', 'Ticket', 'Cabin']
titanic_df = titanic_df.drop(columns=unused_columns)

### Convert 'Sex' and 'Embarked' to numerical values

In [5]:
# Encode Sex as 0 (male) / 1 (female). Series.map yields NaN for any value
# that is not a key of the mapping.
sex_codes = {'male': 0, 'female': 1}
titanic_df = titanic_df.assign(Sex=titanic_df['Sex'].map(sex_codes))

# One-hot encode Embarked; drop_first=True omits the indicator column of the
# first category, which then acts as the implicit baseline.
# NOTE(review): rows with a missing Embarked value (if any) get all-zero
# indicators and become indistinguishable from that baseline — confirm this is
# acceptable for this dataset.
titanic_df = pd.get_dummies(titanic_df, columns=['Embarked'], drop_first=True)


### Handle missing values

In [6]:
imputer = SimpleImputer(strategy='mean')
titanic_df[['Age']] = imputer.fit_transform(titanic_df[['Age']])

### 3. Create kNN and SVM models

In [7]:
X = titanic_df.drop('Survived', axis=1)
y = titanic_df['Survived']

### Split the data into training and testing sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Create kNN and SVM models

In [9]:
knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier())
svm_model = make_pipeline(StandardScaler(), SVC())

### 4. Do k-fold and stratified k-fold cross-validation and find the average accuracy score of the models

### Define the number of folds

In [10]:
# Number of cross-validation splits; passed as n_splits to both KFold and
# StratifiedKFold in the cells below.
k_folds = 5

### Perform k-fold cross-validation for kNN model

In [11]:
# Plain k-fold splitter: rows are shuffled once with a fixed seed, so the fold
# assignment is reproducible between runs. Class balance per fold is not enforced.
kfold_splitter = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# One score per fold for the kNN pipeline, evaluated on the training split only.
knn_cv_scores = cross_val_score(knn_model, X_train, y_train, cv=kfold_splitter)

# Summarise the per-fold scores as a single average.
average_knn_cv_score = knn_cv_scores.mean()


### Perform stratified k-fold cross-validation for SVM model

In [12]:
# Stratified splitter: each fold keeps approximately the same class proportions
# as y_train. Shuffling uses a fixed seed so the folds are reproducible.
stratified_splitter = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# One score per fold for the SVM pipeline, evaluated on the training split only.
svm_cv_scores = cross_val_score(svm_model, X_train, y_train, cv=stratified_splitter)

# Summarise the per-fold scores as a single average.
average_svm_cv_score = svm_cv_scores.mean()

### Display the results

In [13]:
# Report the mean cross-validation score of each model, one per line.
# Note: the kNN average comes from KFold and the SVM average from
# StratifiedKFold, so the two numbers are not measured on identical folds.
report_lines = [
    f"Average kNN Cross-Validation Accuracy: {average_knn_cv_score}",
    f"Average SVM Cross-Validation Accuracy: {average_svm_cv_score}",
]
print("\n".join(report_lines))

Average kNN Cross-Validation Accuracy: 0.7936078006500541
Average SVM Cross-Validation Accuracy: 0.8300305328474342
