# CodSoft Internship Task: 1
## Titanic Survival Prediction

### Importing Necessary Libraries

In [2]:
# Importing the necessary libraries.
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

### Loading the datasets into Pandas DataFrames

In [3]:
# Read the two CSV files into DataFrames: one for training, one for testing.
training_set = pd.read_csv(filepath_or_buffer='train.csv')
testing_set = pd.read_csv(filepath_or_buffer='test.csv')

### Cleaning and Preprocessing the Training Set

In [4]:
# Cleans the training frame and derives the model inputs `X_train` / `y_train`.
# `training_set` ends up holding the processed frame; `le` and `standard_scaler`
# remain available afterwards (the fitted scaler is reused on the testing set).

# Handling missing values in the training set: median for Age, most frequent
# value for Embarked. The result is assigned back instead of calling
# `fillna(..., inplace=True)` on a column selection, because that chained
# in-place form is deprecated in pandas 2.x and stops modifying the frame
# once Copy-on-Write is active, which would leave the NaNs in place.
training_set['Age'] = training_set['Age'].fillna(training_set['Age'].median())
training_set['Embarked'] = training_set['Embarked'].fillna(training_set['Embarked'].mode()[0])
# Cabin is dropped entirely rather than imputed.
training_set = training_set.drop(columns=['Cabin'])

# Making suitable data type conversions.
training_set[['Embarked', 'Survived', 'Pclass', 'Sex']] = training_set[['Embarked', 'Survived', 'Pclass', 'Sex']].astype('category')

# Feature engineering: FamilySize = SibSp + Parch + 1; the two source columns
# are dropped once the combined feature exists.
training_set['FamilySize'] = training_set['SibSp'] + training_set['Parch'] + 1
training_set = training_set.drop(columns=['SibSp', 'Parch'])

# Log-transforming the numeric features to deal with high variances.
# Fare is shifted by 0.1 so that a fare of 0 still yields a finite value.
training_set['Age'] = np.log(training_set['Age'])
training_set['Fare'] = np.log(training_set['Fare'] + 0.1)
training_set['FamilySize'] = np.log(training_set['FamilySize'])

# Encoding the categorical columns numerically: Sex is label-encoded
# (LabelEncoder numbers its classes in sorted order), while Pclass and
# Embarked are one-hot encoded.
le = LabelEncoder()
training_set['Sex'] = le.fit_transform(training_set['Sex'])
training_set = pd.get_dummies(training_set, columns=['Pclass', 'Embarked'])

# Feature scaling. The scaler is fitted here and only applied (not refitted)
# to the testing set later.
standard_scaler = StandardScaler()
training_set[['Age', 'Fare', 'FamilySize']] = standard_scaler.fit_transform(training_set[['Age', 'Fare', 'FamilySize']])

# Feature selection: drop the identifier/text columns and the target itself.
X_train = training_set.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket'])
y_train = training_set['Survived']

# Fail early if any missing value survived preprocessing; the SVM cannot be
# fitted on NaN inputs.
assert not X_train.isna().any().any(), "X_train still contains missing values"

### Cleaning and Preprocessing the Testing Set

In [5]:
# Applies the same cleaning steps to the testing frame and derives `X_test`.
# Relies on `standard_scaler` and `X_train` produced by the training cell.

# Handling missing values in the testing set (medians of the testing set itself).
# The result is assigned back instead of calling `fillna(..., inplace=True)` on a
# column selection, because that chained in-place form is deprecated in pandas 2.x
# and stops modifying the frame once Copy-on-Write is active.
# NOTE(review): imputing with training-set medians would be more consistent, but
# the training cell overwrites its raw Age/Fare columns, so they are not
# available at this point.
testing_set['Age'] = testing_set['Age'].fillna(testing_set['Age'].median())
testing_set['Fare'] = testing_set['Fare'].fillna(testing_set['Fare'].median())
testing_set = testing_set.drop(columns=['Cabin'])

# Making suitable data type conversions.
testing_set[['Embarked', 'Pclass', 'Sex']] = testing_set[['Embarked', 'Pclass', 'Sex']].astype('category')

# Feature engineering: FamilySize = SibSp + Parch + 1; the two source columns
# are dropped once the combined feature exists.
testing_set['FamilySize'] = testing_set['SibSp'] + testing_set['Parch'] + 1
testing_set = testing_set.drop(columns=['SibSp', 'Parch'])

# Log-transforming the numeric features, exactly as done for the training set.
# Fare is shifted by 0.1 so that a fare of 0 still yields a finite value.
testing_set['Age'] = np.log(testing_set['Age'])
testing_set['Fare'] = np.log(testing_set['Fare'] + 0.1)
testing_set['FamilySize'] = np.log(testing_set['FamilySize'])

# Encoding the categorical columns numerically. Sex uses an explicit mapping and
# is cast to int so it has a plain integer dtype like the training column; the
# cast also raises if a label outside the mapping produced a missing value.
testing_set['Sex'] = testing_set['Sex'].map({'male': 1, 'female': 0}).astype(int)
testing_set = pd.get_dummies(testing_set, columns=['Pclass', 'Embarked'])

# Feature scaling with the scaler already fitted on the training set.
testing_set[['Age', 'Fare', 'FamilySize']] = standard_scaler.transform(testing_set[['Age', 'Fare', 'FamilySize']])

# Feature selection: drop the identifier/text columns, then align with the
# training features. get_dummies ran independently on this frame, so reindexing
# guarantees the same columns in the same order as X_train; a dummy column that
# is absent here is added and filled with 0.
X_test = (
    testing_set
    .drop(columns=['PassengerId', 'Name', 'Ticket'])
    .reindex(columns=X_train.columns, fill_value=0)
)

# Fail early if any missing value survived preprocessing.
assert not X_test.isna().any().any(), "X_test still contains missing values"

### Making Predictions using SVM Classifier

In [6]:
# SVM classifier with gamma=0.1 (described by the author as the tuned setting),
# created and fitted on the full training data in one step; fit() returns the
# estimator itself.
svm = SVC(gamma=0.1).fit(X_train, y_train)

# 5-fold cross-validated accuracy on the training data.
svm_cv_scores = cross_val_score(
    estimator=svm,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='accuracy',
)

print("Tuned SVM CV Accuracy:", svm_cv_scores.mean())

# Predict survival for every row of the testing features.
y_pred = svm.predict(X_test)

# Show the first ten predictions as a quick sanity check.
print(y_pred[:10])

Tuned SVM CV Accuracy: 0.8226727763480006
[0 1 0 0 1 0 1 0 1 0]
