# Importing Libraries

In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
import pickle

    # Importing the Data

In [72]:
columns_to_be_added_as_features = ['Sex','Age','SibSp','Parch','Pclass','Fare','Embarked']

train_df = pd.read_csv(r'C:\Users\User\Desktop\Machine Learning\Projects\Titanic Dataset\train.csv', usecols=columns_to_be_added_as_features + ['Survived'])
test_df_matcher = pd.read_csv(r'C:\Users\User\Desktop\Machine Learning\Projects\Titanic Dataset\test.csv', usecols=columns_to_be_added_as_features + ['PassengerId'])
test_df = test_df_matcher[columns_to_be_added_as_features]

In [73]:
train_df.head(1000)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [74]:
test_df.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass,Fare,Embarked
0,male,34.5,0,0,3,7.8292,Q
1,female,47.0,1,0,3,7.0,S
2,male,62.0,0,0,2,9.6875,Q
3,male,27.0,0,0,3,8.6625,S
4,female,22.0,1,1,3,12.2875,S


In [75]:
print("Number of rows in training set: {}".format(len(train_df)))
print("Number of rows in test set: {}".format(len(test_df)))

Number of rows in training set: 891
Number of rows in test set: 418


# Preprocessing

In [76]:
for column_title in columns_to_be_added_as_features:
    if column_title in ['Embarked', "Sex"]:
        continue
    train_df[column_title] = pd.to_numeric(train_df[column_title], downcast="float")
    test_df[column_title] = pd.to_numeric(test_df[column_title], downcast="float")

train_df["Survived"] = pd.to_numeric(train_df["Survived"], downcast="float")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[column_title] = pd.to_numeric(test_df[column_title], downcast="float")


In [77]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.0,3.0,male,22.0,1.0,0.0,7.25,S
1,1.0,1.0,female,38.0,1.0,0.0,71.283302,C
2,1.0,3.0,female,26.0,0.0,0.0,7.925,S
3,1.0,1.0,female,35.0,1.0,0.0,53.099998,S
4,0.0,3.0,male,35.0,0.0,0.0,8.05,S


In [78]:
train_df['Embarked'].replace('Q', 0,inplace=True)
train_df['Embarked'].replace('S', 1,inplace=True)
train_df['Embarked'].replace('C', 2,inplace=True)

test_df['Embarked'].replace('Q', 0,inplace=True)
test_df['Embarked'].replace('S', 1,inplace=True)
test_df['Embarked'].replace('C', 2,inplace=True)

train_df['Sex'].replace('male', 0,inplace=True)
train_df['Sex'].replace('female', 1,inplace=True)

test_df['Sex'].replace('male', 0,inplace=True)
test_df['Sex'].replace('female', 1,inplace=True)

train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.0,3.0,0,22.0,1.0,0.0,7.25,1.0
1,1.0,1.0,1,38.0,1.0,0.0,71.283302,2.0
2,1.0,3.0,1,26.0,0.0,0.0,7.925,1.0
3,1.0,1.0,1,35.0,1.0,0.0,53.099998,1.0
4,0.0,3.0,0,35.0,0.0,0.0,8.05,1.0


# Normalization

In [79]:
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

train_df = normalize(train_df)
test_df = normalize(test_df)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.0,1.0,0.0,0.271174,0.125,0.0,0.014151,0.5
1,1.0,0.0,1.0,0.472229,0.125,0.0,0.139136,1.0
2,1.0,1.0,1.0,0.321438,0.0,0.0,0.015469,0.5
3,1.0,0.0,1.0,0.434531,0.125,0.0,0.103644,0.5
4,0.0,1.0,0.0,0.434531,0.0,0.0,0.015713,0.5


# Train Test Split

In [80]:
# Shuffle the dataset to avoid groupings in only one set

train_df = train_df.sample(frac=1).reset_index(drop=True)

In [81]:
# Fill the missing values with zeros

train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [82]:
# We have training set split into 80/20, 80-for training and 20-for validation
validation_set_ratio = 0.2
validation_set_size = int(len(train_df)*validation_set_ratio)
training_set_size = len(train_df) - validation_set_size

print("Total set size: {}".format(len(train_df)))
print("Training set size: {}".format(training_set_size))
print("Validation set size: {}".format(validation_set_size))

Total set size: 891
Training set size: 713
Validation set size: 178


In [83]:
train, val = train_test_split(train_df, test_size=validation_set_ratio)

train_X = train[columns_to_be_added_as_features]
train_Label = train[['Survived']]

val_X = val[columns_to_be_added_as_features]
val_Label = val[['Survived']]

# Training

In [84]:
# We will use Support Vector Machines for Modeling

SVM_KERNEL = "linear"
SVM_C = 10
SVM_GAMMA = 0.000001

svm_model = SVC(kernel = SVM_KERNEL, C = SVM_C, gamma = SVM_GAMMA)
svm_model.fit(train_X, train_Label)

  return f(*args, **kwargs)


SVC(C=10, gamma=1e-06, kernel='linear')

# Validation

In [85]:
# Checking the accuracy with validation score
y_pred = svm_model.predict(val_X)
print("Accuracy:",metrics.accuracy_score(val_Label, y_pred))

Accuracy: 0.7877094972067039


# Predicting the Test Set

In [86]:
test_pred = svm_model.predict(test_df)

# Generate the Output

In [87]:
result = pd.DataFrame(test_df_matcher['PassengerId'])
print(result.head(10))

   PassengerId
0          892
1          893
2          894
3          895
4          896
5          897
6          898
7          899
8          900
9          901


In [88]:
result.insert(1, "Survived", test_pred, True)
result["Survived"] = pd.to_numeric(result["Survived"], downcast="integer")
print(result.head(10))

   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
5          897         0
6          898         1
7          899         0
8          900         1
9          901         0


# Generating the Output File

In [89]:
result.to_csv(r'C:\Users\User\Desktop\Machine Learning\Projects\Titanic Dataset\res.csv', index=False)

# Export Model as a File

In [90]:
filename = 'svm_model.sav'
pickle.dump(svm_model, open(filename, 'wb'))