In [None]:
import numpy as np 
import pandas as pd 

# data visualization & analysis
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# machine learning methods
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the two CSV files from the working directory.
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"

training = pd.read_csv(TRAIN_CSV)
testing = pd.read_csv(TEST_CSV)

In [None]:
# Quick structural overview: the training shape, then the column index of
# each frame, printed one after another.
for overview in (training.shape, training.columns, testing.columns):
    print(overview)

# Further optional checks while exploring:
#   training.isnull().sum(), training.info(), training.head(), training.describe()

In [None]:
# EXPLORATORY DATA ANALYSIS
# Only one expression is live at a time so the notebook shows its result as
# the cell output; un-comment any other line to inspect that column instead.

# training['Sex'].value_counts()
# Number of training rows per value of the 'Embarked' column.
training['Embarked'].value_counts()
# sns.countplot(x = 'Pclass', data = training)
# sns.countplot(x = 'Embarked', data = training)  # C = Cherbourg, Q = Queenstown, S = Southampton
# sns.countplot(x = 'SibSp', data = training)
# sns.countplot(x = 'Survived', data = training)
# sns.countplot(x = 'Sex', data = training)

In [None]:
# SEEING IF SURVIVAL RATE VARIES BY GENDER

# "Died" is the complement of "Survived" (1 - Survived), so the two columns
# can be summed or averaged side by side.
training['Died'] = 1 - training['Survived']

# Select the two outcome columns *before* aggregating, so the sum is not also
# computed over every other column of the frame (including the string ones)
# only to be thrown away by the column selection afterwards.
t1 = training.groupby('Sex')[['Survived', 'Died']].sum()
print(t1)

# Stacked bars: survivors and deaths per sex.
t1.plot(kind='bar', figsize=(10, 5), stacked=True)

In [None]:
# Same as above but shown as % of gender surviving.
# The outcome columns are selected before taking the mean: averaging the
# whole frame would also try to average its string columns, which raises a
# TypeError in pandas >= 2.0.
# Relies on the 'Died' column added to `training` in the previous cell.
survival_rate_by_sex = training.groupby('Sex')[['Survived', 'Died']].mean()
survival_rate_by_sex.plot(kind='bar', figsize=(10, 5), stacked=True)

In [None]:
# Does Survival rate vary by Pclass?
# The outcome columns are selected before taking the mean: averaging the
# whole frame would also try to average its string columns, which raises a
# TypeError in pandas >= 2.0.
# Relies on the 'Died' column added to `training` in an earlier cell.
survival_rate_by_pclass = training.groupby('Pclass')[['Survived', 'Died']].mean()
survival_rate_by_pclass.plot(kind='bar', figsize=(10, 5), stacked=True)

In [None]:
# Does Survival rate vary by Embarked?
# The outcome columns are selected before taking the mean: averaging the
# whole frame would also try to average its string columns, which raises a
# TypeError in pandas >= 2.0.
# Relies on the 'Died' column added to `training` in an earlier cell.
survival_rate_by_embarked = training.groupby('Embarked')[['Survived', 'Died']].mean()
survival_rate_by_embarked.plot(kind='bar', figsize=(10, 5), stacked=True)

In [None]:
#  VISUALIZING SURVIVAL BASED ON FARE

figure = plt.figure(figsize=(15, 7))

# Fares split by outcome; the two series are stacked in a single histogram.
fare_of_survivors = training.loc[training['Survived'] == 1, 'Fare']
fare_of_deceased = training.loc[training['Died'] == 1, 'Fare']

plt.hist(
    [fare_of_survivors, fare_of_deceased],
    bins=50,
    stacked=True,
    label=['Survived', 'Dead'],
)
plt.xlabel('Fare')
plt.ylabel('Number of Passengers')
plt.legend()

# Observation: very few passengers with high fares died

In [None]:
# CREATING A FEATURE: "TITLE"

# Let's see if I can extract the titles for each passenger ("Sir", "Mr.", "Miss", etc.)
# training["Name"] is in format: "[lastname], [title]. [first name]"
# ** this only works because all passengers have a title

# The title is isolated by splitting on "," and taking the second part,
# then splitting on "." and taking the first part. A set comprehension
# collects the distinct titles without leaving loop temporaries behind in
# the notebook namespace.
titles = {
    name.split(',')[1].split('.')[0].strip()
    for name in training['Name']
}

print(titles)


In [None]:
# Raw titles extracted from the passenger names are collapsed into six
# coarse groups: Officer, Royalty, Mr, Mrs, Miss and Master.
# Any raw title missing from this mapping becomes NaN when the mapping is
# applied with Series.map, so the mapping should cover every title that
# occurs in either CSV.
Title_Dictionary = {"Capt"  : "Officer",
                    "Col"   : "Officer",
                    "Major" : "Officer",
                    "Dr"    : "Officer",
                    "Rev"   : "Officer",
                    "Don"   : "Royalty",
                    # Feminine form of "Don". NOTE(review): presumably the
                    # title that was left unmapped in test.csv and later
                    # filled with 0 ("Miss") - confirm against the data.
                    "Dona"  : "Royalty",
                    "Sir"   : "Royalty",
                    "Jonkheer": "Royalty",
                    "the Countess":"Royalty",
                    "Mr"    : "Mr",
                    "Mme"   : "Mrs",
                    "Mrs"   : "Mrs",
                    "Ms"    : "Miss",
                    "Miss"  : "Miss",
                    "Mlle"  : "Miss",
                    "Master" : "Master",
                    "Lady"  : "Royalty"
                    }
    
# Add a simplified "Title" column to both data sets.

def _raw_title(full_name):
    """Return the title embedded in a "[lastname], [title]. [first name]" string."""
    after_surname = full_name.split(',')[1]
    return after_surname.split('.')[0].strip()


# Extract the raw title from each name, then collapse it through
# Title_Dictionary; titles missing from the dictionary come out as NaN.
for passengers in (training, testing):
    passengers["Title"] = passengers['Name'].apply(_raw_title).map(Title_Dictionary)

# To inspect the result: training["Title"].value_counts(), testing["Title"].value_counts()

In [None]:
# Build the modelling frames by dropping the columns that are not used as
# features. 'PassengerId' is deliberately kept in both frames (the submission
# cell reads test['PassengerId']); 'Died' exists only in the training frame.
train = training.drop(columns=['Name', 'Ticket', 'Cabin', 'Died'])
test = testing.drop(columns=['Name', 'Ticket', 'Cabin'])

for remaining in (train.columns, test.columns):
    print(remaining)

In [None]:
#  Convert Categorical Str Variables to Numbers

# One integer code per category value, keyed by column name.
category_codes = {
    'Sex': {'female': 0, 'male': 1},
    'Embarked': {'S': 0, 'C': 1, 'Q': 2},
    'Title': {'Miss': 0, 'Mrs': 1, 'Mr': 2, 'Master': 3, 'Officer': 4, 'Royalty': 5},
}

# The result is assigned back to the column instead of calling
# frame[column].replace(..., inplace=True): the in-place form acts on a
# temporary Series, which pandas flags as chained assignment and which does
# not update the frame once Copy-on-Write is enabled.
# Series.replace leaves values that are not keys untouched, so re-running
# this cell is harmless.
for frame in (train, test):
    for column, codes in category_codes.items():
        frame[column] = frame[column].replace(codes)

# To inspect the result: train.head(), test.info()

In [None]:
# The Age column still contains nulls. Each one is replaced by the mean age
# of the passenger's sex (Sex code 0 = female, 1 = male).

avg_age_f = train.loc[train['Sex'] == 0, 'Age'].mean()
avg_age_m = train.loc[train['Sex'] == 1, 'Age'].mean()

for sex_code, mean_age in ((0, avg_age_f), (1, avg_age_m)):
    age_missing_for_sex = train['Age'].isnull() & (train['Sex'] == sex_code)
    train.loc[age_missing_for_sex, 'Age'] = mean_age

# Null counts after the Age fill ...
print(train.isnull().sum())

# Drop the 2 remaining null values for simplicity
train.dropna(inplace=True)

# ... and after dropping the rows that still had a null.
print(train.isnull().sum())

In [None]:
# Null counts in the test frame before the Age fill.
print(test.isnull().sum())

# Fill the null Age rows with the per-sex mean ages computed from the
# training frame (avg_age_f / avg_age_m).
for sex_code, mean_age in ((0, avg_age_f), (1, avg_age_m)):
    age_missing_for_sex = test['Age'].isnull() & (test['Sex'] == sex_code)
    test.loc[age_missing_for_sex, 'Age'] = mean_age

# Null counts after the fill.
print(test.isnull().sum())

In [None]:
# Mean fare of the test frame, rounded to 2 decimals; used for null Fare rows.
avg_fare = round(test['Fare'].mean(), 2)

# Null Fare rows get the mean fare; null Title rows get code 0.
for column, filler in (('Fare', avg_fare), ('Title', 0)):
    test[column] = test[column].fillna(filler)

print(test.isnull().sum())

In [None]:
# Summary statistics of the cleaned training frame, shown as the cell output.
train.describe()

In [None]:
# Scaling the 'Age' and 'Fare' columns so each is on a scale from 0 to 1
scaler = MinMaxScaler()
scaled_columns = ['Age', 'Fare']

# Fit the scaler on the training data only, then apply that same fitted
# transformation to the test data. Re-fitting on the test frame would map
# identical ages/fares to different scaled values in the two frames, so the
# model would be evaluated on features it was not trained on.
# Test values outside the training min/max can fall slightly outside [0, 1].
train[scaled_columns] = scaler.fit_transform(train[scaled_columns])
test[scaled_columns] = scaler.transform(test[scaled_columns])

# To inspect the result: train.describe(), test.head()


In [None]:
# Feature columns, in the order the model is trained on.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title']

# Features are kept as floats. 'Age' and 'Fare' were scaled to the 0-1 range
# in the previous cell, so casting them to an integer type would truncate
# nearly every value to 0 and discard both features.
train_x = train[feature_columns].astype('float64')
train_y = train['Survived'].astype('int32')

# The same columns are selected by name, in the same order, for the test
# matrix so it always lines up with what the model was fitted on
# ('PassengerId' is thereby left out).
test_x = test[feature_columns].astype('float64')

# test is test_x, and test_y does not exist because Kaggle already split it up for me

# info() prints its report itself and returns None, so it is not wrapped in print().
test_x.info()

In [None]:
# BUILDING THE MODEL on my cleaned up 'train' Dataframe!
# A logistic regression with default settings, fitted on the training
# features (train_x) and the survival labels (train_y).

my_model = LogisticRegression()
my_model.fit(X=train_x, y=train_y)

In [None]:
# Predict the label of every row of test_x with the fitted model.
y_predicted = my_model.predict(test_x)
# Bare expression so the notebook displays the predictions as the cell output.
y_predicted

In [None]:
# Wrap the predictions in a one-column frame and print its structure.
# NOTE(review): `df` is not read by any of the later cells visible here.
df = pd.DataFrame(y_predicted, columns = ['Survived'])
df.info()

In [None]:
# Submission frame: the passenger ids of the test frame next to the predicted
# labels, both cast to int32.
submission_columns = {"PassengerId" : test['PassengerId'], "Survived" : y_predicted}
submission = pd.DataFrame(submission_columns).astype('int32')
submission.info()

In [None]:
# Write the submission to "submission.csv" without the DataFrame index column.
submission.to_csv("submission.csv", index=False)