In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [None]:
# Stack train.csv and test.csv into a single frame so the cleaning and feature
# steps below are applied to both files at once.
# ignore_index=True gives every row a unique label. Without it each file keeps
# its own 0..n-1 index, and the duplicated labels make any later index-aligned
# assignment (e.g. assigning a default-indexed Series as a column) line up
# against the wrong rows.
df = pd.concat([pd.read_csv('train.csv'), pd.read_csv('test.csv')], ignore_index=True)
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print() (that would emit a stray "None").
df.info()

In [None]:
# First rows of the combined frame. Left as the cell's last expression so the
# notebook renders it as a table, like the other bare-expression cells here.
df.head()

In [None]:
# Summary statistics of the combined frame, shown with the notebook's table
# rendering instead of a plain-text print.
df.describe()

In [None]:
# Survival rate per sex. 'Survived' is compared against 1 and 0 throughout this
# notebook, so the group mean is the share of survivors in each group.
# A raw count of survivors would mostly reflect how many passengers of each sex
# there were, not their chance of surviving. Rows whose 'Survived' is missing
# are ignored by mean().
survived_sex = df.groupby('Sex')['Survived'].mean()
survived_sex.plot(kind='bar',
                  color=['green', 'red'],
                  title='Who had more chances to survive by sex?')
print(survived_sex)

In [None]:
# Survival rate per passenger class, highest rate first. The mean of the 1/0
# 'Survived' column is used rather than a count of survivors so that classes of
# different sizes can be compared; rows with a missing 'Survived' are skipped.
survived_pclass = df.groupby('Pclass')['Survived'].mean().sort_values(ascending=False)
survived_pclass.plot(kind='bar', color=['green', 'blue', 'red'],
                     title='Who had more chances to survive by passenger class?')
print(survived_pclass)

In [None]:
# Known ages of the passengers who survived (Survived == 1) and of those who
# did not (Survived == 0); missing ages are dropped.
survived = df.loc[df["Survived"] == 1, "Age"].dropna()
perished = df.loc[df["Survived"] == 0, "Age"].dropna()

# Use one set of bin edges for both panels. With the default bins each
# histogram would choose edges from its own data range, so bars in the two
# panels would cover different age intervals and could not be compared.
all_ages = pd.concat([survived, perished])
age_bins = np.linspace(all_ages.min(), all_ages.max(), 11)  # 10 equal-width bins

fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
fig.subplots_adjust(hspace=1)  # vertical gap so titles and x labels do not collide

panels = (
    (ax1, survived, 'green', "Survived"),
    (ax2, perished, 'red', "Dead"),
)
for ax, ages, colour, panel_title in panels:
    ax.hist(ages, bins=age_bins, facecolor=colour)
    ax.set_title(panel_title)
    ax.set_xlabel("Age")
    ax.set_ylabel("Amount")
plt.show()

In [None]:
# Number of passengers per port of embarkation (missing values dropped).
non_empty_embarked = df["Embarked"].dropna()
value_counts = non_empty_embarked.value_counts()
# Take the tick labels from the counts themselves. unique() returns values in
# order of first appearance while value_counts() sorts by frequency, so pairing
# the two independently can attach a label to the wrong bar.
unique_values = value_counts.index
X = range(len(value_counts))
colors = ["red", "blue", "grey"]

# The bar positions are passed positionally: the keyword was called `left` in
# old matplotlib and is `x` now, and current releases reject `left=`.
plt.bar(X,
        value_counts.values,
        color=colors,
        tick_label=unique_values)
plt.xlabel("Port of Embarkation")
plt.ylabel("Amount of embarked")
plt.title("Bar plot of embarked in Southampton, Queenstown, Cherbourg")
plt.show()

In [None]:
# Scatter of passenger age against fare on a square 6x6 figure.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df['Age'], df['Fare'], s=20, c='r', marker='o', alpha=0.3)
ax.set_xlabel('Age of passengers, yo')
ax.set_ylabel('The passenger fares')
plt.show()

In [None]:
# Scatter of passenger age against the SibSp column on a square 6x6 figure.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df['Age'], df['SibSp'], s=7, c='b', marker='o', alpha=0.3)
ax.set_xlabel('Age of passengers, yo')
ax.set_ylabel('The having of brothers and sisters')
plt.show()

In [None]:
# Fill the gaps in Embarked / Age / Fare.
# The result is assigned back to the column instead of calling
# df[col].fillna(..., inplace=True): that form operates on an intermediate
# Series, which pandas warns about and which, with copy-on-write enabled,
# leaves df itself unchanged.
df["Embarked"] = df["Embarked"].fillna("S")
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].median())

In [None]:
# family_size = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themself - confirm against the data dictionary).
df['family_size'] = df['SibSp'] + df['Parch'] + 1
# Mean of the 1/0 'Survived' column per family size, i.e. the survival rate,
# highest first.
family_size = df.groupby('family_size')['Survived'].mean().sort_values(ascending=False)
print(family_size)
family_size.plot(kind='bar', title='Who had more chances to survive by the family size')

In [None]:
# Pair each passenger's age with their fare as an (n_rows, 2) array.
# np.hstack on two 1-D arrays would just append the fares after the ages in one
# long 1-D array, so the first ten entries printed below would all be ages.
age_fare = np.column_stack([df.Age.values, df.Fare.values])
print(age_fare[:10])

In [None]:
### Cluster the passengers on (Age, Fare).
# age_fare_clusters holds the two feature columns fed to KMeans; the resulting
# cluster labels are available afterwards as kmeans.labels_.
age_fare_clusters = df[['Age', 'Fare']].values

# random_state pins the centroid initialisation so the labels are the same on
# every run of the notebook (42 matches the seed used for train_test_split).
kmeans = KMeans(n_clusters=16, random_state=42)
kmeans.fit(age_fare_clusters)

# One print per fact: the previous single string used backslash continuations
# inside the literal, which leaked the source indentation into the output.
print('\n Coordinates of cluster centers:\n{}.'.format(kmeans.cluster_centers_))
print(' Labels of each point:{}.'.format(kmeans.labels_))
print(' The value of the inertia criterion associated with the chosen partition: {}.'
      .format(kmeans.inertia_))
print(' The inertia is defined as the sum of squared distances of samples '
      'to their closest cluster center.')

In [None]:
# Store the KMeans label of every row as a new column and peek at the first ten.
cluster_labels = kmeans.labels_
df['age_fare_clusters'] = cluster_labels
print(df['age_fare_clusters'][:10])

In [None]:
# Numeric part of each ticket: the last space-separated token of 'Ticket'.
# It is derived locally because the df.tickets column is only created in a
# later cell, so reading it here fails on a fresh top-to-bottom run.
# errors='coerce' turns tokens that are not numbers into NaN, which scatter()
# simply leaves out.
ticket_numbers = pd.to_numeric(df['Ticket'].str.split(' ').str[-1], errors='coerce')

plt.figure(figsize=(6,6))
plt.scatter(ticket_numbers, df.Fare, marker='o', s=7, c='g', alpha=.3)
plt.xlabel('tickets number')
plt.ylabel('The fare')
plt.show()

In [None]:
# Number of characters in each passenger's name. The vectorised .str.len() is
# used instead of .apply(len) so a missing name yields NaN rather than raising
# TypeError.
df['name_length'] = df['Name'].str.len()

In [None]:
# First rows after the new feature columns were added; left as the last
# expression so the notebook renders a table instead of wrapped plain text.
df.head()

In [None]:
# Average name length for each value of 'Survived', largest average first.
(
    df.groupby('Survived')['name_length']
      .mean()
      .sort_values(ascending=False)
)

In [None]:
# Add binary (one-hot) indicator features:
def get_dummie_columns(df, name_column):
    """Return ``df`` with indicator columns for ``name_column`` appended.

    One new column is created per distinct value found in ``df[name_column]``,
    named ``<name_column>_<value>``. The original column is kept and the
    input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``name_column``.
    name_column : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame: all columns of ``df`` followed by the indicator columns.
    """
    indicators = pd.get_dummies(df[name_column], prefix=name_column)
    return pd.concat([df, indicators], axis=1)

In [None]:
# Columns to replace with their one-hot indicator columns.
name_column = ['Sex','Pclass', 'Embarked', 'family_size']

# For each column: append its indicator columns, then remove the original.
for col in name_column:
    df = get_dummie_columns(df, col).drop(col, axis=1)

In [None]:
# Print the column/dtype/non-null summary of df as it stands at this point.
df.info()

In [None]:
# Raw 'Ticket' values as an array, with a look at the first ten entries.
tickets = df.Ticket.values
print(tickets[0:10])

In [None]:
# Numeric ticket feature: the last space-separated token of 'Ticket'.
# The token is taken with the .str accessor so the result carries df's own
# index. Building a fresh pd.Series from a list gives it a 0..n-1 index, which
# pandas then aligns against df's row labels - and when those labels are
# duplicated, rows silently receive another row's value.
ticket_tail = df['Ticket'].str.split(' ').str[-1]
# errors='coerce' maps tokens that are not numbers to NaN instead of letting
# the integer conversion raise ValueError; those rows get the sentinel 0 so the
# column stays integer-typed.
df['tickets'] = pd.to_numeric(ticket_tail, errors='coerce').fillna(0).astype(int)
print(df.tickets[:10])

In [None]:
# Scatter of fare against the numeric ticket feature on a square 6x6 figure.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df['Fare'], df['tickets'], s=7, c='k', marker='o', alpha=0.7)
ax.set_xlabel('fare')
ax.set_ylabel('tickets')
plt.show()

In [None]:
# Summary statistics of df at this stage, displayed as the cell output.
df.describe()

In [None]:
# Rows at positions 50..699 of the 'Cabin' and 'Survived' columns.
df[['Cabin', 'Survived']].iloc[50:700]

In [None]:
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only adds a stray "None" line.
df.info()

In [None]:
# Rows whose 'Survived' is 0 or 1 form the training set.
# .copy() makes train an independent frame, so dropping columns from it later
# neither triggers SettingWithCopyWarning nor depends on whether pandas handed
# back a view of df.
train = df[df.Survived.isin([0, 1])].copy()
train.head()

In [None]:
# Target vector: the 'Survived' column of the training rows as an array.
survived = train['Survived'].values
print(survived.shape)
print(survived[0:10])

In [None]:
# Keep only the feature columns in train.
# 'Cabin', 'Ticket' and 'Name' are removed here as well: train was taken from
# df before those columns are dropped from df, so without this train keeps
# string columns that test does not have and the feature matrices stop lining
# up (and StandardScaler cannot fit on the strings).
# errors='ignore' keeps the cell re-runnable and tolerant of columns that are
# already gone; assigning the result avoids an in-place drop on a frame sliced
# out of df.
train = train.drop(['PassengerId', 'Survived', 'Cabin', 'Ticket', 'Name'],
                   axis=1, errors='ignore')
train.head()

In [None]:
# info() prints its own report and returns None, so it is called directly
# rather than through print().
df.info()

In [None]:
# Remove the Cabin, Ticket and Name columns from df in place.
dropped_columns = ['Cabin', 'Ticket', 'Name']
df.drop(labels=dropped_columns, axis='columns', inplace=True)

In [None]:
# Rows whose 'Survived' is neither 0 nor 1 (i.e. missing) are the rows to
# predict. .copy() makes test independent of df so columns can be dropped from
# it later without SettingWithCopyWarning.
test = df[~df.Survived.isin([0, 1])].copy()

In [None]:
# Keep the passenger ids of the rows to predict before the column is dropped.
pass_id = test.loc[:, 'PassengerId']
print(type(pass_id))
print(pass_id.iloc[:5])

In [None]:
# Keep only the feature columns in test. The result is assigned instead of
# using inplace=True because test was sliced out of df, and an in-place drop on
# such a slice raises SettingWithCopyWarning. errors='ignore' lets the cell be
# re-run after the columns are already gone.
test = test.drop(['PassengerId', 'Survived'], axis=1, errors='ignore')
test.head()

In [None]:
# Plain arrays for modelling: training features, training labels, and the
# features of the rows to predict.
X, y, X_pred = train.values, survived, test.values
shape_report = "Shape X: {0}. Shape y: {1}. Shape X_pred : {2}"
print(shape_report.format(X.shape, y.shape, X_pred.shape))

In [None]:
# Standardise the features. StandardScaler is imported in the notebook's first
# cell together with the other imports.
# NOTE(review): the scaler is fitted on all of X before the train/validation
# split below, so the held-out rows influence the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply the same fitted scaling to the rows to predict, so that a model trained
# on X_scaled sees them on the same scale.
X_pred_scaled = scaler.transform(X_pred)

In [None]:
# Hold out 20% of the scaled training data for testing; random_state fixes the
# shuffle. train_test_split is imported in the notebook's first cell together
# with the other imports.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.2, random_state=42)
print("Shape X_train: {}. Shape y_train: {}. \nShape X_test : {}. Shape y_test : {}"
      .format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

In [None]:
# Split the current training portion once more into train / validation parts.
# X_train and y_train are rebound to the smaller training part, so re-running
# this cell splits the already-reduced arrays again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    random_state=144
)

In [None]:
# Report the shapes of the training and validation arrays.
split_shapes = (X_train.shape, y_train.shape, X_val.shape, y_val.shape)
print("Shape X_train: {}. Shape y_train: {}. \nShape X_val : {}. Shape y_val : {}".format(*split_shapes))

In [None]:
# Load the sample submission; its first column becomes the index.
submission = pd.read_csv('gender_submission.csv', index_col=0, header=0)
# info() prints its report itself and returns None, so it is not wrapped in
# print().
submission.info()
print(submission.head(10))

In [None]:
# Overwrite the template's 'Survived' column with the predictions and save it.
# NOTE(review): y_pred is not assigned in any cell visible in this notebook, so
# this cell raises NameError on a fresh top-to-bottom run until the cell that
# fits a model and predicts on the test rows is restored.
submission['Survived'] = y_pred
# info() prints its report itself and returns None, so it is not wrapped in
# print().
submission.info()
print(submission.head(10))
submission.to_csv('submission.csv')

In [None]:
# Record the scikit-learn version the notebook was run with. sklearn is
# imported in the notebook's first cell together with the other imports.
print(sklearn.__version__)