In [60]:
# 引入库包
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import time
import warnings
warnings.filterwarnings('ignore')

In [61]:
df_train = pd.read_csv('data/train.csv',header=None)
df_train.columns = ['PassengerId','Survived','Pclass','Name','Sex','Age',"SibSp","Parch","Ticket","Fare","Cabin","Embarked"]
df_train.drop(index=0,inplace=True)
df_train.head()
#PassengerId:乘客编号
#Survived：存活情况（存活：1，死亡：0）
#Pclass：客舱等级
#Name：乘客姓名
#Sex：性别
#Age：年龄
#SibSp：同乘的兄弟姐妹/配偶数
#Parch：同乘的父母/小孩数
#Ticket：船票编号
#Fare：船票价格
#Cabin：客舱号
#Embarked：登船港口

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [62]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 12 columns):
PassengerId    891 non-null object
Survived       891 non-null object
Pclass         891 non-null object
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null object
SibSp          891 non-null object
Parch          891 non-null object
Ticket         891 non-null object
Fare           891 non-null object
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: object(12)
memory usage: 90.5+ KB


In [63]:
df_train.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_train.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
PassengerId,891,891,455,1,0.0
Survived,891,2,0,549,0.0
Pclass,891,3,3,491,0.0
Name,891,891,"Andrews, Mr. Thomas Jr",1,0.0
Sex,891,2,male,577,0.0
Age,714,88,24,30,0.198653
SibSp,891,7,0,608,0.0
Parch,891,7,0,678,0.0
Ticket,891,681,1601,7,0.0
Fare,891,248,8.05,43,0.0


In [64]:
#姓名特征处理
df_train['Title'] = df_train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

title_Dict = {}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(dict.fromkeys(['Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
title_Dict.update(dict.fromkeys(['Master','Jonkheer'], 'Master'))

df_train['Title'] = df_train['Title'].map(title_Dict)

通过对Ticket数据的分析，我们可以看到部分票号数据有重复，同时结合亲属人数及名字的数据，和票价船舱等级对比，我们可以知道购买的票中有家庭票和团体票，所以我们需要将团体票的票价分配到每个人的头上

In [65]:
#船票编号特征处理
df_train['Fare'] = df_train['Fare'].astype("float")
df_train['Group_Ticket'] = df_train['Fare'].groupby(by=df_train['Ticket']).transform('count')
df_train['Fare'] = df_train['Fare'] / df_train['Group_Ticket']
df_train.drop(['Group_Ticket'], axis=1, inplace=True)

In [66]:
df_train.drop("PassengerId",1,inplace=True)
df_train.drop("Name",1,inplace=True)
df_train.drop("Ticket",1,inplace=True)

In [67]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
1,0,3,male,22,1,0,7.25,,S,Mr
2,1,1,female,38,1,0,71.2833,C85,C,Mrs
3,1,3,female,26,0,0,7.925,,S,Miss
4,1,1,female,35,1,0,26.55,C123,S,Mrs
5,0,3,male,35,0,0,8.05,,S,Mr


In [68]:
df_train.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_train.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
Survived,891,2,0,549,0.0
Pclass,891,3,3,491,0.0
Sex,891,2,male,577,0.0
Age,714,88,24,30,0.198653
SibSp,891,7,0,608,0.0
Parch,891,7,0,678,0.0
Cabin,204,147,G6,4,0.771044
Embarked,889,3,S,644,0.002245
Title,890,6,Mr,517,0.001122


In [69]:
df_train.drop("Cabin",1,inplace=True)

In [70]:
df_train.Embarked.isnull().value_counts()

False    889
True       2
Name: Embarked, dtype: int64

In [71]:
#删除数据集中某行的特征
def drop_feature_null_rows(df,feature):
    df[feature] = df[feature].fillna("999")#登船港口
    null_rows_index = df[(df[feature] == "999")].index.tolist()
    return df.drop(null_rows_index)

#删除登船港口的空行
df_train =drop_feature_null_rows(df_train,'Embarked')
df_train.shape

(889, 9)

In [72]:
df_train.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_train.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
Survived,889,2,0,549,0.0
Pclass,889,3,3,491,0.0
Sex,889,2,male,577,0.0
Age,712,88,24,30,0.1991
SibSp,889,7,0,606,0.0
Parch,889,7,0,676,0.0
Embarked,889,3,S,644,0.0
Title,888,6,Mr,517,0.001125


In [73]:
df_train = pd.get_dummies(df_train,columns = ["Sex","Pclass","Embarked","Title"])
df_train.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
1,0,22,1,0,7.25,0,1,0,0,1,0,0,1,0,0,1,0,0,0
2,1,38,1,0,71.2833,1,0,1,0,0,1,0,0,0,0,0,1,0,0
3,1,26,0,0,7.925,1,0,0,0,1,0,0,1,0,1,0,0,0,0
4,1,35,1,0,26.55,1,0,1,0,0,0,0,1,0,0,0,1,0,0
5,0,35,0,0,8.05,0,1,0,0,1,0,0,1,0,0,1,0,0,0


In [74]:
df_train.columns

Index(['Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
       'Title_Officer', 'Title_Royalty'],
      dtype='object')

In [75]:
df_train.fillna(0,inplace=True)

In [76]:
from sklearn import preprocessing

#归一化处理
min_max_scaler = preprocessing.MinMaxScaler()
X_temp = min_max_scaler.fit_transform(df_train[["Fare","SibSp","Parch","Age"]])
#标准化处理
df_train[["Fare","SibSp","Parch","Age"]] = preprocessing.scale(X_temp)

# df_train.head()

In [77]:
Y = df_train.Survived
X = df_train.drop('Survived',1,inplace=False)

In [78]:
X.shape

(889, 18)

In [79]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

## NN

In [21]:
num_examples = len(x_train) # 样本数
nn_input_dim = 18 # 输入的维度
nn_output_dim = 2 # 输出的类别个数
 
# 梯度下降参数
epsilon = 0.01 # 学习率
reg_lambda = 0.01 # 正则化参数

# 定义损失函数(才能用梯度下降啊...)
def calculate_loss(model):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # 向前推进，前向运算
    z1 = x_train.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    exp_scores = np.exp(z2)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    # 计算损失
    corect_logprobs = -np.log(probs[range(num_examples), y_train])
    data_loss = np.sum(corect_logprobs)
    # 也得加一下正则化项
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1./num_examples * data_loss

In [32]:
# 完整的训练建模函数定义
def build_model(nn_hdim, num_passes=2, print_loss=False):
    '''
    参数：
    1) nn_hdim: 隐层节点个数
    2）num_passes: 梯度下降迭代次数
    3）print_loss: 设定为True的话，每1000次迭代输出一次loss的当前值
    '''
    # 随机初始化一下权重呗
    np.random.seed(0)
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))
 
    # 这是咱们最后学到的模型
    model = {}
     
    # 开始梯度下降...
    for i in range(0, num_passes):
 
        # 前向运算计算loss
        print(x_train.shape,W1.shape,b1.shape)
        z1 = x_train.dot(W1)
        z1 = np.array(z1)
        b1 = np.array(b1)
        z1 = z1 + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        exp_scores = np.exp(z2)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
 
        # 反向传播
        delta3 = probs
        delta3[range(num_examples),y_train] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)
 
        # 加上正则化项
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
 
        # 梯度下降更新参数
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
         
        # 得到的模型实际上就是这些权重
        model = { 'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
         
        # 如果设定print_loss了，那我们汇报一下中间状况
        if print_loss and i % 1000 == 0:
              print ("Loss after iteration %i: %f" %(i, calculate_loss(model)))
     
    return model

In [33]:
# 判定结果的函数
def predict(model, x):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # 前向运算
    z1 = x.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    exp_scores = np.exp(z2)
    # 计算概率输出最大概率对应的类别
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)

In [34]:
# 建立隐层有3个节点(神经元)的神经网络
model = build_model(3, print_loss=True)
 
# 然后再把决策
print(predict(model, x_test))

(622, 18) (18, 3) (1, 3)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

# LR 模型训练

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x_train, y_train)

print(u'LR模型的平均正确率为：%s' % lr.score(x_test, y_test))

# 预测

In [None]:
df_test = pd.read_csv('data/test.csv',header=None)
df_test.columns = ['PassengerId','Pclass','Name','Sex','Age',"SibSp","Parch","Ticket","Fare","Cabin","Embarked"]
df_test.drop(index=0,inplace=True)

In [None]:
df_temp = df_test.copy()

In [None]:
df_test.drop("PassengerId",1,inplace=True)

df_test['Title'] = df_test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
df_test['Title'] = df_test['Title'].map(title_Dict)
df_test.drop("Name",1,inplace=True)


df_test.drop("Cabin",1,inplace=True)

df_test['Fare'] = df_test['Fare'].astype("float")
df_test['Fare'] = df_test[['Fare']].fillna(df_test.groupby('Pclass').transform(np.mean))

#船票编号特征处理
df_test['Fare'] = df_test['Fare'].astype("float")
df_test['Group_Ticket'] = df_test['Fare'].groupby(by=df_test['Ticket']).transform('count')
df_test['Fare'] = df_test['Fare'] / df_test['Group_Ticket']
df_test.drop(['Group_Ticket'], axis=1, inplace=True)
df_test.drop("Ticket",1,inplace=True)

df_test = pd.get_dummies(df_test,columns = ["Sex","Pclass","Embarked","Title"])
df_test.fillna(0,inplace=True)

print(df_test.columns)

In [None]:
from sklearn import preprocessing

#归一化处理
min_max_scaler = preprocessing.MinMaxScaler()
X_temp = min_max_scaler.fit_transform(df_test[["Fare","SibSp","Parch","Age"]])

#标准化处理
df_test[["Fare","SibSp","Parch","Age"]] = preprocessing.scale(X_temp)

In [None]:
Predict = lr.predict(df_test)

In [None]:
df_test.columns

In [None]:
# submission
df = pd.DataFrame({"PassengerId": df_temp["PassengerId"].values, "Survived": Predict})
df.to_csv("gender_submission.csv", index=False)

In [29]:
a = np.array([[1,2],[3,4],[5,6]])
b = np.array([1,1])
c = a+b
c

array([[2, 3],
       [4, 5],
       [6, 7]])