In [1]:
import urllib.request
import os

data_url = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
data_file_path = "data/titanic3.xls"

if not os.path.isfile(data_file_path):
    result = urllib.request.urlretrieve(data_url, data_file_path)
    print('downloaded:', result)
else :
    print(data_file_path, 'data file already exists.')
    


data/titanic3.xls data file already exists.


In [2]:
import numpy
import pandas as pd

# 读取数据文件，结果是dataFrame
df_data = pd.read_excel(data_file_path)

In [3]:
# 查看数据摘要
df_data.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [4]:
# 筛选字段
# 提取有用的特征字段
selected_cols = ['survived', 'name','pclass','sex','age','sibsp','parch','fare','embarked']

selected_df_data = df_data[selected_cols]

print(selected_df_data)

      survived                                               name  pclass  \
0            1                      Allen, Miss. Elisabeth Walton       1   
1            1                     Allison, Master. Hudson Trevor       1   
2            0                       Allison, Miss. Helen Loraine       1   
3            0               Allison, Mr. Hudson Joshua Creighton       1   
4            0    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)       1   
5            1                                Anderson, Mr. Harry       1   
6            1                  Andrews, Miss. Kornelia Theodosia       1   
7            0                             Andrews, Mr. Thomas Jr       1   
8            1      Appleton, Mrs. Edward Dale (Charlotte Lamson)       1   
9            0                            Artagaveytia, Mr. Ramon       1   
10           0                             Astor, Col. John Jacob       1   
11           1  Astor, Mrs. John Jacob (Madeleine Talmadge Force)       1   

![数据处理](数据处理.png)

In [5]:
# pandas判断元素是否为null的方法： isnull()
# selected_df_data.isnull()
print(selected_df_data.isnull().any())
print(selected_df_data.isnull().sum())

survived    False
name        False
pclass      False
sex         False
age          True
sibsp       False
parch       False
fare         True
embarked     True
dtype: bool
survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64


In [6]:
# 含有空值的列
selected_df_data[selected_df_data.isnull().values==True]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
15,0,"Baumann, Mr. John D",1,male,,0,0,25.9250,S
37,1,"Bradley, Mr. George (""George Arthur Brayton"")",1,male,,0,0,26.5500,S
40,0,"Brewe, Dr. Arthur Jackson",1,male,,0,0,39.6000,C
46,0,"Cairns, Mr. Alexander",1,male,,0,0,31.0000,S
59,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",1,female,,0,0,27.7208,C
69,1,"Chibnall, Mrs. (Edith Martha Bowerman)",1,female,,0,1,55.0000,S
70,0,"Chisholm, Mr. Roderick Robert Crispin",1,male,,0,0,0.0000,S
74,0,"Clifford, Mr. George Quincy",1,male,,0,0,52.0000,S
80,0,"Crafton, Mr. John Bertram",1,male,,0,0,26.5500,S
106,0,"Farthing, Mr. John",1,male,,0,0,221.7792,S


In [7]:
# 为age缺失记录填充平均值
# fillna 只会填充空值

age_mean_value = selected_df_data['age'].mean()
selected_df_data['age'] = selected_df_data['age'].fillna(age_mean_value)

# for fare
fare_mean_value = selected_df_data['fare'].mean()
selected_df_data['fare'] = selected_df_data['fare'].fillna(fare_mean_value)

# 港口随便填个s
selected_df_data['embarked'] = selected_df_data['embarked'].fillna('S')

# sex to int
selected_df_data['sex'] = selected_df_data['sex'].map({'female':0, 'male':1}).astype(int)
# 港口 to int
selected_df_data['embarked'] = selected_df_data['embarked'].map({'C':0, 'Q':1, 'S':2}).astype(int)

# 版本警告可以无视

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stab

In [8]:
# 删除name字段 axis=1按列
selected_df_data = selected_df_data.drop(['name'],axis=1)
print(selected_df_data[:3])


   survived  pclass  sex      age  sibsp  parch      fare  embarked
0         1       1    0  29.0000      0      0  211.3375         2
1         1       1    1   0.9167      1      2  151.5500         2
2         0       1    0   2.0000      1      2  151.5500         2


In [9]:
# 转换为ndarray数组
ndarray_data = selected_df_data.values

# 分离特征值和标签值
# 后7列是特征值
features = ndarray_data[:,1:]

# 第0列是标签值
label = ndarray_data[:,0]

print(features[:3])
print(label[:3])

[[   1.        0.       29.        0.        0.      211.3375    2.    ]
 [   1.        1.        0.9167    1.        2.      151.55      2.    ]
 [   1.        0.        2.        1.        2.      151.55      2.    ]]
[ 1.  1.  0.]


In [10]:
# 特征值归一化
from sklearn import preprocessing

# 特征值标准化
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
norm_features = minmax_scale.fit_transform(features)

print(norm_features[:3])

[[ 0.          0.          0.36116884  0.          0.          0.41250333
   1.        ]
 [ 0.          1.          0.00939458  0.125       0.22222222  0.2958059
   1.        ]
 [ 0.          0.          0.0229641   0.125       0.22222222  0.2958059
   1.        ]]
