# Pandas 基础
---

In [9]:
import pandas as pd
import numpy as np

titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
age = titanic_survival["Age"]
print(age.loc[0:10])

0     22.0
1     38.0
2     26.0
3     35.0
4     35.0
5      NaN
6     54.0
7      2.0
8     27.0
9     14.0
10     4.0
Name: Age, dtype: float64


In [11]:
# 查看缺失值
age_is_null = pd.isnull(age)
print(age_is_null)

age_null_true = age[age_is_null]
print(age_null_true)
print(len(age_null_true))

0      False
1      False
2      False
3      False
4      False
5       True
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17      True
18     False
19      True
20     False
21     False
22     False
23     False
24     False
25     False
26      True
27     False
28      True
29      True
       ...  
861    False
862    False
863     True
864    False
865    False
866    False
867    False
868     True
869    False
870    False
871    False
872    False
873    False
874    False
875    False
876    False
877    False
878     True
879    False
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888     True
889    False
890    False
Name: Age, Length: 891, dtype: bool
5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
29    NaN
31    NaN
32    NaN
36    NaN
42    NaN
45    NaN
46    NaN
47    NaN
48    NaN
55    NaN
64    NaN
65    NaN
7

In [12]:
# 缺失值处理
mean_age = sum(titanic_survival["Age"])/len(titanic_survival["Age"])
# 因为有缺失值，所以平均值为nan
print(mean_age)

nan


In [14]:
# mean for each class
passenger_classes = [1,2,3]
fares_by_class = {}

for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)

{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}


In [17]:
# 数据透视表
passenger_survival = titanic_survival.pivot_table(index="Pclass",values="Survived",aggfunc=np.mean)
print(passenger_survival)

        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363


In [18]:
passenger_age = titanic_survival.pivot_table(index="Pclass",values="Age")
print(passenger_age)

              Age
Pclass           
1       38.233441
2       29.877630
3       25.140620


In [21]:
fare_per_port = titanic_survival.pivot_table(index="Embarked",values="Fare")
print(fare_per_port)
port_stats = titanic_survival.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc=np.sum)
print(port_stats)

               Fare
Embarked           
C         59.954144
Q         13.276030
S         27.079812
                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217


In [None]:
# 
drop_na_columns = titanic_survival.dropna(axis=1)
new_titanic_survived = titanic_survival.dropna(axis=0,subset=["Age","Sex"])

In [20]:
# 定为到具体的值
row_index_83_age = titanic_survival.loc[83,"Age"]
print(row_index_83_age)

28.0


In [23]:
# 重置索引
new_titanic = titanic_survival.sort_values("Age",ascending=False)
print(new_titanic[0:10])
titanic_reset_index = new_titanic.reset_index(drop=True)
print(titanic_reset_index[:10])

     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
630  male  80.0      0      0       27042  30.0000   A23        S  
851  male  74.0      0     

# pandas自定义函数
---

In [25]:
# 自定义函数
def handreTh_row(column):
    handreth_item = column.loc[99]
    return handreth_item

handreth_row = titanic_survival.apply(handreTh_row)
print(handreth_row)

PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object


In [31]:
# 缺失值个数
def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)

column_null_count = titanic_survival.apply(not_null_count,axis=0)
print(column_null_count)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [29]:
# onehot编码
def which_class(row):
    pClass = row["Pclass"]
    if pd.isnull(pClass):
        return "unKnown"
    elif pClass == 1:
        return "First Class"
    elif pClass == 2:
        return "Second Class"
    elif pClass == 3:
        return "Third Class"

classes = titanic_survival.apply(which_class,axis=1)
print(classes)

0       Third Class
1       First Class
2       Third Class
3       First Class
4       Third Class
5       Third Class
6       First Class
7       Third Class
8       Third Class
9      Second Class
10      Third Class
11      First Class
12      Third Class
13      Third Class
14      Third Class
15     Second Class
16      Third Class
17     Second Class
18      Third Class
19      Third Class
20     Second Class
21     Second Class
22      Third Class
23      First Class
24      Third Class
25      Third Class
26      Third Class
27      First Class
28      Third Class
29      Third Class
           ...     
861    Second Class
862     First Class
863     Third Class
864    Second Class
865    Second Class
866    Second Class
867     First Class
868     Third Class
869     Third Class
870     Third Class
871     First Class
872     First Class
873     Third Class
874    Second Class
875     Third Class
876     Third Class
877     Third Class
878     Third Class
879     First Class


In [37]:
# 对年龄进行分段
def is_minor(row):
    if row["Age"] < 18:
        return True
    else:
        return False


minors = titanic_survival.apply(is_minor,axis=1)
# print(minors)

def generate_age_label(row):
    age = row["Age"]
    if pd.isnull(age):
        return "unKnown"
    elif age < 18:
        return "Minor"
    else:
        return "adult"
age_labels = titanic_survival.apply(generate_age_label,axis=1)
print(age_labels)

0        adult
1        adult
2        adult
3        adult
4        adult
5      unKnown
6        adult
7        Minor
8        adult
9        Minor
10       Minor
11       adult
12       adult
13       adult
14       Minor
15       adult
16       Minor
17     unKnown
18       adult
19     unKnown
20       adult
21       adult
22       Minor
23       adult
24       Minor
25       adult
26     unKnown
27       adult
28     unKnown
29     unKnown
        ...   
861      adult
862      adult
863    unKnown
864      adult
865      adult
866      adult
867      adult
868    unKnown
869      Minor
870      adult
871      adult
872      adult
873      adult
874      adult
875      Minor
876      adult
877      adult
878    unKnown
879      adult
880      adult
881      adult
882      adult
883      adult
884      adult
885      adult
886      adult
887      adult
888    unKnown
889      adult
890      adult
Length: 891, dtype: object


In [39]:
titanic_survival["age_labels"] = age_labels
age_group_survived = titanic_survival.pivot_table(index="age_labels",values="Survived")
print(age_group_survived)

            Survived
age_labels          
Minor       0.539823
adult       0.381032
unKnown     0.293785
