# Предобработка данных в Pandas

In [120]:
import numpy as np

In [8]:
M = np.random.normal(1, 10, (1000,50))
M = (M - np.mean(M, axis=0)) / np.std(M, axis=0)
print(M)

[[-0.04796081  0.43443634 -0.51316078 ...  1.22389937 -0.23577401
  -0.10168708]
 [-1.70617401  0.96022103 -1.3437397  ...  0.69756942  0.02468283
  -1.95343014]
 [ 0.61186478  1.2206675  -0.14198527 ... -0.63030898 -1.03771604
   0.85872176]
 ...
 [-0.84729404  0.01350363 -1.60255026 ...  0.48017629 -0.66736668
   0.27092034]
 [-0.29126457 -0.21207428 -0.16134119 ... -0.20473781 -0.86530246
  -0.16755906]
 [-0.6863361   0.89069578 -0.00963765 ... -1.2030911   1.14522964
  -1.49880762]]


In [11]:
Z = np.array([[4, 5, 0], 
             [1, 9, 3],              
             [5, 1, 1],
             [3, 3, 3], 
             [9, 9, 9], 
             [4, 7, 1]])
X = np.sum(Z, axis=1)
print(X)
A = np.nonzero(X > 10)
print(A)

[ 9 13  7  9 27 12]
(array([1, 4, 5]),)


In [20]:
X = np.eye(3)
Y = np.eye(3)
print(X)
M = np.vstack((X,Y))
print(M)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [22]:
import pandas as pd

In [25]:
data = pd.read_csv('data/titanic.csv', index_col='PassengerId')
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [30]:
data['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [77]:
X = data[(data['Survived'] == 1)].count()
Y = data.shape[0]
print(X/Y)

surv_counts = data['Survived'].value_counts()
surv_percent = 100 * surv_counts[1] / surv_counts.sum()
print(2, "{:0.2f}".format(surv_percent))

Survived    0.383838
Pclass      0.383838
Name        0.383838
Sex         0.383838
Age         0.325477
SibSp       0.383838
Parch       0.383838
Ticket      0.383838
Fare        0.383838
Cabin       0.152637
Embarked    0.381594
dtype: float64
2 38.38


In [55]:
X = data['Age'].mean()
print(X)
X = data['Age'].median()
print(X)

29.69911764705882
28.0


In [57]:
data['SibSp'].corr(data['Parch'])

0.41483769862015624

In [75]:
data[data['Sex'] == 'female']['Name']

PassengerId
2      Cumings, Mrs. John Bradley (Florence Briggs Th...
3                                 Heikkinen, Miss. Laina
4           Futrelle, Mrs. Jacques Heath (Lily May Peel)
9      Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
10                   Nasser, Mrs. Nicholas (Adele Achem)
11                       Sandstrom, Miss. Marguerite Rut
12                              Bonnell, Miss. Elizabeth
15                  Vestrom, Miss. Hulda Amanda Adolfina
16                      Hewlett, Mrs. (Mary D Kingcome) 
19     Vander Planke, Mrs. Julius (Emelia Maria Vande...
20                               Masselmani, Mrs. Fatima
23                           McGowan, Miss. Anna "Annie"
25                         Palsson, Miss. Torborg Danira
26     Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...
29                         O'Dwyer, Miss. Ellen "Nellie"
32        Spencer, Mrs. William Augustus (Marie Eugenie)
33                              Glynn, Miss. Mary Agatha
39                 

In [122]:
data['Pclass'].value_counts()
216 / data.shape[0]


pclass_counts = data['Pclass'].value_counts()
pclass_percent = 100.0 * pclass_counts[1] / pclass_counts.sum()
print(pclass_percent)

24.242424242424242


In [162]:
import re
def get_name(name):

    s = re.search('^[^,]+, (.*)', name)
#     print(s)
    if s:
        name = s.group(1)


    s = re.search('\(([^)]+)\)', name)
#     print(s)
    if s:
        name = s.group(1)


    name = re.sub('(Miss\. |Mrs\. |Ms\. )', '', name)


    name = name.split(' ')[0].replace('"', '')

    return name


names = data[data['Sex'] == 'female']['Name'].map(get_name)
name_counts = names.value_counts()
print(name_counts.head(1).index.values[0])

Anna


# Решающие деревья. Важность признаков

In [142]:
x_labels = ['Pclass', 'Fare', 'Age', 'Sex']
X = data.loc[:, x_labels]
X.head()

Unnamed: 0_level_0,Pclass,Fare,Age,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,7.25,22.0,male
2,1,71.2833,38.0,female
3,3,7.925,26.0,female
4,1,53.1,35.0,female
5,3,8.05,35.0,male


In [143]:
X['Sex'] = X['Sex'].map(lambda x: 1 if x == 'male' else 0)
X.head()

Unnamed: 0_level_0,Pclass,Fare,Age,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,7.25,22.0,1
2,1,71.2833,38.0,0
3,3,7.925,26.0,0
4,1,53.1,35.0,0
5,3,8.05,35.0,1


In [144]:
y = data['Survived']

In [145]:
X = X.dropna()
y = y[X.index.values]

In [146]:
import pandas
import numpy as np
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=241)
clf.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=241,
            splitter='best')

In [147]:
importances = pd.Series(clf.feature_importances_, index=x_labels)
print(importances)

Pclass    0.140005
Fare      0.303436
Age       0.256046
Sex       0.300512
dtype: float64
