In [118]:
import pandas
import sklearn
import numpy as np
train = pandas.read_csv('../input/train.csv')
test = pandas.read_csv('../input/test.csv')
train.shape, test.shape

((891, 12), (418, 11))

In [119]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [120]:
train['Embarked'] = train.Embarked.astype('category')
train['embarked_code'] = train.Embarked.cat.codes

In [121]:
train['Sex'] = train.Sex.astype('category')
train['sex_code'] = train.Sex.cat.codes

In [122]:
features = train.select_dtypes(include=['number'])
features.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,embarked_code,sex_code
0,1,0,3,22.0,1,0,7.25,2,1
1,2,1,1,38.0,1,0,71.2833,0,0
2,3,1,3,26.0,0,0,7.925,2,0
3,4,1,1,35.0,1,0,53.1,2,0
4,5,0,3,35.0,0,0,8.05,2,1


In [123]:
features = features.drop('PassengerId', 1)

In [124]:
features.info() # we have nulls in the age.. one option is to drop them

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived         891 non-null int64
Pclass           891 non-null int64
Age              714 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Fare             891 non-null float64
embarked_code    891 non-null int8
sex_code         891 non-null int8
dtypes: float64(2), int64(4), int8(2)
memory usage: 43.6 KB


In [125]:
features = features.dropna()
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 8 columns):
Survived         714 non-null int64
Pclass           714 non-null int64
Age              714 non-null float64
SibSp            714 non-null int64
Parch            714 non-null int64
Fare             714 non-null float64
embarked_code    714 non-null int8
sex_code         714 non-null int8
dtypes: float64(2), int64(4), int8(2)
memory usage: 40.4 KB


In [131]:
from sklearn.decomposition import PCA
y = features.Survived
X = features.drop('Survived', 1)

pca = PCA()
pca.fit_transform(X, y)

array([[-27.66048714,   6.92663543,  -0.11217524, ...,   0.60067395,
         -0.39441363,   0.14374891],
       [ 36.82682095,  -7.26586577,   0.19407955, ...,   0.61168271,
          0.29992013,  -0.458254  ],
       [-26.8725542 ,   2.93225702,   0.45981059, ...,  -0.18973612,
         -0.24084608,  -0.80527713],
       ..., 
       [ -4.98957935,  10.5198213 ,   0.93796604, ...,   0.05057834,
          1.56229547,  -0.41635451],
       [ -4.78326948,   3.52051558,   1.20204543, ...,   0.09729529,
          0.63093967,   0.62568907],
       [-26.87393431,  -3.07208112,   0.5618049 , ...,  -0.08522515,
         -0.83429357,   0.1829026 ]])

In [132]:
pca.explained_variance_

array([  2.79891512e+03,   2.08817899e+02,   9.67723174e-01,
         5.78328742e-01,   4.87756236e-01,   3.97348695e-01,
         1.97495925e-01])

In [133]:
pca.components_

array([[ -8.81486441e-03,   2.84888109e-02,   2.38579646e-03,
          3.28030612e-03,   9.99536435e-01,  -4.28229261e-03,
         -1.67639002e-03],
       [  1.81931278e-02,  -9.99121561e-01,   2.09315732e-02,
          1.25423485e-02,   2.85435837e-02,   8.41153887e-04,
         -3.77248728e-03],
       [ -1.16870700e-01,  -2.59184758e-02,  -7.44234059e-01,
         -6.25818563e-01,   2.85321519e-03,  -1.87504234e-01,
          7.05002427e-02],
       [  2.92807198e-01,   7.73817271e-04,  -1.80017882e-01,
         -1.03130621e-01,   7.50440641e-03,   9.24493895e-01,
          1.28357726e-01],
       [ -1.21740198e-01,   1.39458155e-03,   6.40753421e-01,
         -7.39555477e-01,   2.98805653e-04,   5.92408954e-02,
          1.55436664e-01],
       [ -9.22816662e-01,  -1.55850979e-02,  -4.05259938e-02,
          1.04650959e-01,  -6.86692925e-03,   3.21121491e-01,
         -1.80017127e-01],
       [ -1.83807574e-01,  -5.22377921e-03,  -3.25284833e-02,
          1.99130618e-01,  -6.91

In [135]:
df = pandas.DataFrame(pca.components_, columns=X.columns)
df['weight'] = pca.explained_variance_
df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,embarked_code,sex_code,weight
0,-0.008815,0.028489,0.002386,0.00328,0.999536,-0.004282,-0.001676,2798.915118
1,0.018193,-0.999122,0.020932,0.012542,0.028544,0.000841,-0.003772,208.817899
2,-0.116871,-0.025918,-0.744234,-0.625819,0.002853,-0.187504,0.0705,0.967723
3,0.292807,0.000774,-0.180018,-0.103131,0.007504,0.924494,0.128358,0.578329
4,-0.12174,0.001395,0.640753,-0.739555,0.000299,0.059241,0.155437,0.487756
5,-0.922817,-0.015585,-0.040526,0.104651,-0.006867,0.321121,-0.180017,0.397349
6,-0.183808,-0.005224,-0.032528,0.199131,-0.000691,-0.059209,0.960193,0.197496
