In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
df_train.drop(labels=['Ticket', 'Cabin', 'Name'], inplace=True, axis=1)
df_test.drop(labels=['Ticket', 'Cabin', 'Name'], inplace=True, axis=1)

df_train.dropna(subset=['Embarked'], inplace=True)#Exists value nan
df_test.dropna(subset=['Embarked'], inplace=True)

In [5]:
lista = (df_train.dtypes == 'object')
object_cols = list(lista[lista].index)

print("Variáveis Categóricas:")
print(object_cols)

Variáveis Categóricas:
['Sex', 'Embarked']


In [6]:
df_train.fillna(value={'Age': df_train['Age'].median()}, inplace=True)
df_test.fillna(value={'Age': df_test['Age'].median()}, inplace=True)

df_test.fillna(value={'Fare': df_test['Fare'].median()}, inplace=True)

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Sex            889 non-null object
Age            889 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Fare           889 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 69.5+ KB


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

X = df_train[['Sex', 'Pclass', 'SibSp', 'Embarked', 'PassengerId', 'Age', 'Parch', 'Fare']]
y = df_train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

classifier = RandomForestClassifier(n_jobs=4, random_state=12)

  from numpy.core.umath_tests import inner1d


In [9]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    classifier.fit(X_train, y_train)
    predicoes = classifier.predict(X_valid)    
    print(metrics.classification_report(y_test, predicoes, target_names=['No', 'Yes']))

In [10]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_test.select_dtypes(exclude=['object'])

print("Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_test))

Approach 1 (Drop categorical variables):
             precision    recall  f1-score   support

         No       0.71      0.79      0.75       184
        Yes       0.56      0.45      0.50       110

avg / total       0.65      0.66      0.65       294

None


In [11]:
from sklearn.preprocessing import LabelEncoder

# Make copy to avoid changing original data 
label_X_train = X_train.copy()
label_X_valid = X_test.copy()

# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in object_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_test[col])

print("Approach 2 (Label Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_test))

Approach 2 (Label Encoding):
             precision    recall  f1-score   support

         No       0.81      0.87      0.84       184
        Yes       0.75      0.65      0.70       110

avg / total       0.79      0.79      0.79       294

None


In [13]:
X_train[object_cols]

Unnamed: 0,Sex,Embarked
6,male,S
576,female,S
445,male,S
74,male,S
670,female,S
328,female,S
663,male,S
146,male,S
235,female,S
221,male,S


In [17]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, categories='auto')
#OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
#OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_test[object_cols]))

for col in object_cols:
    X_train[col] = pd.Categorical(X_train[col])
    OH_cols_train[col] = OH_encoder.fit_transform(X_train[col].reshape(-1, 1))
    OH_cols_valid[col] = OH_encoder.transform(X_test[col])

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_test.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


ValueError: could not convert string to float: 'Sex'