# <center> Feature Engineering 2
**Summary of Actions**
* Created dummy variables for all categorical features
* Created Cabin Section, Family Size, and Is_Alone Features
### Import Preliminaries

In [1]:
%matplotlib inline

# Import modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Import data
train_df = pd.DataFrame(pd.read_csv('Data/cleaned_train_df.csv'))
test_df = pd.DataFrame(pd.read_csv('Data/cleaned_test_df.csv'))

# Set pandas options
pd.set_option('precision',1)
pd.set_option('max_columns',100)
pd.set_option('max_rows',100)

dfs = [train_df, test_df]

In [2]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 12 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Name           889 non-null object
Sex            889 non-null object
Age            889 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Ticket         889 non-null object
Fare           889 non-null float64
Cabin          889 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.4+ KB


### Viewing Features

In [3]:
train_df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2,Unknown,S


### Creating Cabin Section Feature

In [4]:
train_df['Sections'] = train_df.Cabin.str[0]
train_df['Sections'].value_counts()

U    687
C     59
B     45
D     33
E     32
A     15
F     13
G      4
T      1
Name: Sections, dtype: int64

### Creating Family Size Feature

In [5]:
train_df['Family_Size'] = train_df['SibSp'] + train_df['Parch']
train_df[['Family_Size','SibSp','Parch']].head(5)

Unnamed: 0,Family_Size,SibSp,Parch
0,1,1,0
1,1,1,0
2,0,0,0
3,1,1,0
4,0,0,0


### Creating Is Alone Feature

In [6]:
train_df['Is_Alone'] = 0
train_df.loc[train_df.Family_Size > 0,'Is_Alone'] = 1
train_df['Is_Alone'].head(10)

0    1
1    1
2    0
3    1
4    0
5    0
6    0
7    1
8    1
9    1
Name: Is_Alone, dtype: int64

### Replicating Features for Test_df

In [7]:
def feature_creation(df):
    df['Sections'] = df.Cabin.str[0]
    df['Family_Size'] = df['SibSp'] + df['Parch']
    df['Is_Alone'] = 0
    df.loc[df.Family_Size > 0,'Is_Alone'] = 1
    return df

In [8]:
test_df = feature_creation(test_df)

### Creating Dummy Variables

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for df in dfs:
  
  object_columns = list(df.select_dtypes(
  include=[object]).columns)
  
  for col in df[object_columns]:
    df[col] = le.fit_transform(df[col])

### Exporting Data

In [10]:
train_df.to_csv('Data/featured_train_df.csv', index=False)
test_df.to_csv('Data/featured_test_df.csv', index=False)