In [5]:
import numpy as np
import pandas as pd

In [6]:
# Read the training CSV into a DataFrame; the trailing expression previews
# the first five rows.
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Remove the columns that are not used in this walkthrough.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# the cell idempotent: running it again after the columns are already gone no
# longer raises KeyError.
df = df.drop(
    columns=["PassengerId", "Pclass", "Name", "Sex", "Age", "Fare", "Embarked"],
    errors="ignore",
)

In [8]:
# Show only the first row to confirm which columns remain.
df.iloc[:1]

Unnamed: 0,Survived,SibSp,Parch,Ticket,Cabin
0,0,1,0,A/5 21171,


In [9]:
# Add SibSp and Parch element-wise and store the total in a new "Number" column.
df["Number"] = df["SibSp"].add(df["Parch"])
df.head(n=1)

Unnamed: 0,Survived,SibSp,Parch,Ticket,Cabin,Number
0,0,1,0,A/5 21171,,1


In [10]:
# SibSp and Parch are now folded into "Number", so drop them.
# errors="ignore" plus reassignment makes the cell safe to re-run once the
# columns have already been removed.
df = df.drop(columns=["SibSp", "Parch"], errors="ignore")

# A fixed random_state makes the sampled preview reproducible across runs.
df.sample(10, random_state=42)

Unnamed: 0,Survived,Ticket,Cabin,Number
405,0,28664,,1
256,1,PC 17585,,0
687,0,349228,,0
392,0,3101277,,2
471,0,315089,,0
261,1,347077,,6
86,0,W./C. 6608,,4
827,1,S.C./PARIS 2079,,2
600,1,243847,,3
77,0,374746,,0


In [11]:
# Deliberately turn "Number" into a mixed column: every 0 is swapped for the
# letter "A", all other values are left as they are.
df["Number"] = df["Number"].replace({0: "A"})
df.head(n=5)

Unnamed: 0,Survived,Ticket,Cabin,Number
0,0,A/5 21171,,1
1,1,PC 17599,C85,1
2,1,STON/O2. 3101282,,A
3,1,113803,C123,1
4,0,373450,,A


In [12]:
# List the distinct values now present in "Number".
pd.unique(df["Number"])

array([1, 'A', 4, 2, 6, 5, 3, 7, 10], dtype=object)

### Look, we now have mixed values - numerical and categorical - in a single column

Extracting Numerical Part

In [13]:
# Parse "Number" as numbers; anything that cannot be parsed (the "A"
# placeholder) is coerced to NaN.
# NOTE(review): because NaN is present the column appears to stay floating
# point (values preview as 1.0), so downcast="integer" seems to have no effect
# on this data - confirm with df.dtypes.
df["Number_numerical"] = pd.to_numeric(
    df["Number"],
    errors="coerce",
    downcast="integer",
)


In [14]:
# Rows that failed numeric parsing (NaN in "Number_numerical") carry the
# categorical value; every other row gets NaN.
df["Number_categorical"] = np.where(
    df["Number_numerical"].isna(),
    df["Number"],
    np.nan,
)

# How np.where(condition, x, y) works: it returns an array that takes the
# element from x wherever condition is True and the element from y elsewhere.
#
# Example:
# x = np.where(1 > 2, "yes correct", "wrong")
# The condition is False, so x will be "wrong".


In [15]:
# Preview the first five rows with the two new "Number_*" columns.
df.head(n=5)

Unnamed: 0,Survived,Ticket,Cabin,Number,Number_numerical,Number_categorical
0,0,A/5 21171,,1,1.0,
1,1,PC 17599,C85,1,1.0,
2,1,STON/O2. 3101282,,A,,A
3,1,113803,C123,1,1.0,
4,0,373450,,A,,A


### The Number column is handled. Now let's handle Cabin and Ticket

### Cabin

In [16]:
# Wrap the distinct cabin values in a DataFrame so they render as a table.
pd.DataFrame(pd.unique(df["Cabin"]))

Unnamed: 0,0
0,
1,C85
2,C123
3,E46
4,G6
...,...
143,E17
144,A24
145,C50
146,B42


Here you can see that each value contains both a numeric part and a categorical part (for example, `C85` is the letter `C` followed by the number `85`).

Extracting numerical part of Cabin

In [17]:
# Take the first run of digits from each cabin value (e.g. "C85" -> 85).
# str.extract returns strings, so pd.to_numeric turns them into real numbers;
# rows without a cabin (or without digits) stay NaN.
# expand=False makes extract return a Series rather than a one-column frame.
df["Cabin_numerical"] = pd.to_numeric(
    df["Cabin"].str.extract(r"(\d+)", expand=False),
    errors="coerce",
)

# A fixed random_state makes the sampled preview reproducible across runs.
df.sample(10, random_state=42)

Unnamed: 0,Survived,Ticket,Cabin,Number,Number_numerical,Number_categorical,Cabin_numerical
774,1,29105,,4,4.0,,
693,0,2672,,A,,A,
193,1,230080,F2,2,2.0,,2.0
276,0,347073,,A,,A,
386,0,CA 2144,,7,7.0,,
830,1,2659,,1,1.0,,
203,0,2628,,A,,A,
334,1,PC 17611,,1,1.0,,
580,1,237789,,2,2.0,,
218,1,11813,D15,A,,A,15.0


Extracting categorical part of Cabin

In [18]:
# The categorical part of a cabin value is its first character (e.g. "C85" -> "C");
# rows with a missing cabin stay NaN.
df["Cabin_categorical"] = df["Cabin"].str[0]

# A fixed random_state makes the sampled preview reproducible across runs.
df.sample(10, random_state=42)

Unnamed: 0,Survived,Ticket,Cabin,Number,Number_numerical,Number_categorical,Cabin_numerical,Cabin_categorical
198,1,370370,,A,,A,,
207,1,2699,,A,,A,,
658,0,29751,,A,,A,,
176,0,4133,,4,4.0,,,
831,1,29106,,2,2.0,,,
187,1,111428,,A,,A,,
269,1,PC 17760,C99,A,,A,99.0,C
697,1,35852,,A,,A,,
360,0,347088,,5,5.0,,,
124,0,35281,D26,1,1.0,,26.0,D


### Now Ticket

In [19]:
# Show the first five distinct ticket values as a table.
pd.DataFrame(pd.unique(df["Ticket"])).head(n=5)

Unnamed: 0,0
0,A/5 21171
1,PC 17599
2,STON/O2. 3101282
3,113803
4,373450


Extracting the last part of Ticket, i.e. the number

In [20]:
# Split each ticket on whitespace and keep the last token.
# The vectorised .str accessor replaces apply + lambda: it yields NaN for a
# missing or empty ticket instead of raising AttributeError / IndexError.
df["Ticket_numerical"] = df["Ticket"].str.split().str[-1]

df[["Ticket_numerical"]].head()

Unnamed: 0,Ticket_numerical
0,21171
1,17599
2,3101282
3,113803
4,373450


In [21]:
# Convert the extracted token to a number; tokens that are not numeric are
# coerced to NaN.
# NOTE(review): the preview shows floats (21171.0), which suggests NaN values
# exist and downcast="integer" is not taking effect - confirm with df.dtypes.
df["Ticket_numerical"] = pd.to_numeric(
    df["Ticket_numerical"], errors="coerce", downcast="integer"
)

df[["Ticket_numerical"]].head()

Unnamed: 0,Ticket_numerical
0,21171.0
1,17599.0
2,3101282.0
3,113803.0
4,373450.0


Extracting first part i.e. the letters

In [22]:
# Split each ticket on whitespace and keep the first token.
# The vectorised .str accessor replaces apply + lambda: it yields NaN for a
# missing or empty ticket instead of raising AttributeError / IndexError.
# Tickets with no letter prefix still produce their number here.
df["Ticket_categorical"] = df["Ticket"].str.split().str[0]

df[["Ticket_categorical"]].head()

Unnamed: 0,Ticket_categorical
0,A/5
1,PC
2,STON/O2.
3,113803
4,373450


Notice that in rows where the ticket has no letters, the number itself is the first part, so it ended up in the categorical column. We replace those entries with NaN next.

In [23]:
# A first token made up only of digits means the ticket had no letter part,
# so replace it with NaN and keep every other token unchanged.
df["Ticket_categorical"] = np.where(
    df["Ticket_categorical"].str.isdigit(),
    np.nan,
    df["Ticket_categorical"],
)

df[["Ticket_categorical"]].head(n=5)

Unnamed: 0,Ticket_categorical
0,A/5
1,PC
2,STON/O2.
3,
4,
