In [1]:
import pandas as pd
import numpy as np
# Read the passenger records from test.csv into the working frame `df`.
df = pd.read_csv('test.csv')
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [2]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
df.duplicated().sum()

0

In [3]:
# Count the missing entries in every column of the dataset.
df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [4]:
# Sanity check: is any recorded age negative?
# Missing ages compare as False, so they never trigger this check.
bool(df['Age'].lt(0).any())

False

In [5]:
# Count the distinct (non-null) values in each column.
df.nunique()

PassengerId    418
Pclass           3
Name           418
Sex              2
Age             79
SibSp            7
Parch            8
Ticket         363
Fare           169
Cabin           76
Embarked         3
dtype: int64

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(418, 11)

In [7]:
# Impute missing values.
#
# Numeric columns go through an iterative (model-based) imputer; Age and Fare
# are the numeric columns with gaps, while Pclass, SibSp and Parch are passed
# along as predictors. Text columns are filled with their most frequent value.

from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- must run before importing IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

numeric_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
text_cols = ['Cabin', 'Embarked']

# The upper bound of 100 is an age limit. Passing it as a scalar would apply
# the same cap to every column, so an imputed Fare could never exceed 100 even
# though observed fares do. Give each column its own upper bound instead:
# 100 for Age, unbounded for the rest. The lower bound of 0 suits all columns.
upper_bounds = [100 if col == 'Age' else np.inf for col in numeric_cols]

imp_S = SimpleImputer(strategy='most_frequent')
imp_I = IterativeImputer(max_iter=10, random_state=0, min_value=0, max_value=upper_bounds)

# NOTE: the imputer returns floats, so Pclass, SibSp and Parch come back as
# float64 (e.g. Pclass 3 -> 3.0) even though they had no missing values.
df[numeric_cols] = imp_I.fit_transform(df[numeric_cols])

# NOTE(review): Cabin is missing for most rows, so the most-frequent fill
# assigns the same cabin string to all of them -- confirm this is intended
# before using Cabin as a feature.
df[text_cols] = imp_S.fit_transform(df[text_cols])

In [8]:
# Re-count the missing entries per column to confirm the imputation left none.
df.isna().sum()


PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [9]:
# Classify Age into 5 ordered categories.

from sklearn.preprocessing import FunctionTransformer

# Bin edges and the label for each interval between consecutive edges.
bins = [-1, 1, 13, 20, 60, np.inf]
labels = ['infant', 'kid', 'teen', 'adult', 'senior citizen']

# Wrap pd.cut in a transformer so the binning is applied through the
# scikit-learn fit/transform interface.
age_trans = FunctionTransformer(
    pd.cut,
    kw_args=dict(bins=bins, labels=labels, retbins=False),
)
df['Age'] = age_trans.fit_transform(df['Age'])

In [10]:
# Verify the binning: list the age categories that actually occur in the data.
df['Age'].unique()

['adult', 'senior citizen', 'teen', 'kid', 'infant']
Categories (5, object): ['infant' < 'kid' < 'teen' < 'adult' < 'senior citizen']

In [11]:
# Store the low-cardinality columns as pandas categoricals in one cast,
# then display the resulting dtypes to confirm.
df = df.astype({'Pclass': 'category', 'Sex': 'category', 'Embarked': 'category'})
df.dtypes

PassengerId       int64
Pclass         category
Name             object
Sex            category
Age            category
SibSp           float64
Parch           float64
Ticket           object
Fare            float64
Cabin            object
Embarked       category
dtype: object

In [12]:
# Build a per-ticket lookup table: how many passengers share each ticket,
# and whether that makes it a single- or multi-person ticket.
Group_ticket = (
    df.groupby('Ticket')
      .size()
      .reset_index(name='person_per_Ticket')
)
Group_ticket['Ticket_type'] = (
    Group_ticket['person_per_Ticket']
    .gt(1)
    .map({True: 'multi_person', False: 'single_person'})
)
Group_ticket

Unnamed: 0,Ticket,person_per_Ticket,Ticket_type
0,110469,1,single_person
1,110489,1,single_person
2,110813,1,single_person
3,111163,1,single_person
4,112051,1,single_person
...,...,...,...
358,W./C. 14260,1,single_person
359,W./C. 14266,1,single_person
360,W./C. 6607,2,multi_person
361,W./C. 6608,1,single_person


In [13]:
# Attach the per-ticket head count and ticket type to every passenger row.
df = df.merge(Group_ticket, on='Ticket', how='left')

In [14]:
# Inspect the last 20 rows, including the newly merged ticket columns.
df.tail(20)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,person_per_Ticket,Ticket_type
398,1290,3.0,"Larsson-Rondberg, Mr. Edvard A",male,adult,0.0,0.0,347065,7.775,B57 B59 B63 B66,S,1,single_person
399,1291,3.0,"Conlon, Mr. Thomas Henry",male,adult,0.0,0.0,21332,7.7333,B57 B59 B63 B66,Q,1,single_person
400,1292,1.0,"Bonnell, Miss. Caroline",female,adult,0.0,0.0,36928,164.8667,C7,S,2,multi_person
401,1293,2.0,"Gale, Mr. Harry",male,adult,1.0,0.0,28664,21.0,B57 B59 B63 B66,S,1,single_person
402,1294,1.0,"Gibson, Miss. Dorothy Winifred",female,adult,0.0,1.0,112378,59.4,B57 B59 B63 B66,C,2,multi_person
403,1295,1.0,"Carrau, Mr. Jose Pedro",male,teen,0.0,0.0,113059,47.1,B57 B59 B63 B66,S,1,single_person
404,1296,1.0,"Frauenthal, Mr. Isaac Gerald",male,adult,1.0,0.0,17765,27.7208,D40,C,1,single_person
405,1297,2.0,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,teen,0.0,0.0,SC/PARIS 2166,13.8625,D38,C,1,single_person
406,1298,2.0,"Ware, Mr. William Jeffery",male,adult,1.0,0.0,28666,10.5,B57 B59 B63 B66,S,1,single_person
407,1299,1.0,"Widener, Mr. George Dunton",male,adult,1.0,1.0,113503,211.5,C80,C,4,multi_person


In [15]:
# Persist the transformed frame; index=False keeps the row index out of the file.
df.to_csv('trans_test.csv',index= False)

In [16]:
# Final dtype check of the in-memory frame.
# NOTE: CSV is plain text, so the category dtypes shown here are not stored in
# trans_test.csv and have to be re-applied after reading the file back.
df.dtypes

PassengerId             int64
Pclass               category
Name                   object
Sex                  category
Age                  category
SibSp                 float64
Parch                 float64
Ticket                 object
Fare                  float64
Cabin                  object
Embarked             category
person_per_Ticket       int64
Ticket_type            object
dtype: object