In [None]:
# 1. Data pre-processing operations such as outliers and/or inconsistent data value
#  management.
#  a. Load the Titanic dataset from a CSV file or another data source into a Pandas
#  DataFrame.
#  b. Identify and handle missing values in the dataset
#  c. Create new features if necessary, e.g., 'FamilySize' by combining 'SibSp' and
#  'Parch.'
#  d. Encode categorical variables like 'Sex' and 'Embarked' as numerical values using
#  one-hot encoding or label encoding.
#  e.
#  f.
#  Check for and remove duplicate rows if they exist in the dataset.
#  Detect outliers (Z-Score, DBSCAN) in numerical columns such as 'Fare' or 'Age’
#  Also analyze the outliers generated by different methods.

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cluster import DBSCAN
from scipy.stats import zscore
from sklearn.impute import SimpleImputer

In [3]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [3]:
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [8]:
df.describe

<bound method NDFrame.describe of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex        Age  \
0                              Braund, Mr. Owen Harris    male  22.000000   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.000000   
2                               Heikkinen, Miss. Laina  female  26.000000   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.000000   
4                             Allen, Mr. William Henry    male  35.000000   
..                                             

In [1]:
#take care of the missing values in Age column with mean

df['Age'].fillna(df['Age'].mean(), inplace=True)

NameError: name 'df' is not defined

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.cluster import DBSCAN
from scipy.stats import zscore

# a. Load the Titanic dataset
titanic_df = pd.read_csv('path/to/titanic.csv')

# Display the first few rows of the dataset
print("Initial dataset:")
print(titanic_df.head())

# b. Identify and handle missing values
# Check for missing values
print("\nMissing values before handling:")
print(titanic_df.isnull().sum())

# Handle missing values, for example, by filling missing age values with the median
imputer = SimpleImputer(strategy='median')
titanic_df['Age'] = imputer.fit_transform(titanic_df[['Age']])

# c. Create new features
titanic_df['FamilySize'] = titanic_df['SibSp'] + titanic_df['Parch']

# d. Encode categorical variables
# One-hot encode 'Sex' and 'Embarked'
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_features = pd.DataFrame(encoder.fit_transform(titanic_df[['Sex', 'Embarked']]), columns=encoder.get_feature_names(['Sex', 'Embarked']))
titanic_df = pd.concat([titanic_df, encoded_features], axis=1)

# Label encode 'Pclass' if necessary
label_encoder = LabelEncoder()
titanic_df['Pclass'] = label_encoder.fit_transform(titanic_df['Pclass'])

# Drop the original categorical columns
titanic_df.drop(['Sex', 'Embarked'], axis=1, inplace=True)

# Display the updated dataset
print("\nDataset after encoding and creating new features:")
print(titanic_df.head())

# e. Check for and remove duplicate rows
titanic_df.drop_duplicates(inplace=True)

# Display the dataset after removing duplicates
print("\nDataset after removing duplicates:")
print(titanic_df.head())

# f. Detect outliers
# Z-Score method
z_scores = zscore(titanic_df[['Fare', 'Age']])
outliers_zscore = (z_scores > 3) | (z_scores < -3)

# DBSCAN method
dbscan = DBSCAN(eps=3, min_samples=2)
outliers_dbscan = dbscan.fit_predict(titanic_df[['Fare', 'Age']])

# Add outlier columns to the dataset
titanic_df['Outlier_ZScore'] = outliers_zscore.any(axis=1)
titanic_df['Outlier_DBSCAN'] = outliers_dbscan == -1

# Display the dataset with outlier information
print("\nDataset with outlier information:")
print(titanic_df.head())
