# Perform data preprocessing on a dataset (e.g., Titanic dataset) including cleaning, handling missing values, transformation, normalization, encoding, and feature engineering for predictive modeling.

In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


In [94]:
# Load the Titanic dataset
df = pd.read_csv('train.csv')


In [96]:
# Check the first few rows of the dataframe
print(df.head())

# Check for missing values and data types
print(df.info())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

### Handle Missing Values

In [99]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [101]:
# Fill missing values in 'Age' with the median
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing values in 'Embarked' with the mode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])


In [103]:
# Drop columns that are not useful for the prediction
cols_to_drop = ['Name', 'Ticket']
df = df.drop(cols_to_drop, axis=1)


### Encode Categorical Variables

In [106]:
# Create dummy variables for 'Pclass', 'Sex', and 'Embarked'
df = pd.get_dummies(df, columns=['Pclass', 'Sex', 'Embarked'], drop_first=True)


### Check the Dataframe Again

In [109]:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Age          891 non-null    float64
 3   SibSp        891 non-null    int64  
 4   Parch        891 non-null    int64  
 5   Fare         891 non-null    float64
 6   Cabin        204 non-null    object 
 7   Pclass_2     891 non-null    bool   
 8   Pclass_3     891 non-null    bool   
 9   Sex_male     891 non-null    bool   
 10  Embarked_Q   891 non-null    bool   
 11  Embarked_S   891 non-null    bool   
dtypes: bool(5), float64(2), int64(4), object(1)
memory usage: 53.2+ KB
None


### Feature Engineering

In [112]:
# Feature Engineering: Creating new features
# Creating 'FamilySize' from 'SibSp' and 'Parch'
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Creating 'IsAlone' from 'FamilySize'
df['IsAlone'] = 0  # initialize to 0, assuming passengers are not alone
df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1  # update to 1 if family size is 1



### Splitting the Dataset into Training and Test Set

In [115]:
# Separate the features (X) and the target (y)
X = df.drop(columns='Survived')
y = df['Survived']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


### Feature Scaling

In [128]:

# Ensure all data in X are numeric
non_numeric_columns = df.select_dtypes(include=['object']).columns
if len(non_numeric_columns) > 0:
    print(f"Non-numeric columns detected: {non_numeric_columns}")
    df[non_numeric_columns] = df[non_numeric_columns].apply(lambda col: pd.to_numeric(col, errors='coerce'))

# Handle any remaining non-numeric values
df = df.dropna()

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data
X_test_scaled = scaler.transform(X_test)

# Verify the scaled data
print("Scaled Training Data:")
print(X_train_scaled)
print("Scaled Test Data:")
print(X_test_scaled)


Scaled Training Data:
[[0.9640045  0.63443842 0.         ... 0.         0.         1.        ]
 [0.05849269 0.60922728 0.125      ... 0.         0.1        0.        ]
 [0.43419573 0.00415984 0.625      ... 1.         0.7        0.        ]
 ...
 [0.70753656 0.34451027 0.         ... 1.         0.         1.        ]
 [0.6287964  0.44535485 0.125      ... 1.         0.1        0.        ]
 [0.76940382 0.74788857 0.125      ... 0.         0.2        0.        ]]
Scaled Test Data:
[[0.5568054  0.34451027 0.         ... 1.         0.         1.        ]
 [0.72890889 0.34451027 0.         ... 1.         0.         1.        ]
 [0.31271091 0.07979327 0.5        ... 1.         0.5        0.        ]
 ...
 [0.69741282 0.3319047  0.125      ... 1.         0.1        0.        ]
 [0.88413948 0.21845456 0.         ... 1.         0.         1.        ]
 [0.071991   0.34451027 0.         ... 0.         0.         1.        ]]


### Check the Dataframe Again

In [28]:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Age          891 non-null    float64
 3   SibSp        891 non-null    int64  
 4   Parch        891 non-null    int64  
 5   Fare         891 non-null    float64
 6   Pclass_2     891 non-null    bool   
 7   Pclass_3     891 non-null    bool   
 8   Sex_male     891 non-null    bool   
 9   Embarked_Q   891 non-null    bool   
 10  Embarked_S   891 non-null    bool   
dtypes: bool(5), float64(2), int64(4)
memory usage: 46.2 KB
None
