<a href="https://colab.research.google.com/github/johngachara/titanic-data-analysis/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

In [None]:
titanic = sns.load_dataset('titanic')
#check dataset structure
print(titanic.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


In [None]:
print(titanic.describe())

         survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200


In [None]:
missing = titanic.isnull().sum()
print(missing)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [None]:
titanic['age'].fillna(titanic['age'].median(),inplace=True)
print(titanic.isnull().sum())

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [None]:
titanic.drop(columns=['deck'],inplace=True)

In [None]:

titanic['embark_town'].fillna(titanic['embark_town'].mode()[0],inplace=True)
titanic['embarked'].fillna(titanic['embarked'].mode()[0],inplace=True)
print(titanic.isnull().sum())

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


In [None]:
#z Score outlier
data = titanic['fare']
mean = np.mean(data)
std_dev = np.std(data) # calculates the standard deviation
Z_scores = (data - mean) / std_dev # computes the Z-scores
outliers = data[np.abs(Z_scores) > 3] # finds all the data points that are 3 standard deviations away from the mean
data2 = titanic['age']
mean2 = data2.mean()
std2 = data2.std()
z_score = (data2-mean2)/std2
outliers2 = data2[np.abs(z_score)>3]
print(outliers2)

96     71.0
116    70.5
493    71.0
630    80.0
672    70.0
745    70.0
851    74.0
Name: age, dtype: float64


In [None]:
Q1 = titanic['fare'].quantile(0.25) # calculates the first quartile
Q3 = titanic['fare'].quantile(0.75) # calculates the third quartile
IQR = Q3 - Q1 # computes the IQR

# Below, we find all the data points that fall below the lower bound or above the upper bound
outliers = titanic['fare'][
    (titanic['fare'] < (Q1 - 1.5 * IQR)) |   #less than lower bound
    (titanic['fare'] > (Q3 + 1.5 * IQR))     # greater than upper bound
]
print(outliers)

1       71.2833
27     263.0000
31     146.5208
34      82.1708
52      76.7292
         ...   
846     69.5500
849     89.1042
856    164.8667
863     69.5500
879     83.1583
Name: fare, Length: 116, dtype: float64


In [None]:
mean = np.mean(titanic['fare']) # calculates the mean
standard_deviation = np.std(titanic['fare']) # calculates the standard deviation
outliers = titanic['fare'][np.abs(titanic['fare'] - mean) > 3 * standard_deviation] # finds all the data points that are 3 standard deviations away from the mean

In [None]:
#cappping replacing outlier values with a certain maximum or minimum
# Drop rows with missing 'age' values
titanic_df = titanic.dropna(subset=['age'])

# Calculate the upper bound for 'age'
Q1 = titanic_df['age'].quantile(0.25)
Q3 = titanic_df['age'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

# Cap the outliers for 'age'
titanic_df['age'] = np.where(titanic_df['age'] > upper_bound, upper_bound, titanic_df['age'])

# Calculate the upper bound for 'fare'
Q1 = titanic_df['fare'].quantile(0.25)
Q3 = titanic_df['fare'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

# Cap the outliers for 'fare'
titanic_df['fare'] = np.where(titanic_df['fare'] > upper_bound, upper_bound, titanic_df['fare'])

In [None]:
#normalization

titanic_df[['age','fare']] = MinMaxScaler(feature_range=(0,10)).fit_transform(titanic_df[['age','fare']])
print(titanic_df[['age','fare']].head())

        age       fare
0  3.990385   1.104604
1  6.948964  10.000000
2  4.730030   1.207446
3  6.394231   8.090270
4  6.394231   1.226491


In [None]:
#one hot encoding of categorical features
encoded = pd.get_dummies(titanic_df,columns=['sex','embarked'],dtype=int)
print(encoded.head())

   survived  pclass       age  sibsp  parch       fare  class    who  \
0         0       3  3.990385      1      0   1.104604  Third    man   
1         1       1  6.948964      1      0  10.000000  First  woman   
2         1       3  4.730030      0      0   1.207446  Third  woman   
3         1       1  6.394231      1      0   8.090270  First  woman   
4         0       3  6.394231      0      0   1.226491  Third    man   

   adult_male  embark_town alive  alone  sex_female  sex_male  embarked_C  \
0        True  Southampton    no  False           0         1           0   
1       False    Cherbourg   yes  False           1         0           1   
2       False  Southampton   yes   True           1         0           0   
3       False  Southampton   yes  False           1         0           0   
4        True  Southampton    no   True           0         1           0   

   embarked_Q  embarked_S  
0           0           1  
1           0           0  
2           0       

In [None]:
titanic_df = pd.concat([titanic_df.drop(columns=['sex','embarked']),encoded],axis=1)
print(titanic_df.head())

   survived  pclass       age  sibsp  parch       fare  class    who  \
0         0       3  3.990385      1      0   1.104604  Third    man   
1         1       1  6.948964      1      0  10.000000  First  woman   
2         1       3  4.730030      0      0   1.207446  Third  woman   
3         1       1  6.394231      1      0   8.090270  First  woman   
4         0       3  6.394231      0      0   1.226491  Third    man   

   adult_male  embark_town  ...    who  adult_male  embark_town  alive  alone  \
0        True  Southampton  ...    man        True  Southampton     no  False   
1       False    Cherbourg  ...  woman       False    Cherbourg    yes  False   
2       False  Southampton  ...  woman       False  Southampton    yes   True   
3       False  Southampton  ...  woman       False  Southampton    yes  False   
4        True  Southampton  ...    man        True  Southampton     no   True   

   sex_female  sex_male  embarked_C embarked_Q embarked_S  
0           0       

In [None]:
#Standardization and normalization
# Import necessary libraries
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load the Titanic Dataset
titanic_df = sns.load_dataset('titanic')

# Create a MinMaxScaler object with a feature range of 0 to 100
age_scaler = MinMaxScaler(feature_range=(0, 100))
# TODO: Create a StandardScaler object for standardizing 'fare'
fare_scaler = StandardScaler()
# Fit the age scaler on the age data without NaN values
age_scaler.fit(titanic_df[['age']].dropna())
# TODO: Fit the scaler on the 'fare' data without NaN values
fare_scaler.fit(titanic_df[['fare']].dropna())
# Holds the indexes for the rows with non-NaN age and fare values
non_na_age_index = titanic_df['age'].dropna().index
# TODO: Replace the following line with a calculation of the non-NaN fare index values
non_na_fare_index = titanic_df['fare'].dropna().index

# Transform the 'age' and 'fare' columns in the original dataframe without NaN values
titanic_df.loc[non_na_age_index, 'norm_age'] = age_scaler.transform(titanic_df.loc[non_na_age_index, ['age']])
# TODO: Transform the 'fare' column using the StandardScaler and non-NaN indices
titanic_df.loc[non_na_fare_index, 'stand_fare'] = fare_scaler.transform(titanic_df.loc[non_na_fare_index, ['fare']])
# Display transformed 'age' and standardized 'fare' values
# TODO: Update this line to include the newly standardized 'fare' column
print(titanic_df[['age', 'norm_age', 'fare','stand_fare']])

In [None]:
# Import necessary libraries
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler

# TODO: Load the Titanic Dataset
titanic_df = sns.load_dataset('titanic')
# TODO: Create a StandardScaler object
scaler = StandardScaler()
# TODO: Fit the scaler on the 'fare' data while handling NaN values properly
fare = titanic_df[['fare']].dropna()
scaler.fit(fare)
# TODO: Transform the 'fare' column creating a new column 'stand_fare' in the original dataframe without NaN values
fare_na_index = titanic_df['fare'].dropna().index
titanic_df.loc[fare_na_index,'stand_fare'] = scaler.transform(titanic_df.loc[fare_na_index,['fare']])
# TODO: Display standardized 'fare' values (the new 'stand_fare' column)
print(titanic_df['stand_fare'])

In [None]:
# Import necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np

# Load the Titanic dataset
titanic_df = sns.load_dataset("titanic")

# Create a new feature 'family_size'
titanic_df['family_size'] = titanic_df['sibsp'] + titanic_df['parch'] + 1

# Modify 'fare' feature to 'log_fare' using natural logarithm
titanic_df['log_fare'] = np.log10(titanic_df['fare'] + 0.1)

# One-hot encode 'sex' feature
sex_dummies = pd.get_dummies(titanic_df['sex'], dtype=int)
titanic_df = pd.concat([titanic_df, sex_dummies], axis=1)

# Print the first 5 rows of the dataframe
print(titanic_df.head())

In [None]:
# Import necessary libraries
import seaborn as sns
import pandas as pd

# Load the Titanic dataset
titanic_df = sns.load_dataset("titanic")

# Create a new binary encoded feature 'embarked_southampton'
embark_town_col = pd.DataFrame([1 if i == 'Southampton' else 0 for i in titanic_df['embark_town']], columns=["embarked_southampton"])

# Join to the main dataframe with aligned indices
titanic_df = titanic_df.reset_index(drop=True)
embark_town_col = embark_town_col.reset_index(drop=True)
titanic_df = pd.concat([titanic_df, embark_town_col], axis=1)

# Print the first 5 rows of the dataframe
print(titanic_df.head())

In [None]:
# Import necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np

# Load the Titanic dataset
titanic_df = sns.load_dataset("titanic")

# Create a new feature 'family_size'
titanic_df['family_size'] = titanic_df['sibsp'] + titanic_df['parch'] + 1

# TODO: Transform the 'fare' column into a new column 'log10_fare' using log base 10. Include an adjustment for zero fares.
titanic_df['log10_fare'] = np.log10(titanic_df['fare'] + 0.1)

# Print the first 5 rows of the dataframe
print(titanic_df.head())

In [None]:
# Import necessary libraries
import seaborn as sns
import pandas as pd

# TODO: Load the Titanic dataset and assign it to a variable named 'titanic_df'
titanic_df = sns.load_dataset('titanic')
# TODO: Perform one-hot encoding on the 'class' column to create binary columns for each class
class_col = titanic_df['class']
binary = pd.get_dummies(class_col,columns=['class'],dtype=int)
# TODO: Join the new binary columns to 'titanic_df'
titanic_df = pd.concat([titanic_df,binary],axis=1)
# TODO: Display the first 5 rows of the modified dataframe
print(titanic_df.head())

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
import seaborn as sns

# Loading the Titanic dataset
titanic_df = sns.load_dataset('titanic')

# Splitting the full dataset into the training and testing datasets
train_data, test_data = train_test_split(titanic_df, test_size=0.4, random_state=42)

# Printing out the shapes of the datasets
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

In [None]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import pandas as pd

# Load and preprocess the Titanic dataset
titanic_df = sns.load_dataset('titanic')

# One-hot encode categorical variables using pandas get_dummies
titanic_preprocessed = pd.get_dummies(titanic_df, columns=['sex', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'], drop_first=True)

# Handle any NaN values by filling them with the mean of the column
titanic_preprocessed = titanic_preprocessed.fillna(titanic_preprocessed.mean())

# Split the preprocessed dataset into the training and testing datasets with a 70%-30% split
train_data, test_data = train_test_split(titanic_preprocessed, test_size=0.3, random_state=42)

# Separate the target variable ("survived") from the rest of the training data
x_train = train_data.drop("survived", axis=1)
y_train = train_data["survived"]
x_test = test_data.drop("survived",axis=1)
y_test = test_data['survived']
# Initialize a Logistic Regression model
logreg = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence

# Training the Logistic Regression model
logreg.fit(x_train, y_train)

# Using the model to make predictions on the testing dataset
predictions = logreg.predict(x_test)

# Displaying metrics
print("Classification Report:")
print(classification_report(y_test, predictions))  # Will produce incorrect results due to the bug

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))  # Will produce incorrect results due to the bug

print("Accuracy Score:")
print(accuracy_score(y_test, predictions))  # Will produce incorrect results due to the bug

In [None]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import pandas as pd

# Load and preprocess the Titanic dataset
titanic_df = sns.load_dataset('titanic')

# Drop non-numeric columns for simplicity
titanic_df = titanic_df.select_dtypes(include=['float64', 'int64'])

# Handle any NaN values by filling them with the mean of the column
titanic_df = titanic_df.fillna(titanic_df.mean())

# TODO: Use MinMaxScaler to scale the numeric features into a standard range
# Hint: You will need to create an instance of MinMaxScaler, fit it on the data and transform the data
scaler = MinMaxScaler()
titanic_df_scaled = scaler.fit_transform(titanic_df)
titanic_df = pd.DataFrame(titanic_df_scaled,columns=titanic_df.columns)
# Split the preprocessed dataset into the training and testing datasets with a 70%-30% split
train_data, test_data = train_test_split(titanic_df, test_size=0.3, random_state=42)

# Separate the target variable ("survived") from the rest of the training data
x_train = train_data.drop("survived", axis=1)
y_train = train_data["survived"]

# Initialize a Logistic Regression model
logreg = LogisticRegression(max_iter=1000)

# Training the Logistic Regression model
logreg.fit(x_train, y_train)

# Separate the independent (x_test) and dependent (y_test) variables from the testing dataset
x_test = test_data.drop("survived", axis=1)
y_test = test_data["survived"]

# Using the model to make predictions on the testing dataset
predictions = logreg.predict(x_test)

# Displaying metrics
print("Classification Report:")
print(classification_report(y_test, predictions))

print("Accuracy Score:")
print(accuracy_score(y_test, predictions))

In [None]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  # Changed from precision_score to accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import pandas as pd

# Load the Titanic dataset
titanic_df = sns.load_dataset('titanic')

# Drop columns with strings and 'pclass', which is categorical but read as a numeric type
titanic_df = titanic_df.select_dtypes(exclude=['object', 'category'])

# Handle any NaN values by filling them with the mean of the column (ignoring 'pclass', which is categorical)
numeric_columns = titanic_df.columns.drop('pclass')
titanic_df[numeric_columns] = titanic_df[numeric_columns].fillna(titanic_df[numeric_columns].mean())

# Convert 'pclass' to integer type if it's not already
titanic_df['pclass'] = titanic_df['pclass'].astype(int)

# TODO: Split the dataset into training and testing sets with a 70%-30% split
train_data,test_data = train_test_split(titanic_df,test_size=0.3,random_state=42)
# TODO: Identify and separate the target variable 'survived' from the training and testing data
x_train = train_data.drop("survived",axis=1)
y_train = train_data['survived']
x_test = test_data.drop("survived",axis=1)
y_test = test_data['survived']
# TODO: Initialize StandardScaler and scale the features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)#we fit one time
# TODO: Initialize the Logistic Regression model and train it on the scaled training data
logreg = LogisticRegression(max_iter=100)
logreg.fit(x_train_scaled,y_train)
# TODO: Use the trained model to make predictions on the scaled testing data
predictions = logreg.predict(x_test_scaled)
# TODO: Calculate and print the accuracy score
print(accuracy_score(y_test,predictions))

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import StandardScaler
# Load the dataset

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

# Convert the target into binary categories
# Let's say the threshold is the median price
threshold = np.median(y)
y_binary = np.where(y > threshold, 1, 0)  # 1 for 'expensive', 0 for 'not expensive'

# Split the dataset
X_train, X_test, y_binary_train, y_binary_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(X_train)
scaled_x_test = scaler.transform(X_test)
# Apply logistic regression
model = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence
model.fit(scaled_x_train, y_binary_train)

# Now you can make predictions and evaluate the model
predictions = model.predict(scaled_x_test)
print(accuracy_score(y_binary_test,predictions))

# Evaluating the model would typically involve metrics like accuracy, precision, and recall.

0.8725490196078431
