In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
data = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')

# Preview data
print(data.head())

# Handle missing values by creating a copy to avoid chained assignment
data.loc[:, 'Age'] = data['Age'].fillna(data['Age'].median())
data.loc[:, 'Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Drop columns that may not be useful for this analysis
data = data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# Encoding categorical data
encoder = OneHotEncoder()
categorical_features = ['Pclass', 'Sex', 'Embarked']
encoded_features = encoder.fit_transform(data[categorical_features]).toarray()
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Create a DataFrame from encoded features and concatenate with the original dataset
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)
data_encoded = pd.concat([data.drop(categorical_features, axis=1), encoded_df], axis=1)

# Scale data before applying PCA
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_encoded.drop('Survived', axis=1))

# Applying PCA
pca = PCA(n_components=2)  # reduce to two dimensions for visualization
principal_components = pca.fit_transform(data_scaled)
principal_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# Combine with the target variable for plotting
final_df = pd.concat([principal_df, data_encoded[['Survived']]], axis=1)

# Plotting the results
plt.figure(figsize=(10, 8))
sns.scatterplot(data=final_df, x='PC1', y='PC2', hue='Survived', style='Survived', palette='viridis')
plt.title('PCA Result on Titanic Dataset')
plt.show()


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(principal_components, data['Survived'], test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the logistic regression model: {accuracy:.2f}")


Accuracy of the logistic regression model: 0.77
