In [27]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the CSV file into a DataFrame
titanic_df = pd.read_csv('./Resources/Titanic CSV.csv')

# Display the first 5 rows
titanic_df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
# Show the number of passengers and survivors by sex, then add a new column called "Total Died"
passengers_by_sex = titanic_df.groupby('Sex').agg(
    Total_Passengers=('PassengerId', 'size'),
    Total_Survivors=('Survived', 'sum')
).reset_index()

# Add the 'Total Died' column
passengers_by_sex['Total Died'] = passengers_by_sex['Total_Passengers'] - passengers_by_sex['Total_Survivors']
passengers_by_sex


Unnamed: 0,Sex,Total_Passengers,Total_Survivors,Total Died
0,female,314,233,81
1,male,577,109,468


In [45]:
# Show the number of passengers and survivors by class, add a new column called "Total Died"
passengers_by_class = titanic_df.groupby('Pclass').agg(
    Total_Passengers=('PassengerId', 'size'),
    Total_Survivors=('Survived', 'sum')
).reset_index()

# Rename columns for clarity
passengers_by_class.columns = ['Passenger Class', 'Total Passengers', 'Total Survivors']

# Map passenger class numbers to names
passengers_by_class['Passenger Class'] = passengers_by_class['Passenger Class'].map({1: 'First', 2: 'Second', 3: 'Third'})

# Add the 'Total Died' column
passengers_by_class['Total Died'] = passengers_by_class['Total Passengers'] - passengers_by_class['Total Survivors']

passengers_by_class

Unnamed: 0,Passenger Class,Total Passengers,Total Survivors,Total Died
0,First,216,136,80
1,Second,184,87,97
2,Third,491,119,372


In [43]:
# Show the number of passengers and survivors by age group, add a new column for "Total Died"
def age_group(age):
    if age < 18:
        return 'Child'
    else:
        return 'Adult'
    
titanic_df['Age Group'] = titanic_df['Age'].apply(age_group)
age_group_stats = titanic_df.groupby('Age Group').agg(
    Total_Passengers=('PassengerId', 'size'),
    Total_Survivors=('Survived', 'sum')
).reset_index()

# Ensure column names are correctly assigned
age_group_stats.columns = ['Age Group', 'Total Passengers', 'Total Survivors']

# Add the 'Total Died' column
age_group_stats['Total Died'] = age_group_stats['Total Passengers'] - age_group_stats['Total Survivors']
age_group_stats


Unnamed: 0,Age Group,Total Passengers,Total Survivors,Total Died
0,Adult,778,281,497
1,Child,113,61,52


In [None]:
# Show the number of passengers and survivors by fare group, add a new column for "Total Died"
def fare_group(fare):
    if fare < 50:
        return 'Low Fare'
    elif fare < 100:
        return 'Medium Fare'
    else:
        return 'High Fare'
    
titanic_df['Fare Group'] = titanic_df['Fare'].apply(fare_group)
fare_group_stats = titanic_df.groupby('Fare Group').agg(
    Total_Passengers=('PassengerId', 'size'),
    Total_Survivors=('Survived', 'sum')
).reset_index()
fare_group_stats.columns = ['Fare Group', 'Total Passengers', 'Total Survivors']
fare_group_stats['Total Died'] = fare_group_stats['Total Passengers'] - fare_group_stats['Total Survivors']
fare_group_stats

Unnamed: 0,Fare Group,Total Passengers,Total Survivors,Total Died
0,High Fare,53,39,14
1,Low Fare,730,233,497
2,Medium Fare,108,70,38


In [46]:
# Show the number of passengers and survivors by embarked location, add a new column for "Total Died"
embarked_stats = titanic_df.groupby('Embarked').agg(
    Total_Passengers=('PassengerId', 'size'),
    Total_Survivors=('Survived', 'sum')
).reset_index()
embarked_stats.columns = ['Embarked', 'Total Passengers', 'Total Survivors']
embarked_stats['Total Died'] = embarked_stats['Total Passengers'] - embarked_stats['Total Survivors']
embarked_stats['Embarked'] = embarked_stats['Embarked'].map({'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'})
embarked_stats

Unnamed: 0,Embarked,Total Passengers,Total Survivors,Total Died
0,Cherbourg,168,93,75
1,Queenstown,77,30,47
2,Southampton,644,217,427


In [49]:
# Show the number of passengers and survivors by cabin, add a new column for "Total Died"
cabin_stats = titanic_df.groupby('Cabin').agg(
    Total_Passengers=('PassengerId', 'size'),
    Total_Survivors=('Survived', 'sum')
).reset_index()
cabin_stats.columns = ['Cabin', 'Total Passengers', 'Total Survivors']
cabin_stats['Total Died'] = cabin_stats['Total Passengers'] - cabin_stats['Total Survivors']
cabin_stats['Cabin'] = cabin_stats['Cabin'].fillna('Unknown')
cabin_stats = cabin_stats[cabin_stats['Cabin'] != 'Unknown']  # Filter out unknown cabins
cabin_stats.head(10)

Unnamed: 0,Cabin,Total Passengers,Total Survivors,Total Died
0,A10,1,0,1
1,A14,1,0,1
2,A16,1,1,0
3,A19,1,0,1
4,A20,1,1,0
5,A23,1,1,0
6,A24,1,0,1
7,A26,1,1,0
8,A31,1,1,0
9,A32,1,0,1


In [52]:
# Save the dataframes to CSV files
passengers_by_sex.to_csv('./Resources/Passengers_By_Sex.csv', index=False)
passengers_by_class.to_csv('./Resources/Passengers_By_Class.csv', index=False)
age_group_stats.to_csv('./Resources/Age_Group_Stats.csv', index=False)
fare_group_stats.to_csv('./Resources/Fare_Group_Stats.csv', index=False)
embarked_stats.to_csv('./Resources/Embarked_Stats.csv', index=False)
cabin_stats.to_csv('./Resources/Cabin_Stats.csv', index=False)
titanic_df.to_csv('./Resources/Titanic.csv', index=False)



In [24]:
# Drop rows with missing values and reset the index
# titanic_df = titanic_df.dropna().reset_index(drop=True)
# titanic_df.head()


In [None]:
# # Count the number of rows in the DataFrame
# len(titanic_df) 

183

In [None]:
# # Drop the names column
# titanic_df = titanic_df.drop(columns=['Name'])

In [None]:
# # Display the total passengers
# total_passengers = len(titanic_df)
# print(f"Total passengers: {total_passengers}")

Total passengers: 183


In [None]:
#Create new dataframes that do not include the names column

In [None]:
# save the new dataframes and replace the old ones

In [None]:
# Create


In [None]:
# save the dataframes to CSV files *****REMOVE THIS LINE ONCE NEW DATAFRAMES ARE CREATED***
passengers_df.to_csv('passengers.csv', index=False)
tickets_df.to_csv('tickets.csv', index=False)
cabins_df.to_csv('cabins.csv', index=False)


In [69]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect("titanic_db.sqlite")
cursor = conn.cursor()

In [None]:
#Create new tables based on the new csv files created above

In [None]:
# Create passengers table ***NEEDS TO BE UPDATED AFTER CSV FILES ARE CREATED***
cursor.execute('''
CREATE TABLE passengers (
    PassengerId INTEGER PRIMARY KEY,
    Survived INTEGER,
    Pclass INTEGER,
    Name TEXT,
    Sex TEXT,
    Age REAL,
    SibSp INTEGER,
    Parch INTEGER,
    Embarked TEXT
);
''')

# Create tickets table
cursor.execute('''
CREATE TABLE tickets (
    PassengerId INTEGER PRIMARY KEY,
    Ticket TEXT,
    Fare REAL,
    FOREIGN KEY (PassengerId) REFERENCES passengers(PassengerId)
);
''')

# Create cabins table
cursor.execute('''
CREATE TABLE cabins (
    PassengerId INTEGER PRIMARY KEY,
    Cabin TEXT,
    FOREIGN KEY (PassengerId) REFERENCES passengers(PassengerId)
);
''')

<sqlite3.Cursor at 0x1ed54f2bdc0>

In [71]:
# Load data from CSV files into tables
def load_csv_to_db(csv_path, table_name):
    df = pd.read_csv(csv_path)
    df.to_sql(table_name, conn, if_exists='append', index=False)

load_csv_to_db('C:/Users/joe/Project3_Practice/passengers.csv', 'passengers')
load_csv_to_db('C:/Users/joe/Project3_Practice/tickets.csv', 'tickets')
load_csv_to_db('C:/Users/joe/Project3_Practice/cabins.csv', 'cabins')

# Verify data loading
for table in ['passengers', 'tickets', 'cabins']:
    cursor.execute(f"SELECT COUNT(*) FROM {table}")
    print(f"Total records in {table}: ", cursor.fetchone()[0])

Total records in passengers:  183
Total records in tickets:  183
Total records in cabins:  183


In [62]:
# Display first 5 rows from passengers table in a pandas DataFrame
passengers_df = pd.read_sql_query("SELECT * FROM passengers LIMIT 5;", conn)
passengers_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Embarked
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,C
1,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,S
2,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,S
3,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,S
4,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,S


In [67]:
# Delete the tables
cursor.execute("DROP TABLE IF EXISTS passengers;")
cursor.execute("DROP TABLE IF EXISTS tickets;")
cursor.execute("DROP TABLE IF EXISTS cabins;")
conn.commit()   

In [68]:
# Commit and close connection
conn.commit()
conn.close()


Additional Library Not Learned in Class

In [None]:
#Replace sklearn with either charts/graphs / bottle / fast api

In [45]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# 1. Load the Data
df = pd.read_csv("Titanic CSV.csv")

# 2. Prepare the Data
# Handle missing values
df['Age'] = df['Age'].fillna(df['Age'].median())
df.dropna(subset=['Embarked'], inplace=True)  # Remove rows with missing Embarked

# Select features
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

# Encode categorical variables
transformer = make_column_transformer(
    (OneHotEncoder(), ['Sex', 'Embarked']),
    remainder='passthrough')

transformed_features = transformer.fit_transform(df[features])
transformed_feature_names = transformer.get_feature_names_out(input_features=features)

X = pd.DataFrame(transformed_features, columns=transformed_feature_names)
y = df[target]

# 3. Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Create a Model
model = LogisticRegression(solver='liblinear', random_state=42)

# 5. Train the Model
model.fit(X_train, y_train)

# 6. Make Predictions
y_pred = model.predict(X_test)

# 7. Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7808988764044944


In [46]:
# print the model coefficients
coefficients = model.coef_[0]
print("Model Coefficients:")
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef:.4f}")

Model Coefficients:
onehotencoder__Sex_female: 2.2467
onehotencoder__Sex_male: -0.3974
onehotencoder__Embarked_C: 0.8462
onehotencoder__Embarked_Q: 0.6416
onehotencoder__Embarked_S: 0.3614
remainder__Pclass: -0.9758
remainder__Age: -0.0368
remainder__SibSp: -0.3628
remainder__Parch: -0.0706
remainder__Fare: 0.0032
