In [1]:
# imports
import sqlite3
import pandas as pd
import psycopg2

In [2]:
# Create database file if it doesn't exist
with sqlite3.connect('titanic.sqlite3') as conn:
    
    # 1. Read csv file, change column names
    df = pd.read_csv('titanic.csv', names = ('survived', 'pclass', 'name', 'sex', 'age', 'sib_sp_aboard', 'par_ch_aboard', 'fare'), header = 0)
    
    # 2. DROP TABLE review IF EXISTS
    drop_query = 'DROP TABLE IF EXISTS review'
    conn.cursor().execute(drop_query)
    
    # 3. INSERT TABLE review
    df.to_sql('titanic', conn, index=False)
    query = 'SELECT * FROM titanic'
    df = pd.read_sql(query, conn)
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sib_sp_aboard,par_ch_aboard,fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [3]:
# create function to create SQL connection to database
def create_connection(db_file, verbose=False):
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        if verbose:
            print(f'Using SQLite version: {sqlite3.version}')
            print(f'Creating Connection to {db_file}...')
        return conn
    except sqlite3.Error as e:
        print(e)

In [4]:
# create function to query a database
def select_all_query(db_file, query, verbose=False):

    conn = create_connection(db_file, verbose)
    cur = conn.cursor()
    if not query.startswith('SELECT'):
        raise ValueError('Query should begin with `SELECT`')
    
    cur.execute(query)
    rows = cur.fetchall()

    if verbose:
        for row in rows:
            print(row)

    return rows

In [5]:
# How many total Characters are there?
tot_surv = select_all_query('titanic.sqlite3', 'SELECT COUNT (*) FROM titanic WHERE survived IS 1')
print('Total Survivors:', tot_surv[0][0])

Total Survivors: 342


In [6]:
# How many total Characters are there?
tot_deaths = select_all_query('titanic.sqlite3', 'SELECT COUNT (*) FROM titanic WHERE survived IS 0')
print('Total Deaths:', tot_deaths[0][0])

Total Deaths: 545


In [7]:
# How many passengers survived/died within each class?
tot_surv_pclass1 = select_all_query('titanic.sqlite3', 'SELECT COUNT (*) FROM titanic \
                               WHERE survived IS 0 \
                               AND Pclass IS 1')

tot_death_pclass1 = select_all_query('titanic.sqlite3', 'SELECT COUNT (*) FROM titanic \
                               WHERE survived IS 1 \
                               AND Pclass IS 1')

tot_surv_pclass2 = select_all_query('titanic.sqlite3', 'SELECT COUNT (*) FROM titanic \
                               WHERE survived IS 0 \
                               AND Pclass IS 2')

tot_death_pclass2 = select_all_query('titanic.sqlite3', 'SELECT COUNT (*) FROM titanic \
                               WHERE survived IS 1 \
                               AND Pclass IS 2')

tot_surv_pclass3 = select_all_query('titanic.sqlite3', 'SELECT COUNT (*) FROM titanic \
                               WHERE survived IS 0 \
                               AND Pclass IS 3')

tot_death_pclass3 = select_all_query('titanic.sqlite3', 'SELECT COUNT (*) FROM titanic \
                               WHERE survived IS 1 \
                               AND Pclass IS 3')

print('Total Survivors, PClass One:', tot_surv_pclass1[0][0])
print('Total Deaths, PClass One:', tot_death_pclass1[0][0], '\n')
print('Total Survivors, PClass Two:', tot_surv_pclass2[0][0])
print('Total Deaths, PClass Two:', tot_death_pclass2[0][0], '\n')
print('Total Survivors, PClass Three:', tot_surv_pclass3[0][0])
print('Total Deaths, PClass Three:', tot_death_pclass3[0][0], '\n')

Total Survivors, PClass One: 80
Total Deaths, PClass One: 136 

Total Survivors, PClass Two: 97
Total Deaths, PClass Two: 87 

Total Survivors, PClass Three: 368
Total Deaths, PClass Three: 119 



In [8]:
# What was the average age of survivors vs nonsurvivors?

# create query to find average age of survivors vs non-survivors
query = '''SELECT survived, AVG(age)
FROM titanic
GROUP BY survived;'''

# connect to db
conn = create_connection('titanic.sqlite3')

# create dataframe
df = pd.read_sql(query, conn)
df

Unnamed: 0,survived,AVG(age)
0,0,30.138532
1,1,28.408392


In [9]:
# What was the average age of each passenger class?

# create query to find average age by plcass
query = '''SELECT pclass, AVG(age)
FROM titanic
GROUP BY pclass;'''

# connect to db
conn = create_connection('titanic.sqlite3')

# create dataframe
df = pd.read_sql(query, conn)
df

Unnamed: 0,pclass,AVG(age)
0,1,38.788981
1,2,29.868641
2,3,25.188747


In [10]:
# What was the average fare by passenger class?

query = '''SELECT pclass, AVG(fare)
FROM titanic
GROUP BY pclass;'''

# connect to db
conn = create_connection('titanic.sqlite3')

# create dataframe for average fare by Pclass
df = pd.read_sql(query, conn)
df

Unnamed: 0,pclass,AVG(fare)
0,1,84.154687
1,2,20.662183
2,3,13.707707


In [11]:
# What was the average fare by survival?

query = '''SELECT survived, AVG(fare)
FROM titanic
GROUP BY survived;'''

# connect to db
conn = create_connection('titanic.sqlite3')

# create dataframe for average fare by survival
df = pd.read_sql(query, conn)
df

Unnamed: 0,survived,AVG(fare)
0,0,22.208584
1,1,48.395408


In [12]:
# How many siblings/spouses aboard on average, by passenger class?

query = '''SELECT pclass, AVG(sib_sp_aboard)
FROM titanic
GROUP BY pclass;'''

# connect to db
conn = create_connection('titanic.sqlite3')

# create dataframe for average #sib/sp by pclass
df = pd.read_sql(query, conn)
df


Unnamed: 0,pclass,AVG(sib_sp_aboard)
0,1,0.416667
1,2,0.402174
2,3,0.620123


In [13]:
# How many siblings/spouses aboard on average, by survival?

query = '''SELECT survived, AVG(sib_sp_aboard)
FROM titanic
GROUP BY survived;'''

# connect to db
conn = create_connection('titanic.sqlite3')

# create dataframe for average #of siblings/spouses by survival
df = pd.read_sql(query, conn)
df

Unnamed: 0,survived,AVG(sib_sp_aboard)
0,0,0.557798
1,1,0.473684


In [14]:
# How many parents/children aboard on average, by passenger class?

query = '''SELECT pclass, AVG(par_ch_aboard)
FROM titanic
GROUP BY pclass;'''

# connect to db
conn = create_connection('titanic.sqlite3')

# create dataframe for average #par/ch aboard by pclass
df = pd.read_sql(query, conn)
df

Unnamed: 0,pclass,AVG(par_ch_aboard)
0,1,0.356481
1,2,0.380435
2,3,0.396304


In [15]:
# How many parents/children aboard on average, by survival?

query = '''SELECT survived, AVG(par_ch_aboard)
FROM titanic
GROUP BY survived;'''

# connect to db
conn = create_connection('titanic.sqlite3')

# create dataframe for average #par/ch aboard by survival
df = pd.read_sql(query, conn)
df

Unnamed: 0,survived,AVG(par_ch_aboard)
0,0,0.33211
1,1,0.464912


In [16]:
# Do any passengers have the same name?
query = '''SELECT name FROM titanic WHERE name NOT IN (SELECT DISTINCT name FROM titanic);
'''
# connect to db
conn = create_connection('titanic.sqlite3')

# select distinct names from dataframe
df = pd.read_sql(query, conn)
df

Unnamed: 0,name
