# EDA with SQLAlchemy

## Get dataset from [Titanic extended dataset (Kaggle + Wikipedia)](https://www.kaggle.com/pavlofesenko/titanic-extended)

## References
- [SQLAlchemy ORM Tutorial for Python Developers](https://auth0.com/blog/sqlalchemy-orm-tutorial-for-python-developers/?utm_source=medium&utm_medium=sc&utm_campaign=sqlalchemy_python)
- [SQL로 맛보는 데이터 전처리 분석](http://www.yes24.com/Product/Goods/86544423)

# Load packages

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import sqlalchemy as db
import pymysql
from _secrets import PASSWORD
from pathlib import Path

# Connect to MySQL DB

In [3]:
# Connection settings for the local MySQL instance used throughout the notebook.
IP = "localhost"
DB_NAME = "testdb"
TABLE_NAME = "titanic"
USER = "root"
# NOTE(review): PASSWORD is interpolated into the URL verbatim, so it presumably
# has to be URL-safe already (e.g. "@" or "/" percent-encoded) - confirm.
db_url = f"mysql+pymysql://{USER}:{PASSWORD}@{IP}/{DB_NAME}"
engine = db.create_engine(db_url)
connection = engine.connect()
# Engine.table_names() is deprecated since SQLAlchemy 1.4 (removed in 2.0);
# the Inspector API is the supported way to list the tables of a database.
print(db.inspect(engine).get_table_names())

['test_table', 'titanic', 'user']


  


# Inject data 

In [4]:
# Read the extended Titanic CSV into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
filepath = Path("archive/full.csv")
df = pd.read_csv(filepath)

In [5]:
# To resolve the unicode error. For details, see the link below
# https://stackoverflow.com/questions/65012603/removing-rows-contains-non-english-words-in-pandas-dataframe
def is_ascii(s):
    """Return False only when ``s`` is a string containing non-ASCII characters.

    Args:
        s: The cell value to check. Usually a ``str``; anything else (for
            example the float NaN pandas uses for a missing cell, or None)
            is accepted.

    Returns:
        True for ASCII-only strings and for every non-string value,
        False for strings with at least one non-ASCII character.
    """
    # Non-strings are let through explicitly instead of relying on a bare
    # ``except:`` to swallow the AttributeError from ``.encode``.
    if not isinstance(s, str):
        return True
    try:
        s.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True

target_columns = ["Name", "Name_wiki", "Hometown", "Boarded", "Destination"]

# Narrow df one column at a time, keeping only the rows that is_ascii accepts
# for that column.
for column in target_columns:
    keep_rows = df[column].map(is_ascii)
    df = df[keep_rows]

Name
Name_wiki
Hometown
Boarded
Destination


In [6]:
# (Re)create the table from the filtered frame. if_exists="replace" makes pandas
# drop an existing table before inserting, so the cell can be re-run without a
# hand-built DROP TABLE statement or the deprecated Engine.execute().
df.to_sql(TABLE_NAME, con=engine, index=True, if_exists="replace")

# 요인별 생존 여부 관계

In [50]:
metadata = db.MetaData()
# Reflect the table that was just written. Passing autoload_with is enough to
# trigger reflection; the separate autoload=True flag is deprecated.
titanic = db.Table(TABLE_NAME, metadata, autoload_with=engine)
# Column keys are the bare column names (no "titanic." prefix), so they can be
# used directly as DataFrame column labels.
columns = list(titanic.columns.keys())

In [52]:
def execute_query(query: str) -> pd.DataFrame:
    """Run ``query`` on the notebook's open connection and return a DataFrame.

    Column labels are taken from the result itself, so the helper works for
    any SELECT (aggregates, column subsets, aliases), not only ``SELECT *``
    on the titanic table.

    Args:
        query: The statement handed to ``connection.execute``.

    Returns:
        One row per result row. If the result has an ``index`` column (the
        one written by ``df.to_sql(..., index=True)``), it becomes the
        DataFrame index; otherwise a default index is used.
    """
    result_proxy = connection.execute(query)
    # Read the labels before fetching: fetchall() exhausts the result.
    result_columns = list(result_proxy.keys())
    result_df = pd.DataFrame(result_proxy.fetchall(), columns=result_columns)
    if "index" in result_columns:
        result_df = result_df.set_index("index")
    return result_df

## 성별

In [53]:
# Preview up to 10 rows of the titanic table.
query = "SELECT * FROM titanic LIMIT 10;"
execute_query(query)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,WikiId,Name_wiki,Age_wiki,Hometown,Boarded,Destination,Lifeboat,Body,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,S,691.0,"Braund, Mr. Owen Harris",22.0,"Bridgerule, Devon, England",Southampton,"Qu'Appelle Valley, Saskatchewan, Canada",,,3.0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,S,627.0,"Allen, Mr. William Henry",35.0,"Birmingham, West Midlands, England",Southampton,New York City,,,3.0
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,...,Q,785.0,"Doherty, Mr. William John (aka ""James Moran"")",22.0,"Cork, Ireland",Queenstown,New York City,,,3.0
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,...,S,200.0,"McCarthy, Mr. Timothy J.",54.0,"Dorchester, Massachusetts, US",Southampton,"Dorchester, Massachusetts, US",,175MB,1.0
11,12,1.0,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,...,S,35.0,"Bonnell, Miss Elizabeth",61.0,"Youngstown, Ohio, US",Southampton,"Youngstown, Ohio, US",8,,1.0
12,13,0.0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.05,...,S,1196.0,"Saundercock, Mr. William Henry",19.0,"St Austell, Cornwall, England",Southampton,New York City,,,3.0
16,17,0.0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,...,Q,1151.0,"Rice, Master Eugene Francis",2.0,"Athlone, Westmeath, Ireland",Queenstown,"Spokane, Washington, US",,,3.0
17,18,1.0,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,...,S,604.0,"Williams, Mr. Charles Eugene",23.0,"Harrow, London, England",Southampton,"Chicago, Illinois, US",14,,2.0
19,20,1.0,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,...,C,1053.0,"Muslamani, Mrs. Fatimah",22.0,"Tebnine, Lebanon",Cherbourg,"Michigan City, Indiana, US",C,,3.0
20,21,0.0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0,...,S,416.0,"Fynney, Mr. Joseph J.",35.0,"Liverpool, Merseyside, England, UK",Southampton,"Montreal, Quebec, Canada",,322M,2.0


In [86]:
# Raw SQL equivalent:
#   SELECT COUNT(PassengerId) N_PASSENGERS,
#          COUNT(DISTINCT PassengerId) N_D_PASSENGERS
#   FROM titanic;
passenger_id = titanic.columns.PassengerId
count_labels = ["N_PASSENGERS", "N_D_PASSENGERS"]
query = db.select(
    [
        db.func.count(passenger_id).label(count_labels[0]),
        db.func.count(passenger_id.distinct()).label(count_labels[1]),
    ]
)
result_proxy = connection.execute(query)
result_set = result_proxy.fetchall()
pd.DataFrame(result_set, columns=count_labels)

Unnamed: 0,N_PASSENGERS,N_D_PASSENGERS
0,911,911
