# Project 1. SQL

- Data: Lahman 1871 - 2023 baseball database
- Format: CSV
- Date: 3 Feb 2025

https://cs186.gitbook.io/project/assignments/proj1/your-tasks

In [1]:
pip install pandas sqlalchemy psycopg2

Collecting pandas
  Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting sqlalchemy
  Downloading SQLAlchemy-2.0.38-cp313-cp313-win_amd64.whl.metadata (9.9 kB)
Collecting psycopg2
  Downloading psycopg2-2.9.10-cp313-cp313-win_amd64.whl.metadata (4.8 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy)
  Downloading greenlet-3.1.1-cp313-cp313-win_amd64.whl.metadata (3.9 kB)
Collecting typing-extensions>=4.6.0 (from sqlalchemy)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ----------------------


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import pandas as pd
from sqlalchemy import create_engine

DB_USER = "postgres"
DB_PASS = "221216"
DB_HOST = "localhost"
DB_PORT = "5432"
DB_NAME = "lahman_baseball"

engine = create_engine(f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

In [None]:
def execute(query):
    return pd.read_sql(query, engine)

folder_path = r"D:\github\CS186_Introduction_to_database_systems\RESOURCES\Lahman_baseball_statistics_database\lahman_1871-2023_csv"

csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

existing_tables = execute("""
SELECT table_name FROM information_schema.tables 
WHERE table_schema = 'public'
""")["table_name"].tolist()

In [18]:
for file in csv_files:
    table_name = file.replace(".csv", "").lower()
    
    if table_name in existing_tables:
        print(f"⏭️ Skipping {file}, table {table_name} already exists")
        continue

    file_path = os.path.join(folder_path, file)

    try:
        df = pd.read_csv(file_path, encoding="utf-8", encoding_errors="ignore", 
                         on_bad_lines="skip")  
    except Exception as e:
        print(f"❌ Error loading {file}: {e}")
        continue

    df.to_sql(table_name, engine, if_exists="replace", index=False)
    print(f"✅ Loaded {file} into table {table_name}")

for table in csv_files:
    table_name = table.replace(".csv", "").lower()
    
    col_info = execute(f"""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_name = '{table_name}'
    """)
    
    print(f"📊 Table: {table_name}")

⏭️ Skipping AllstarFull.csv, table allstarfull already exists
⏭️ Skipping Appearances.csv, table appearances already exists
⏭️ Skipping AwardsManagers.csv, table awardsmanagers already exists
⏭️ Skipping AwardsPlayers.csv, table awardsplayers already exists
⏭️ Skipping AwardsShareManagers.csv, table awardssharemanagers already exists
⏭️ Skipping AwardsSharePlayers.csv, table awardsshareplayers already exists
⏭️ Skipping Batting.csv, table batting already exists
⏭️ Skipping BattingPost.csv, table battingpost already exists
⏭️ Skipping CollegePlaying.csv, table collegeplaying already exists
⏭️ Skipping Fielding.csv, table fielding already exists
⏭️ Skipping FieldingOF.csv, table fieldingof already exists
⏭️ Skipping FieldingOFsplit.csv, table fieldingofsplit already exists
⏭️ Skipping FieldingPost.csv, table fieldingpost already exists
⏭️ Skipping HallOfFame.csv, table halloffame already exists
⏭️ Skipping HomeGames.csv, table homegames already exists
⏭️ Skipping Managers.csv, table mana

In [12]:
query = """
SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';
"""

overview = execute(query)
overview

Unnamed: 0,table_name
0,schools
1,seriespost
2,teams
3,teamsfranchises
4,teamshalf
5,allstarfull
6,appearances
7,awardsmanagers
8,awardsplayers
9,awardssharemanagers


In [14]:
query2 = """
SELECT column_name, data_type FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'people';
"""
execute(query2)

Unnamed: 0,column_name,data_type
0,ID,bigint
1,playerID,text
2,birthYear,double precision
3,birthMonth,double precision
4,birthDay,double precision
5,birthCity,text
6,birthCountry,text
7,birthState,text
8,deathYear,double precision
9,deathMonth,double precision


In [15]:
query3 = """
SELECT column_name, data_type FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'batting';
"""
execute(query3)

Unnamed: 0,column_name,data_type
0,playerID,text
1,yearID,bigint
2,stint,bigint
3,teamID,text
4,lgID,text
5,G,bigint
6,G_batting,double precision
7,AB,bigint
8,R,bigint
9,H,bigint


In [16]:
query4 = """
SELECT column_name, data_type FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'pitching';
"""
execute(query4)

Unnamed: 0,column_name,data_type
0,playerID,text
1,yearID,bigint
2,stint,bigint
3,teamID,text
4,lgID,text
5,W,bigint
6,L,bigint
7,G,bigint
8,GS,bigint
9,CG,bigint


In [17]:
query5 = """
SELECT column_name, data_type FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'fielding';
"""
execute(query5)

Unnamed: 0,column_name,data_type
0,playerID,text
1,yearID,bigint
2,stint,bigint
3,teamID,text
4,lgID,text
5,POS,text
6,G,bigint
7,GS,double precision
8,InnOuts,double precision
9,PO,bigint


What's the highest earned run average recorded in baseball history?

In [None]:
query =