In [1]:
# import packages
import pandas as pd
import os
from dotenv import load_dotenv
import mysql.connector

In [4]:
# Read the processed category-to-subcategory mapping from CSV
df = pd.read_csv('../data/processed/Categories.csv')

In [5]:
# Preview the first rows to confirm the file loaded with the expected columns
df.head()

Unnamed: 0,Category,Subcategory
0,Personal Care,health_beauty
1,Personal Care,perfumery
2,Personal Care,diapers_and_hygiene
3,Electronics,computers_accessories
4,Electronics,telephony


In [6]:
# Inspect row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Category     69 non-null     object
 1   Subcategory  69 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB


In [7]:
# Count missing values per column
df.isna().sum()

Category       0
Subcategory    0
dtype: int64

In [8]:
# Show every row that has a missing value in at least one column
# (an empty result means there is nothing to clean up)
df[df.isna().any(axis=1)]

Unnamed: 0,Category,Subcategory


In [9]:
# Count repeated Subcategory values. Subcategory is declared as the table's
# PRIMARY KEY in the load cell below, so this must be 0 or the insert will fail.
df['Subcategory'].duplicated().sum()

0

In [10]:
# Connect to MySQL, create the table if needed, load the data, then disconnect

# Load environment variables (including DB_PASSWORD) from the .env file.
# The password itself is read and validated further down, immediately before
# the connection is opened, so it is not fetched a second time here.
load_dotenv()

# Define the function
def create_and_load_table(connection, table_name, columns_def, df):
    """Create ``table_name`` if it does not exist and insert every row of ``df``.

    Args:
        connection: Open DB-API style connection (a ``mysql.connector``
            connection in this notebook). It is left open for the caller.
        table_name: Name of the target table. It is interpolated directly
            into the SQL text, so it must come from trusted code.
        columns_def: Column definitions placed inside ``CREATE TABLE (...)``.
            Also interpolated verbatim into the SQL text.
        df: DataFrame whose column names are used as the table's column
            names; each row becomes one inserted record.

    Raises:
        Exception: Any error raised while creating the table or inserting is
            re-raised after the pending inserts have been rolled back.
    """
    cursor = connection.cursor()
    try:
        # Create table if it doesn't exist
        create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        {columns_def}
    )
    """
        cursor.execute(create_table_query)

        # Prepare the SQL query to insert data (values go through placeholders)
        columns = ', '.join(map(str, df.columns))
        placeholders = ', '.join(['%s'] * len(df.columns))
        insert_query = f"INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"

        # itertuples() keeps each column's own dtype, whereas iterrows() coerces
        # every row to a single dtype. Missing values (NaN/NaT/NA) become None
        # so the driver stores them as SQL NULL.
        rows = [
            tuple(
                None if pd.api.types.is_scalar(value) and pd.isna(value) else value
                for value in record
            )
            for record in df.itertuples(index=False, name=None)
        ]

        # One batched call instead of a round trip per row; skipped when empty
        if rows:
            cursor.executemany(insert_query, rows)

        # Commit the transaction
        connection.commit()
    except Exception:
        # Discard partially inserted rows so a later commit cannot persist them
        connection.rollback()
        raise
    finally:
        # Always release the cursor, even when a statement fails
        cursor.close()

    
# Get the password from the environment variable
db_password = os.getenv('DB_PASSWORD')

if db_password is None:
    raise ValueError("DB_PASSWORD environment variable is not set")

# input: enter the name of the MySQL table to create and load
table_name = 'categories' #edit this line

# input: define the columns (edit the below)
columns_def = """
Category VARCHAR(100), 
Subcategory VARCHAR(100) PRIMARY KEY
"""

connection = mysql.connector.connect(
    host='localhost',
    user='root',
    password=db_password,
    database='olist_db'
)

try:
    # Call the function
    # NOTE: re-running this cell against an already loaded table raises a
    # duplicate-key error because Subcategory is the PRIMARY KEY.
    create_and_load_table(connection, table_name, columns_def, df)
finally:
    # Close the connection even if the load fails, so it is never leaked
    connection.close()