In [1]:
# import packages
import pandas as pd
import os
from dotenv import load_dotenv
import mysql.connector

In [2]:
# Read the CSV file, specifying the latitude and longitude columns as strings
df = pd.read_csv('../data/raw/olist_order_reviews_dataset.csv')

In [3]:
df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_comment_title     11568 non-null  object
 4   review_comment_message   40977 non-null  object
 5   review_creation_date     99224 non-null  object
 6   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB


In [5]:
df.isna().sum()

review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64

In [6]:
# assess missing values
df[order_reviews.isna().any(axis=1)]

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53
...,...,...,...,...,...,...,...
99219,574ed12dd733e5fa530cfd4bbf39d7c9,2a8c23fee101d4d5662fa670396eb8da,5,,,2018-07-07 00:00:00,2018-07-14 17:18:30
99220,f3897127253a9592a73be9bdfdf4ed7a,22ec9f0669f784db00fa86d035cf8602,5,,,2017-12-09 00:00:00,2017-12-11 20:06:42
99221,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,,"Excelente mochila, entrega super rápida. Super...",2018-03-22 00:00:00,2018-03-23 09:10:43
99222,1adeb9d84d72fe4e337617733eb85149,7725825d039fc1f0ceb7635e3f7d9206,4,,,2018-07-01 00:00:00,2018-07-02 12:59:13


In [7]:
# check for duplicate primary key
df['review_id'].duplicated().sum()

814

In [8]:
dups = order_reviews[df['review_id'].duplicated()]

# appears to be multiple orders per review_id

In [9]:
#check for duplicate review_id + order_id
duplicates = df.duplicated(subset=['review_id','order_id'], keep=False)

duplicates.sum()

0

In [None]:
# Replace NaN with None
import numpy as np
df = df.replace({np.nan: None})

In [10]:
# Create table, login and out of MySQL, and load data

# Load environment variables from .env file
load_dotenv()

# Get the password from the environment variable
db_password = os.getenv('DB_PASSWORD')

# Define the function
def create_and_load_table(connection, table_name, columns_def, df):
    cursor = connection.cursor()

    # Create table if it doesn't exist
    create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        {columns_def}
    )
    """
    cursor.execute(create_table_query)

    # Prepare the SQL query to insert data
    columns = ', '.join(df.columns)
    placeholders = ', '.join(['%s'] * len(df.columns))
    insert_query = f"INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"

    # Insert DataFrame values into MySQL table
    for index, row in df.iterrows():
        cursor.execute(insert_query, tuple(row))

    # Commit the transaction
    connection.commit()

    # Close the cursor
    cursor.close()

    
# Get the password from the environment variable
db_password = os.getenv('DB_PASSWORD')

if db_password is None:
    raise ValueError("DB_PASSWORD environment variable is not set")

connection = mysql.connector.connect(
    host='localhost',
    user='root',
    password=db_password,
    database='olist_db'
)

# input: enter the the variable name for the desired dataframe to load
table_name = 'order_reviews' #edit this line

# input: define the columns (edit the below)
columns_def = """
review_id VARCHAR(200), 
order_id VARCHAR(200), 
review_score INT, 
review_comment_title VARCHAR(200), 
review_comment_message VARCHAR(255), 
review_creation_date DATETIME, 
review_answer_timestamp DATETIME, 
PRIMARY KEY (review_id, order_id)
"""

# Call the function
create_and_load_table(connection, table_name, columns_def, df)

# Close the connection
connection.close()

ProgrammingError: 1054 (42S22): Unknown column 'nan' in 'field list'