In [1]:
!pip install google-cloud-bigquery pandas-gbq



In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import pandas as pd
from google.cloud import bigquery
from google.colab import auth

# Authenticate this Colab session before creating the BigQuery client.
auth.authenticate_user()

project_id = 'querytest2-444200'

# BigQuery client bound to the project above.
client = bigquery.Client(project=project_id)

# Pull up to 10,000 rows from the public thelook e-commerce orders table.
# NOTE(review): LIMIT without ORDER BY returns an arbitrary subset, so the
# sample is not guaranteed to be identical (or representative) across runs.
query = """
    SELECT *
    FROM `bigquery-public-data.thelook_ecommerce.orders`
    LIMIT 10000
"""

# Submit the query, wait for it to finish, and load the rows into a DataFrame.
orders = client.query(query).result().to_dataframe()

orders.head()

Unnamed: 0,order_id,user_id,status,gender,created_at,returned_at,shipped_at,delivered_at,num_of_item
0,4,4,Cancelled,F,2022-03-11 01:32:00+00:00,NaT,NaT,NaT,1
1,15,15,Cancelled,F,2023-05-04 10:00:00+00:00,NaT,NaT,NaT,1
2,51,39,Cancelled,F,2024-11-21 07:35:00+00:00,NaT,NaT,NaT,1
3,64,47,Cancelled,F,2021-01-20 07:20:00+00:00,NaT,NaT,NaT,1
4,90,76,Cancelled,F,2022-11-19 08:04:00+00:00,NaT,NaT,NaT,1


In [14]:
# Summarize the frame: row count, per-column non-null counts, and dtypes.
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   order_id     10000 non-null  Int64              
 1   user_id      10000 non-null  Int64              
 2   status       10000 non-null  object             
 3   gender       10000 non-null  object             
 4   created_at   10000 non-null  datetime64[us, UTC]
 5   num_of_item  10000 non-null  Int64              
dtypes: Int64(3), datetime64[us, UTC](1), object(2)
memory usage: 498.2+ KB


In [4]:
# Count missing values in each column.
orders.isna().sum()

Unnamed: 0,0
order_id,0
user_id,0
status,0
gender,0
created_at,0
returned_at,10000
shipped_at,9409
delivered_at,9409
num_of_item,0


Nearly all values in the `returned_at`, `shipped_at` and `delivered_at` columns are missing (10,000, 9,409 and 9,409 of 10,000 rows, respectively). Since we have no reliable way to impute them, we will drop these columns:

In [13]:
# List the column labels currently present in the frame.
orders.columns

Index(['order_id', 'user_id', 'status', 'gender', 'created_at', 'num_of_item'], dtype='object')

In [None]:
# Drop the mostly-empty timestamp columns.
# `columns=` is required: a bare list of labels is looked up on the row index
# (axis=0) and raises KeyError. Rebinding `orders` instead of using
# inplace=True, together with errors='ignore', keeps this cell safe to re-run
# once the columns are already gone.
orders = orders.drop(columns=['returned_at', 'shipped_at', 'delivered_at'], errors='ignore')

In [15]:
# Re-count missing values per column to confirm none remain.
orders.isna().sum()

Unnamed: 0,0
order_id,0
user_id,0
status,0
gender,0
created_at,0
num_of_item,0


In [17]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
orders.duplicated().sum()

0

We no longer have missing values or duplicate rows.

In [18]:
# Preview the first rows of the frame before saving it.
orders.head()

Unnamed: 0,order_id,user_id,status,gender,created_at,num_of_item
0,4,4,Cancelled,F,2022-03-11 01:32:00+00:00,1
1,15,15,Cancelled,F,2023-05-04 10:00:00+00:00,1
2,51,39,Cancelled,F,2024-11-21 07:35:00+00:00,1
3,64,47,Cancelled,F,2021-01-20 07:20:00+00:00,1
4,90,76,Cancelled,F,2022-11-19 08:04:00+00:00,1


**Saving CSV file**

In [22]:
# Write the frame to CSV without the index column.
# DataFrame.to_csv returns None when given a path, so its result must not be
# assigned back to `orders` -- doing so would replace the DataFrame with None.
# NOTE(review): assumes Google Drive is already mounted at /content/drive and
# that the target directory exists -- confirm.
orders.to_csv(path_or_buf='/content/drive/MyDrive/2024BusinessAnalysisProject/Data/orders.csv', index=False)