In [3]:
# The dataset gives us electronics sales data at Amazon. 

# It contains user ratings for various electronics items sold, along with the category of each item and the time of sale.

# The dataset is available at https://www.kaggle.com/datasets/edusanketdk/electronics

# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [4]:
# visualization

import seaborn as sns


In [8]:
# Read the raw electronics ratings CSV into a DataFrame.
# The path is relative, so it resolves against the current working directory.

dataset = pd.read_csv(filepath_or_buffer=r'../data/raw/electronics.csv')



In [9]:
# Preview the first five rows to check that the file was parsed as expected.

dataset.head(n=5)

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
0,0,0,5.0,1999-06-13,Female,Portable Audio & Video,,1999,,0
1,0,1,5.0,1999-06-14,Female,Portable Audio & Video,,1999,,0
2,0,2,3.0,1999-06-17,Female,Portable Audio & Video,,1999,,0
3,0,3,1.0,1999-07-01,Female,Portable Audio & Video,,1999,,0
4,0,4,2.0,1999-07-06,Female,Portable Audio & Video,,1999,,0


In [10]:
# Preview the last five rows as well, to see how the most recent records look.

dataset.tail(n=5)

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
1292949,9478,1157628,1.0,2018-09-26,Female,Headphones,Etre Jeune,2017,,0
1292950,9435,1157629,5.0,2018-09-26,Female,Computers & Accessories,,2017,,0
1292951,9305,1157630,3.0,2018-09-26,Female,Computers & Accessories,,2016,,0
1292952,9303,1157631,5.0,2018-09-29,Male,Headphones,,2018,,0
1292953,9478,1157632,1.0,2018-10-01,Female,Headphones,Etre Jeune,2017,Female,0


In [11]:
# Size of the frame as a (number of rows, number of columns) tuple.

dataset.shape

(1292954, 10)

In [12]:
# Inspect the schema: for every column, info() prints its name, the number of
# non-null entries and its dtype, followed by the frame's memory usage.
# Comparing the non-null counts with the total row count shows which columns
# have missing values.

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1292954 entries, 0 to 1292953
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   item_id     1292954 non-null  int64  
 1   user_id     1292954 non-null  int64  
 2   rating      1292954 non-null  float64
 3   timestamp   1292954 non-null  object 
 4   model_attr  1292954 non-null  object 
 5   category    1292954 non-null  object 
 6   brand       331120 non-null   object 
 7   year        1292954 non-null  int64  
 8   user_attr   174124 non-null   object 
 9   split       1292954 non-null  int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 98.6+ MB


In [13]:
# dataset.info() above reports 1,292,954 rows and 10 columns:
#
#   item_id     int64
#   user_id     int64
#   rating      float64
#   timestamp   object
#   model_attr  object
#   category    object
#   brand       object   (only 331,120 non-null values)
#   year        int64
#   user_attr   object   (only 174,124 non-null values)
#   split       int64
#
# Every column is fully populated except brand and user_attr, which are
# mostly null.
#
# timestamp is stored as object (date strings such as '1999-06-13'), so parse
# it into datetime64. pd.to_datetime returns a new Series and does not modify
# the frame, so the result has to be assigned back to the column.

from datetime import datetime  # not needed for the conversion below; kept in case other cells rely on it

dataset['timestamp'] = pd.to_datetime(dataset['timestamp'])

# Display the converted column to confirm it now has dtype datetime64[ns].
dataset['timestamp']

0         1999-06-13
1         1999-06-14
2         1999-06-17
3         1999-07-01
4         1999-07-06
             ...    
1292949   2018-09-26
1292950   2018-09-26
1292951   2018-09-26
1292952   2018-09-29
1292953   2018-10-01
Name: timestamp, Length: 1292954, dtype: datetime64[ns]

In [14]:
# brand is an object column holding text, but most rows have no brand
# (331,120 non-null values out of 1,292,954 according to dataset.info()).
#
# astype(str) would turn every missing value into the literal string 'nan',
# making it indistinguishable from a real brand name. Use pandas' nullable
# string dtype instead, which keeps missing values as <NA> so that isna(),
# dropna(), fillna() and friends keep working on this column.

dataset['brand'] = dataset['brand'].astype('string')

In [15]:
# category is reported as dtype object by dataset.info() and has no missing
# values. Cast it explicitly so that every entry is guaranteed to be a str.

dataset['category'] = dataset['category'].astype(dtype=str)

In [16]:
# rating is already float64 according to dataset.info(); the explicit cast
# leaves the values unchanged and simply pins the column to a float dtype.

dataset['rating'] = dataset['rating'].astype('float64')

In [17]:
# user_id is read as int64, but it labels a user rather than measuring
# anything. Store it as text so it is not treated as a numeric column.

dataset['user_id'] = dataset['user_id'].astype('str')

In [18]:
# item_id is read as int64, but it labels a product rather than measuring
# anything. Store it as text so it is not treated as a numeric column.

dataset['item_id'] = dataset['item_id'].astype(dtype='str')

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# that describe() treats as numeric by default.
# item_id and user_id were cast to strings in the cells above, so they are
# not summarised here.

dataset.describe()

Unnamed: 0,rating,year,split
count,1292954.0,1292954.0,1292954.0
mean,4.051482,2012.938,0.1747587
std,1.379732,2.643513,0.550681
min,1.0,1999.0,0.0
25%,4.0,2012.0,0.0
50%,5.0,2014.0,0.0
75%,5.0,2015.0,0.0
max,5.0,2018.0,2.0
