In [2]:
import pandas as pd
import numpy as np

In [3]:
# Relative path to the 2016-2017 retail sales CSV used in this notebook.
path_retail = 'Pandas Course Resources/retail/retail_2016_2017.csv'

# Read the CSV into a DataFrame, then preview its first ten rows.
retail_df = pd.read_csv(filepath_or_buffer=path_retail)
retail_df.head(n=10)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,1945944,2016-01-01,1,AUTOMOTIVE,0.0,0
1,1945945,2016-01-01,1,BABY CARE,0.0,0
2,1945946,2016-01-01,1,BEAUTY,0.0,0
3,1945947,2016-01-01,1,BEVERAGES,0.0,0
4,1945948,2016-01-01,1,BOOKS,0.0,0
5,1945949,2016-01-01,1,BREAD/BAKERY,0.0,0
6,1945950,2016-01-01,1,CELEBRATION,0.0,0
7,1945951,2016-01-01,1,CLEANING,0.0,0
8,1945952,2016-01-01,1,DAIRY,0.0,0
9,1945953,2016-01-01,1,DELI,0.0,0


##### <b> Identifying Duplicate Rows </b><br> The `.duplicated()` method identifies duplicate rows of data - this means every column of that row is duplicated exactly in another row <br> - specify `subset=column(s)` to look for duplicates across a subset of columns (so only the values in those column(s) are compared) <br> If the number of unique values (`.nunique()`) in a column is less than the total number of rows, then that column contains duplicate values (note that `.nunique()` ignores missing values by default)


In [4]:
# Small demo frame, written one row per record so the duplicates are easy to spot:
# rows 0 and 1 are exact duplicates, while row 2 repeats only the 'product' value.
products_df = pd.DataFrame(
    [
        ('Dairy', 2.56),
        ('Dairy', 2.56),
        ('Dairy', 4.55),
        ('Vegetable', 2.74),
        ('Fruits', 5.44),
    ],
    columns=['product', 'price'],
)
products_df

Unnamed: 0,product,price
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetable,2.74
4,Fruits,5.44


In [5]:
# Shape is (rows, columns); the row count is the baseline that the per-column unique counts are compared against.
products_df.shape

(5, 2)

In [6]:
# Count the distinct values in every column (axis=0), leaving missing values out (dropna=True).
# Both counts (product: 3, price: 4) are below the row count (5), so each column holds repeated values.
products_df.nunique(axis=0, dropna=True)


product    3
price      4
dtype: int64

In [7]:
# Compare whole rows (subset=None). With keep='first' the first occurrence is left unflagged,
# so only row 1 comes back True: it matches row 0 in every column.
products_df.duplicated(subset=None, keep='first')

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [8]:
# Restrict the comparison to the 'product' column via subset=; every repeat of a product
# after its first occurrence (keep='first') is flagged, whatever its price.
products_df.duplicated(subset=['product'], keep='first')

0    False
1     True
2     True
3    False
4    False
dtype: bool

##### <b>Drop Duplicate Rows </b><br> The `.drop_duplicates()` method drops duplicate rows (rows that are exact duplicates of each other) <br> - specify `subset=column(s)` to drop duplicate rows based on specific column(s)<br> Passable arguments: - `keep=`('first', 'last', False): 'first' is the default, False drops all duplicates <br> - `inplace=`(True, False): False is the default; True modifies the current DataFrame, False returns a new DataFrame with the duplicates removed <br> - `ignore_index=`(True, False): False is the default; True re-numbers the resulting DataFrame's index starting at 0


In [9]:
# De-duplicate on the 'product' column only:
#   keep='last'        -> retain the final occurrence of each product instead of the first
#   inplace=False      -> return a new DataFrame; products_df itself is left untouched
#   ignore_index=True  -> re-number the result's index starting at 0
products_df.drop_duplicates(
    subset=['product'],
    keep='last',
    inplace=False,
    ignore_index=True,
)

Unnamed: 0,product,price
0,Dairy,4.55
1,Vegetable,2.74
2,Fruits,5.44


In [10]:
# The original DataFrame still contains all five rows: drop_duplicates() was called without inplace=True, so it returned a new DataFrame.
products_df

Unnamed: 0,product,price
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetable,2.74
4,Fruits,5.44


##### <b> Identifying Missing Data </b><br> Missing values can be counted per column by chaining the `.isna()` and `.sum()` methods. The `.info()` method also reveals null values through its non-null counts

In [17]:
# Build a demo DataFrame from a {column: values} dictionary that contains both flavours of
# missing value: pandas' pd.NA and NumPy's NaN.
# Use the canonical lowercase `np.nan`: the `np.NaN` / `np.NAN` aliases were removed in
# NumPy 2.0 and raise AttributeError there.
# Note: mixing pd.NA into the list of floats makes 'price' an object-dtype column.
productsNA_df = pd.DataFrame(
    {'product': [pd.NA, 'Dairy', 'Dairy', np.nan, 'Fruits'],
    'price': [2.56, pd.NA, 4.55, 2.74, np.nan],
    'product_id': [1, 2, 3, 4, 5]
    }
)
productsNA_df

Unnamed: 0,product,price,product_id
0,,2.56,1
1,Dairy,,2
2,Dairy,4.55,3
3,,2.74,4
4,Fruits,,5


In [18]:
# .isna() flags both pd.NA and NumPy NaN as missing; .sum() then counts the flagged cells per column.
# This is the recommended way to count missing values.
productsNA_df.isna().sum()

product       2
price         2
product_id    0
dtype: int64

In [19]:
# .info() lists a non-null count per column; a count below the number of rows (5 here) means that column has missing values.
productsNA_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   product     3 non-null      object
 1   price       3 non-null      object
 2   product_id  5 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 252.0+ bytes


##### <b> Handling Missing Data </b><br> The `.dropna()` method removes rows with NA values and the `.fillna()` method replaces NA values with new values <br> A plain `.fillna(value)` call fills the NAs in every column with the same value, which can be an issue <br> To replace values in specific columns, pass a dictionary to the `.fillna()` method - `.fillna({'key': 'value'})`

In [28]:
# Pass a {column: replacement} dictionary to .fillna() so each column gets its own fill value
# (several column: value pairs can be listed, separated by commas). The call returns a new
# DataFrame; productsNA_df is not altered unless the result is assigned back to it or to a new name.
# 'price' is an object-dtype column, and relying on .fillna() to silently downcast it to float is
# deprecated in pandas 2.2+; .infer_objects() makes the numeric result dtype explicit.
# NOTE(review): the warning residue printed under this cell is presumably that FutureWarning - confirm.
productsNA_df.fillna({'product': 'Unknown', 'price': 0}).infer_objects()


  productsNA_df.fillna({'product':'Unknown', 'price':0 })


Unnamed: 0,product,price,product_id
0,Unknown,2.56,1
1,Dairy,0.0,2
2,Dairy,4.55,3
3,Unknown,2.74,4
4,Fruits,0.0,5


In [29]:
# Keep the filled result under a new name so the cleaned frame can be reused while
# productsNA_df stays as it was.
# 'price' is an object-dtype column, and relying on .fillna() to silently downcast it to float is
# deprecated in pandas 2.2+; .infer_objects() makes the numeric result dtype explicit.
new_products_df = productsNA_df.fillna({'product': 'Unknown', 'price': 0}).infer_objects()
new_products_df

  new_products_df = productsNA_df.fillna({'product':'Unknown', 'price':0 })


Unnamed: 0,product,price,product_id
0,Unknown,2.56,1
1,Dairy,0.0,2
2,Dairy,4.55,3
3,Unknown,2.74,4
4,Fruits,0.0,5


In [30]:
# The original NA DataFrame is unchanged: .fillna() returned a new DataFrame rather than modifying productsNA_df.
productsNA_df

Unnamed: 0,product,price,product_id
0,,2.56,1
1,Dairy,,2
2,Dairy,4.55,3
3,,2.74,4
4,Fruits,,5
