In [4]:
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
"""
Attempts to load a CSV file with error handling.

Args:
    filename (str): The path to the CSV file.

Returns:
    pandas.DataFrame: The loaded DataFrame if successful, None otherwise.
"""

# Define error handling function for file loading
def handle_file_loading_error(filename):
    try:
        data = pd.read_csv(filename,delimiter=';')
        return data
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found. Please check the file path.")
    except pd.errors.ParserError as e:
        print(f"Error parsing CSV file '{filename}': {e}")
    return None  # Return None to indicate an error occurred

# Load the CSV data with error handling
data = handle_file_loading_error("sales_data.csv")
print(data.head())  # Display the first few rows


   TransactionID  ProductID  Quantity SalePrice  PurchasePrice
0              1     1124.0       5.0     13.38           9.91
1              2     1146.0      14.0     26.88          25.60
2              3     1128.0       8.0     15.89          13.24
3              4     1101.0       8.0       2.9           2.42
4              5     1112.0       2.0      7.23           4.82


In [9]:
#rawdata meta-information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2574 entries, 0 to 2573
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TransactionID  2574 non-null   int64  
 1   ProductID      2573 non-null   float64
 2   Quantity       2573 non-null   float64
 3   SalePrice      2573 non-null   object 
 4   PurchasePrice  2574 non-null   float64
dtypes: float64(3), int64(1), object(1)
memory usage: 100.7+ KB


In [10]:
#dataset shape
print("dataset dimension:")
print(data.shape)
#null values
print("\nnull values in each column:")
print(data.isnull().sum())  # Count of null values in each column
print('\nnoisy and garbage data:')
print(data[data.isnull().any(axis=1) | data.apply(pd.isna, axis=1).any(axis=1)])
print(data[data['SalePrice'] == '-'])


dataset dimension:
(2574, 5)

null values in each column:
TransactionID    0
ProductID        1
Quantity         1
SalePrice        1
PurchasePrice    0
dtype: int64

noisy and garbage data:
      TransactionID  ProductID  Quantity SalePrice  PurchasePrice
471             470     1130.0       NaN     35.49          28.39
2011           2008     1133.0      11.0       NaN          16.39
2060           2057        NaN       9.0     32.65          28.39
      TransactionID  ProductID  Quantity SalePrice  PurchasePrice
2573           2570     1190.0       5.0         -            6.1


In [11]:
#removing duplicate rows
# Remove duplicate rows based on 'TransactionID' while keeping the first occurrence
data = data.drop_duplicates(subset='TransactionID', keep='first')
#fixing null values
# Filter null rows with purchasePrice 28.39
# Find rows with missing ProductID and purchase price 28.39
filtered_df = data[(data['ProductID'].isnull()) | (data['PurchasePrice'] == 28.39)]

# Assuming there's only one missing value (replace with your logic if needed)
if not filtered_df.empty:
  missing_index = filtered_df.index[0]  # Get the index of the row with missing value
  product_id = data[data['PurchasePrice'] == 28.39]['ProductID'].mode().iloc[0]  # Find the most frequent product ID for purchase price 28.39
  data.loc[missing_index, 'ProductID'] = product_id  # Update the missing value with the mode

#dropping other 2 rows as,to find the missing quantity and salesprice we need complete order transaction details
raw_null_fix=data.dropna()


# Drop rows with '-' in PurchasePrice
silver_data = raw_null_fix.drop(raw_null_fix[raw_null_fix['SalePrice'] == '-'].index)

# Convert 'ProductID' and 'Quantity' to integer
silver_data['ProductID'] = silver_data['ProductID'].astype(int)
silver_data['Quantity'] = silver_data['Quantity'].astype(int)
silver_data['SalePrice'] = silver_data['SalePrice'].astype(float)

In [15]:
#final cleaned dataset inforamtion
print(silver_data.tail())
silver_data.info()
# Save the DataFrame to a CSV file named 'sales_data_cleaned.csv' in the current working directory (pwd)
silver_data.to_csv('sales_data_cleaned.csv', index=False)
print("\nDataframe saved successfully locally!")


      TransactionID  ProductID  Quantity  SalePrice  PurchasePrice
2568           2565       1107         9       3.44           2.99
2569           2566       1100         4       2.80           2.00
2570           2567       1146         8      30.72          25.60
2571           2568       1132         3      18.70          12.90
2572           2569       1137        15       2.21           2.10
<class 'pandas.core.frame.DataFrame'>
Index: 2566 entries, 0 to 2572
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TransactionID  2566 non-null   int64  
 1   ProductID      2566 non-null   int32  
 2   Quantity       2566 non-null   int32  
 3   SalePrice      2566 non-null   float64
 4   PurchasePrice  2566 non-null   float64
dtypes: float64(2), int32(2), int64(1)
memory usage: 100.2 KB

Dataframe saved successfully locally!


In [16]:
# Calculate the average Sale Price for each combination of ProductID and Quantity
average_sale_price_df = silver_data.groupby(['ProductID', 'Quantity'])['SalePrice'].mean().reset_index(name='AverageSalePrice')

# Display the new DataFrame
print(average_sale_price_df)

     ProductID  Quantity  AverageSalePrice
0         1100         2               3.0
1         1100         3               2.9
2         1100         4               2.8
3         1100         5               2.7
4         1100         6               2.6
..         ...       ...               ...
662       1150        12               2.1
663       1150        13               2.1
664       1150        14               2.1
665       1150        20               2.1
666       1150        21               2.1

[667 rows x 3 columns]
