In [28]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime
import numpy as np

# Load datasets
category_data = pd.read_csv('Category_Data.csv')
department_data = pd.read_csv('Department_Data.csv')
store_data = pd.read_csv('Store_Data.csv')
sales_data = pd.read_csv('General Sales Data.csv')

# Data Cleaning
# Check for missing values
missing_values = sales_data.isnull().sum()
print("Missing Values:\n", missing_values)

# Handle missing values (replace with mean for numeric columns, mode for categorical columns)
sales_data['Sale Quantity'].fillna(sales_data['Sale Quantity'].mean(), inplace=True)
sales_data['Gross Value for single Product (Single Sale)'].fillna(sales_data['Gross Value for single Product (Single Sale)'].mean(), inplace=True)
# Repeat for other relevant columns...

# Identify and handle outliers
# Assume 'Sale Quantity' and 'Gross Value for single Product (Single Sale)' are relevant columns for outlier detection
sales_data = sales_data[(sales_data['Sale Quantity'] <= sales_data['Sale Quantity'].quantile(0.95)) & (sales_data['Gross Value for single Product (Single Sale)'] <= sales_data['Gross Value for single Product (Single Sale)'].quantile(0.95))]

# Ensure consistency in date formats
sales_data['SalesDate'] = pd.to_datetime(sales_data['SalesDate'])

# Feature Engineering
# Create new features: day of the week, month, year
sales_data['DayOfWeek'] = sales_data['SalesDate'].dt.dayofweek
sales_data['Month'] = sales_data['SalesDate'].dt.month
sales_data['Year'] = sales_data['SalesDate'].dt.year

# Convert relevant columns to numeric data types
department_data['Department ID'] = pd.to_numeric(department_data['Department ID'], errors='coerce')
sales_data['Department ID'] = sales_data['Department ID'].astype('int64')
print(department_data['Department ID'].dtype)
print(department_data['Department ID'].isnull().sum())
print(np.isinf(department_data['Department ID']).sum())
department_data = department_data.dropna(subset=['Department ID'])
department_data = department_data[~np.isinf(department_data['Department ID'])]
department_data = department_data.fillna({'Department ID': 0})
department_data.loc[np.isinf(department_data['Department ID']), 'Department ID'] = 0
department_data['Department ID'] = department_data['Department ID'].astype('int64')


# Merge datasets
sales_data = pd.merge(sales_data, category_data, on='Category ID', how='left')
sales_data = pd.merge(sales_data, department_data, on='Department ID', how='left')
sales_data = pd.merge(sales_data, store_data, on='Store ID', how='left')

# Data Transformation
# Apply normalization to relevant numerical features
scaler = StandardScaler()
numerical_features = ['Sale Quantity', 'Gross Value for single Product (Single Sale)', 'Retail Value for single Product (Single Sale)', 'Cost', 'Tax per unit', 'TotalQTY', 'TotalSales', 'Buying price per product']
sales_data[numerical_features] = scaler.fit_transform(sales_data[numerical_features])

# Encode categorical variables using Label Encoding
label_encoder = LabelEncoder()
categorical_features = ['Category Name', 'Department Name', 'Store Name', 'Location']
for feature in categorical_features:
    sales_data[feature] = label_encoder.fit_transform(sales_data[feature])

# Display the cleaned and transformed dataset
sales_data.head()


import pandas as pd
import numpy as np

# Assuming you have already loaded your datasets
# For example:
# department_data = pd.read_csv('Department_Data.csv')

# Check for missing values
missing_values_department = department_data.isnull().sum()
print("Missing Values in Department Data:\n", missing_values_department)

# Check for infinity (considering 'Department ID' is a numeric column)
is_inf_mask = np.isinf(department_data['Department ID'])
inf_values_department = department_data[is_inf_mask]
print("Infinity Values in Department Data:\n", inf_values_department)

# Handle missing values (replace with mean for numeric columns, mode for categorical columns)
# Example: department_data['Department ID'].fillna(department_data['Department ID'].mean(), inplace=True)

# Handle infinity values (replace with a specific value or drop the rows)
# Example: department_data.replace([np.inf, -np.inf], np.nan, inplace=True)
# or: department_data = department_data[~is_inf_mask]

# Print data type and number of missing values for 'Department ID'
print(department_data['Department ID'].dtype)
print(department_data['Department ID'].isnull().sum())



import matplotlib.pyplot as plt
import seaborn as sns

# Descriptive Statistics
descriptive_stats = sales_data.describe()
print("Descriptive Statistics:\n", descriptive_stats)

# Visualizations

# Distribution of Sale Quantity
plt.figure(figsize=(10, 6))
sns.histplot(sales_data['Sale Quantity'], bins=30, kde=True)
plt.title('Distribution of Sale Quantity')
plt.xlabel('Sale Quantity')
plt.ylabel('Frequency')
plt.show()

# Correlation Heatmap
correlation_matrix = sales_data.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Heatmap')
plt.show()

# Boxplot of Sale Quantity by Category Name
plt.figure(figsize=(14, 8))
sns.boxplot(x='Category Name', y='Sale Quantity', data=sales_data)
plt.title('Sale Quantity by Category Name')
plt.xlabel('Category Name')
plt.ylabel('Sale Quantity')
plt.xticks(rotation=45)
plt.show()

# Time-based Analysis
sales_data['YearMonth'] = sales_data['SalesDate'].dt.to_period('M')
monthly_sales = sales_data.groupby('YearMonth')['TotalSales'].sum()

plt.figure(figsize=(14, 6))
monthly_sales.plot(kind='line', marker='o')
plt.title('Monthly Total Sales Over Time')
plt.xlabel('Year-Month')
plt.ylabel('Total Sales')
plt.show()

import pandas as pd
import numpy as np

# Assuming you have already loaded your datasets
# For example:
# department_data = pd.read_csv('Department_Data.csv')

# Check for missing values
missing_values_department = department_data.isnull().sum()
print("Missing Values in Department Data:\n", missing_values_department)

# Check for infinity (considering 'Department ID' is a numeric column)
numeric_columns = department_data.select_dtypes(include=np.number).columns
for col in numeric_columns:
    is_inf_mask = np.isinf(department_data[col])
    inf_values_department = department_data[is_inf_mask]
    print(f"Infinity Values in {col} column:\n", inf_values_department)

# Handle missing values (replace with mean for numeric columns, mode for categorical columns)
# Example: department_data['Department ID'].fillna(department_data['Department ID'].mean(), inplace=True)

# Print data type and number of missing values for 'Department ID'
print(department_data['Department ID'].dtype)
print(department_data['Department ID'].isnull().sum())


Missing Values:
 Department ID                                   0
Category ID                                     0
Store ID                                        0
Sale Quantity                                   0
Gross Value for single Product (Single Sale)    0
Retail Value (Single Sale)                      0
Tax with Revenue                                0
Cost                                            0
Tax per unit                                    0
TotalQTY                                        0
TotalSales                                      0
SalesDate                                       0
Sales Time                                      0
Buying price per product                        0
dtype: int64
float64
1
0


KeyError: "['Retail Value for single Product (Single Sale)', 'TotalQTY'] not in index"