In [None]:
import pandas as pd

# Read the Online Retail workbook into `df` and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# in the environment where this notebook is run.
RAW_XLSX_PATH = "/bin/r/Online Retail (2).xlsx"
df = pd.read_excel(RAW_XLSX_PATH)
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Clean the data: keep rows that have a CustomerID and strictly positive
# Quantity and UnitPrice.
# The trailing .copy() yields an independent frame, so the column assignments
# below do not write into a filtered slice (avoids SettingWithCopyWarning).
df = (
    df.dropna(subset=['CustomerID'])
      .loc[lambda d: (d['Quantity'] > 0) & (d['UnitPrice'] > 0)]
      .copy()
)

# Add new features
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Hour'] = df['InvoiceDate'].dt.hour          # hour of day taken from InvoiceDate
df['Day'] = df['InvoiceDate'].dt.day_name()     # weekday name, e.g. 'Monday'
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# NOTE(review): both plots count rows of `df`. If one invoice can span several
# rows, these are line-item counts rather than order counts -- confirm, and
# de-duplicate on the invoice identifier first if true order counts are wanted.

# 📊 Plot 1: Orders by Day
plt.figure(figsize=(8, 5))
sns.countplot(x='Day', data=df, order=[
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.title("🗓️ Orders by Day of the Week")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

# 📊 Plot 2: Orders by Hour
plt.figure(figsize=(8, 5))
# Hour is an integer in 0..23. discrete=True uses unit-width bins centred on
# each integer, and binrange=(0, 23) spans the whole day. The previous bins=24
# split only the observed min..max range into 24 fractional-width bins, which
# do not line up with whole hours.
sns.histplot(df['Hour'], discrete=True, binrange=(0, 23), kde=False)
plt.title("🕒 Orders by Hour of the Day")
plt.xlabel("Hour")
plt.ylabel("Number of Orders")
plt.grid(True)
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Binary target: 1 when a row falls in a "high order hour", 0 otherwise.
# An hour counts as "high" when its number of rows in `df` is strictly above
# the median of the per-hour row counts.
hour_counts = df['Hour'].value_counts()
median_orders = hour_counts.median()

# Lookup table: hour -> 1 (above median) or 0 (at or below median)
high_order_hours = hour_counts.gt(median_orders).astype(int)

# NOTE(review): the target below is computed directly from the only feature
# (Hour), so the model can recover it exactly; the reported scores measure
# that lookup rather than any predictive skill.
X = df[['Hour']]
y = df['Hour'].map(high_order_hours)

# Hold out 30% of the rows for evaluation; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a random forest on the training rows.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


In [None]:
import pandas as pd
from datetime import datetime

# Hand-written five-row sample with the same columns as the Online Retail data.
# Note: this rebinds `df` to the sample rows defined here.
invoice_minutes = (26, 28, 34, 35, 45)  # every sample invoice is on 2010-12-01 at 08:xx

data = {
    'InvoiceNo': ['536365', '536366', '536367', '536368', '536369'],
    'StockCode': ['85123A', '71053', '84406B', '84029G', '84029E'],
    'Description': [
        'WHITE HANGING HEART T-LIGHT HOLDER',
        'WHITE METAL LANTERN',
        'CREAM CUPID HEARTS COAT HANGER',
        'KNITTED UNION FLAG HOT WATER BOTTLE',
        'RED WOOLLY HOTTIE WHITE HEART.',
    ],
    'Quantity': [6, 6, 8, 6, 6],
    'InvoiceDate': [datetime(2010, 12, 1, 8, minute) for minute in invoice_minutes],
    'UnitPrice': [2.55, 3.39, 2.75, 3.39, 3.39],
    'CustomerID': [17850] * 2 + [13047] * 3,
    'Country': ['United Kingdom'] * 5,
}

# Build the frame and append the derived columns in one pass
# (column order: Hour, Day, TotalPrice).
df = pd.DataFrame(data).assign(
    Hour=lambda frame: frame['InvoiceDate'].dt.hour,
    Day=lambda frame: frame['InvoiceDate'].dt.day_name(),
    TotalPrice=lambda frame: frame['Quantity'] * frame['UnitPrice'],
)

# Write the sample frame to CSV in the working directory, without the index.
df.to_csv('cleaned_online_retail.csv', index=False)

print("File 'cleaned_online_retail.csv' has been created in your Colab environment.")


In [None]:
import pandas as pd
from datetime import datetime
from google.colab import files

# Same five-row sample as the previous cell, rebuilt here so the CSV can be
# offered as a browser download. Note: this rebinds `df` to the sample rows.
sample_day = (2010, 12, 1)  # calendar date shared by every sample invoice

data = {
    'InvoiceNo': ['536365', '536366', '536367', '536368', '536369'],
    'StockCode': ['85123A', '71053', '84406B', '84029G', '84029E'],
    'Description': [
        'WHITE HANGING HEART T-LIGHT HOLDER',
        'WHITE METAL LANTERN',
        'CREAM CUPID HEARTS COAT HANGER',
        'KNITTED UNION FLAG HOT WATER BOTTLE',
        'RED WOOLLY HOTTIE WHITE HEART.',
    ],
    'Quantity': [6, 6, 8, 6, 6],
    'InvoiceDate': [
        datetime(*sample_day, 8, minute) for minute in (26, 28, 34, 35, 45)
    ],
    'UnitPrice': [2.55, 3.39, 2.75, 3.39, 3.39],
    'CustomerID': [17850] * 2 + [13047] * 3,
    'Country': ['United Kingdom'] * 5,
}

# Build the frame and append the derived columns (Hour, Day, TotalPrice).
df = pd.DataFrame(data).assign(
    Hour=lambda frame: frame['InvoiceDate'].dt.hour,
    Day=lambda frame: frame['InvoiceDate'].dt.day_name(),
    TotalPrice=lambda frame: frame['Quantity'] * frame['UnitPrice'],
)

# Write the frame to CSV without the index column.
filename = 'cleaned_online_retail.csv'
df.to_csv(filename, index=False)

# Hand the written file to the Colab download helper imported above.
files.download(filename)


# Task
Prepare the data from "/content/cleaned_online_retail.csv" for import into Power BI Desktop by performing necessary data cleaning and transformations to avoid errors.

## Load the data

### Subtask:
Load the data from the `cleaned_online_retail.csv` file into a pandas DataFrame.


**Reasoning**:
Load the cleaned data from the CSV file into a pandas DataFrame.



In [None]:
# Read 'cleaned_online_retail.csv' (relative to the working directory) into
# `df_cleaned`. Earlier cells in this notebook write that file from the
# five-row sample frame.
df_cleaned = pd.read_csv('cleaned_online_retail.csv')

## Check data types

### Subtask:
Examine the data types of each column and convert them to appropriate types if necessary.


**Reasoning**:
Display the data types of each column to identify columns that might need type conversion for Power BI.



In [None]:
# Print the frame summary (columns, non-null counts, dtypes) to see which
# columns need a type conversion.
df_cleaned.info()

**Reasoning**:
Convert the 'InvoiceDate' column to datetime objects as it is currently of type object and should be a datetime type for proper handling in Power BI. The other columns seem to have appropriate data types.



In [None]:
# Parse the InvoiceDate values loaded from CSV into pandas datetimes, store
# them back on the frame, then re-print the summary to confirm the new dtype.
parsed_invoice_dates = pd.to_datetime(df_cleaned['InvoiceDate'])
df_cleaned['InvoiceDate'] = parsed_invoice_dates
df_cleaned.info()

## Handle missing values

### Subtask:
Identify and handle any missing values in the DataFrame.


**Reasoning**:
Check for missing values in each column of the DataFrame.



In [None]:
# Count missing entries per column; this cell only reports, it changes nothing.
missing_per_column = df_cleaned.isna().sum()
print("Missing values before handling:")
print(missing_per_column)

## Check for duplicates

### Subtask:
Identify and remove any duplicate rows if necessary.


**Reasoning**:
Check for duplicate rows in the DataFrame and remove them if found, then print the row counts before and after removal.



In [None]:
# Drop fully duplicated rows and report the row count before and after.
# Rebinding `df_cleaned` to the returned frame (instead of inplace=True) keeps
# the step chainable and avoids mutating a frame other cells may reference.
rows_before_dedup = len(df_cleaned)
print("Number of rows before removing duplicates:", rows_before_dedup)
df_cleaned = df_cleaned.drop_duplicates()
print("Number of rows after removing duplicates:", len(df_cleaned))

## Data consistency checks

### Subtask:
Perform any necessary checks to ensure data consistency (e.g., check for negative quantities or prices if not already handled).


**Reasoning**:
Check for and remove rows with non-positive Quantity or UnitPrice and report the number of rows before and after the operation.



In [None]:
# Keep only rows whose Quantity and UnitPrice are both strictly positive,
# printing the row count on either side of the filter.
rows_before_filter = len(df_cleaned)
print("Number of rows before checking Quantity and UnitPrice:", rows_before_filter)

has_positive_quantity = df_cleaned['Quantity'].gt(0)
has_positive_price = df_cleaned['UnitPrice'].gt(0)
df_cleaned = df_cleaned[has_positive_quantity & has_positive_price]

print("Number of rows after checking Quantity and UnitPrice:", len(df_cleaned))

## Save cleaned data

### Subtask:
Save the cleaned and prepared data to a new CSV file that is ready for import into Power BI.


**Reasoning**:
Save the cleaned DataFrame to a CSV file for Power BI.



In [None]:
# Write `df_cleaned` to a CSV in the working directory for import into
# Power BI; index=False leaves the DataFrame index out of the file.
filename = 'cleaned_online_retail_for_powerbi.csv'
df_cleaned.to_csv(filename, index=False)

## Summary:

### Data Analysis Key Findings

*   The initial data loaded from `cleaned_online_retail.csv` contained an `InvoiceDate` column with an 'object' data type, which was successfully converted to `datetime64[ns]`.
*   There were no missing values found in the dataset.
*   No duplicate rows were found in the dataset, so none were removed.
*   All entries in the `Quantity` and `UnitPrice` columns already contained positive values, and no rows were removed during the consistency check for these columns.

### Insights or Next Steps

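*   The data is now in a clean and appropriate format (`cleaned_online_retail_for_powerbi.csv`) for direct import and analysis in Power BI Desktop. Note that this file is derived from the five-row sample written to `cleaned_online_retail.csv` earlier in the notebook, not from the full Online Retail workbook.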
*   Further analysis in Power BI can now focus on creating visualizations and reports without needing significant data cleaning steps within Power BI itself.


## Save cleaned data

### Subtask:
Save the cleaned and prepared data to a new CSV file that is ready for import into Power BI.

**Reasoning**:
Save the cleaned DataFrame to a CSV file for Power BI.

In [None]:
# Write `df_cleaned` to CSV without the index column. This uses the same
# file name as the earlier save cell, so it rewrites that file.
filename = 'cleaned_online_retail_for_powerbi.csv'
df_cleaned.to_csv(filename, index=False)