In [1]:
# Loading data with Pandas
import pandas as pd
from pathlib import Path

# Absolute path to the mock-data CSV
file_path = Path("/home/charles/github/repos/learn-pandas/data/mock_data.csv")

# Read the whole CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top of the table (5 rows, which is also head()'s default)
print(data.head(n=5))

         Date  Product  Sales  Quantity
0  1974-11-02      car     35        66
1  2017-11-11      tax     77        35
2  2001-08-17    small     86        25
3  1984-05-26   reveal     54        39
4  1987-05-21  society     28        70


In [4]:
# Cleaning and transforming data with Pandas
import pandas as pd
from pathlib import Path

# Absolute path to the mock-data CSV
file_path = Path("/home/charles/github/repos/learn-pandas/data/mock_data.csv")

# Fail early, with an explicit message, when the CSV is absent
if not file_path.exists():
    raise FileNotFoundError(f"File not found: {file_path}")

# Read the CSV into a DataFrame
data = pd.read_csv(file_path)

# Guard clause: everything below needs a 'Date' column
if "Date" not in data.columns:
    raise KeyError("'Date' column not found in the dataset.")

# Parse the dates; entries that cannot be parsed become NaT instead of raising
data["Date"] = pd.to_datetime(data["Date"], errors="coerce")

# Discard every row with a missing value in any column (this includes NaT dates)
data = data.dropna(how="any")

# Keep only rows strictly inside the window (both endpoints are excluded)
in_window = (data["Date"] > "2019-03-22") & (data["Date"] < "2019-10-26")
filtered_data = data.loc[in_window].copy()  # independent copy, safe to modify

# Guard clause: 'Total Sales' needs both of its input columns
if not {"Sales", "Quantity"}.issubset(filtered_data.columns):
    raise KeyError("One or both of the columns 'Sales' and 'Quantity' are missing.")

# Total Sales = Sales * Quantity, computed row by row
filtered_data.loc[:, "Total Sales"] = (
    filtered_data["Sales"] * filtered_data["Quantity"]
)

# Show the first rows so the transformations can be checked by eye
print(filtered_data.head())

         Date    Product  Sales  Quantity  Total Sales
77 2019-07-01  community     29        40         1160


In [7]:
import pandas as pd
from pathlib import Path

# Absolute path to the mock-data CSV
file_path = Path("/home/charles/github/repos/learn-pandas/data/mock_data.csv")

# Check that the file exists before reading
if not file_path.exists():
    raise FileNotFoundError(f"File not found: {file_path}")

# Load the data
data = pd.read_csv(file_path)

# Derive 'Total Sales' (Sales * Quantity) unless the CSV already provides it.
# If the inputs are missing, fail here with an explicit message rather than
# letting the groupby below raise an opaque "Column not found" KeyError.
if "Total Sales" not in data.columns:
    if "Sales" not in data.columns or "Quantity" not in data.columns:
        raise KeyError(
            "One or both of the columns 'Sales' and 'Quantity' are missing."
        )
    data["Total Sales"] = data["Sales"] * data["Quantity"]

# Group the data by 'Product' and sum 'Total Sales' for each product.
# as_index=False keeps 'Product' as a regular column in the result.
product_sales = data.groupby("Product", as_index=False)["Total Sales"].sum()

# Sort the grouped data by 'Total Sales', largest first
sorted_product_sales = product_sales.sort_values(by="Total Sales", ascending=False)

# Display the sorted results
print(sorted_product_sales)

       Product  Total Sales
3       around         7722
59        rest         6880
57     receive         6507
28        goal         6375
86        vote         6318
..         ...          ...
87        week          760
88       while          713
53  production          644
8     building          594
62     section          540

[93 rows x 2 columns]


In [1]:
import pandas as pd
from pathlib import Path
import plotly.express as px

# Absolute path to the mock-data CSV
file_path = Path("/home/charles/github/repos/learn-pandas/data/mock_data.csv")

# Check that the file exists before reading (same guard as the earlier cells)
if not file_path.exists():
    raise FileNotFoundError(f"File not found: {file_path}")

# Load the data
data = pd.read_csv(file_path)

# Derive 'Total Sales' (Sales * Quantity) unless the CSV already provides it.
# If the inputs are missing, fail here with an explicit message rather than
# letting the groupby below raise an opaque "Column not found" KeyError.
if "Total Sales" not in data.columns:
    if "Sales" not in data.columns or "Quantity" not in data.columns:
        raise KeyError(
            "One or both of the columns 'Sales' and 'Quantity' are missing."
        )
    data["Total Sales"] = data["Sales"] * data["Quantity"]

# Group the data by 'Product' and sum 'Total Sales' for each product.
# as_index=False keeps 'Product' as a regular column so it can be used as x below.
product_sales = data.groupby("Product", as_index=False)["Total Sales"].sum()

# Create a bar chart with one bar per product
fig = px.bar(
    product_sales, x="Product", y="Total Sales", title="Total Sales by Product"
)

# Show the chart
fig.show()

In [5]:
import pandas as pd
from pathlib import Path
import plotly.express as px

# Absolute path to the mock-data CSV
file_path = Path("/home/charles/github/repos/learn-pandas/data/mock_data.csv")

# Check that the file exists before reading (same guard as the earlier cells)
if not file_path.exists():
    raise FileNotFoundError(f"File not found: {file_path}")

# Load the data
data = pd.read_csv(file_path)

# The filter and the resample below both need a 'Date' column
if "Date" not in data.columns:
    raise KeyError("'Date' column not found in the dataset.")

# Convert 'Date' to datetime. Unparseable entries become NaT (as in the
# cleaning cell) instead of aborting the whole cell; NaT compares False
# against both bounds below, so such rows are excluded by the filter.
data["Date"] = pd.to_datetime(data["Date"], errors="coerce")

# Keep only rows strictly inside the window (both endpoints are excluded)
filtered_data = data[
    (data["Date"] > "2019-03-22") & (data["Date"] < "2019-10-26")
].copy()

# Derive 'Total Sales' (Sales * Quantity) unless the CSV already provides it.
# If the inputs are missing, fail here with an explicit message rather than
# letting the column selection after resample() raise an opaque KeyError.
if "Total Sales" not in filtered_data.columns:
    if (
        "Sales" not in filtered_data.columns
        or "Quantity" not in filtered_data.columns
    ):
        raise KeyError(
            "One or both of the columns 'Sales' and 'Quantity' are missing."
        )
    filtered_data["Total Sales"] = filtered_data["Sales"] * filtered_data["Quantity"]

# Group data by month. resample() needs a datetime index, so 'Date' becomes
# the index; rebinding the name avoids inplace=True but leaves the same state.
filtered_data = filtered_data.set_index("Date")
# 'ME' is the month-end alias (pandas >= 2.2; older releases spell it 'M').
# reset_index() turns the month-end index back into a 'Date' column for plotting.
monthly_sales = filtered_data.resample("ME")["Total Sales"].sum().reset_index()

# Create a line chart of the monthly totals
fig = px.line(monthly_sales, x="Date", y="Total Sales", title="Monthly Sales Trend")

# Show the chart
fig.show()