In [18]:
import pandas as pd

# Read the Superstore CSV (expected to have been extracted into /content/
# already) using latin1 rather than pandas' default encoding. Read failures
# are reported as a short message instead of a traceback.
csv_path = '/content/Sample - Superstore.csv'
try:
    df = pd.read_csv(csv_path, encoding='latin1')
except FileNotFoundError:
    print("Error: Sample - Superstore.csv not found. Please check the file path.")
except UnicodeDecodeError:
    print("Error: Could not decode the file with latin1 encoding. You might need to try a different encoding.")
else:
    # Only reached when the read succeeded: confirm and preview the first rows.
    print("Data loaded successfully!")
    preview = df.head()
    display(preview)

Data loaded successfully!


Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [16]:
import zipfile

# Extract every member of the uploaded archive into the /content/ directory.
archive_path = '/content/archive.zip'
extract_dir = '/content/'
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

print("File unzipped successfully!")

File unzipped successfully!


In [19]:
# Schema overview: the dtype pandas assigned to each column.
column_types = df.dtypes
print("Data types:", column_types, sep="\n")

# Completeness check: count of null entries per column.
null_counts = df.isnull().sum()
print("\nMissing values:", null_counts, sep="\n")

Data types:
Row ID             int64
Order ID          object
Order Date        object
Ship Date         object
Ship Mode         object
Customer ID       object
Customer Name     object
Segment           object
Country           object
City              object
State             object
Postal Code        int64
Region            object
Product ID        object
Category          object
Sub-Category      object
Product Name      object
Sales            float64
Quantity           int64
Discount         float64
Profit           float64
dtype: object

Missing values:
Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
Quantity         0
Discount         0
Profit           0
dtype: int64


In [20]:
# Convert the date columns from strings to datetime64.
# The raw values are month/day/year (e.g. "11/8/2016", and "6/12/2016" shipping
# on "6/16/2016"), so the format is stated explicitly: an ambiguous date can
# never be read day-first, and a value that does not match raises an error
# instead of being guessed at.
DATE_FORMAT = '%m/%d/%Y'
for date_col in ['Order Date', 'Ship Date']:
    df[date_col] = pd.to_datetime(df[date_col], format=DATE_FORMAT)

print("Data types after conversion:")
print(df[['Order Date', 'Ship Date']].dtypes)

Data types after conversion:
Order Date    datetime64[ns]
Ship Date     datetime64[ns]
dtype: object


In [22]:
import plotly.express as px

# Sum sales into month-end ('ME') bins keyed on the order date, then turn the
# date index back into a column so the result can be plotted and reused.
orders_by_date = df.set_index('Order Date')
monthly_totals = orders_by_date.resample('ME')['Sales'].sum()
sales_trend = monthly_totals.reset_index()

# Interactive line chart of the monthly totals.
fig = px.line(
    sales_trend,
    x='Order Date',
    y='Sales',
    title='Monthly Sales Trend',
)
fig.show()

In [24]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

# Fit a straight-line trend to the monthly sales totals aggregated earlier.
# Each month receives a 0-based ordinal, which is the model's only feature.
n_months = len(sales_trend)
sales_trend['Month_Num'] = np.arange(n_months)

# Feature matrix (kept as a one-column DataFrame) and target series.
feature_cols = ['Month_Num']
X = sales_trend[feature_cols]
y = sales_trend['Sales']

# Ordinary least-squares fit; fit() returns the estimator itself.
model = LinearRegression().fit(X, y)

# The month right after the observed range has ordinal len(sales_trend).
# The query is built as a DataFrame with the same column name used in training.
next_month_num = len(sales_trend)
next_month_features = pd.DataFrame({'Month_Num': [next_month_num]})
next_month_sales_prediction = model.predict(next_month_features)

print(f"Predicted sales for the next month: ${next_month_sales_prediction[0]:,.2f}")

Predicted sales for the next month: $69,957.54
