In [None]:
import os
import math
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the features table from the working directory and display it.
# Later cells read its Store, Date, CPI, Unemployment, Fuel_Price,
# MarkDown1-5, IsHoliday and StoreType columns.
df = pd.read_csv('features.csv')
df

In [None]:
# List the column names of the features table.
df.columns

In [None]:
# Print each column's dtype and non-null count (shows where values are missing).
df.info()

In [None]:
# Summary statistics for the numeric columns. `describe` has to be called:
# a bare `df.describe` only displays the bound method, not the statistics.
df.describe()


In [None]:
# Use seaborn's white-grid theme for this and all following figures.
sns.set(style="whitegrid")

# Plot histograms for numerical variables: DataFrame.hist draws one panel per
# numeric column, here with 30 bins each, on a single 15x14 inch figure.
df.hist(bins=30, figsize=(15, 14))
plt.suptitle('Histograms of Numerical Variables', fontsize=20)
plt.show()

In [None]:
# Share of rows per IsHoliday value, drawn as a pie chart labelled with
# two-decimal percentages.
holiday_counts = df.groupby('IsHoliday').size()
holiday_counts.plot.pie(autopct='%.2f')

Unemployment rate vs. time

In [None]:
# Convert Date to datetime so it is plotted on a real time axis.
df['Date'] = pd.to_datetime(df['Date'])

# Mean unemployment over all rows that share a date.
df_grouped = df.groupby('Date', as_index=False)['Unemployment'].mean()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_grouped['Date'], df_grouped['Unemployment'], marker='o', linestyle='-')
ax.set_title('Unemployment Rate Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Unemployment Rate (%)')
ax.grid(True)
plt.show()


Consumer Price Index (CPI) vs. time

In [None]:
# Mean CPI over all rows that share a date.
df_grouped_cpi = df.groupby('Date', as_index=False)['CPI'].mean()

# Draw the series on its own figure using the explicit Axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_grouped_cpi['Date'], df_grouped_cpi['CPI'], marker='o', linestyle='-')

# Title and axis labels
ax.set_title('Consumer Price Index (CPI) Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('CPI')

# Grid on, then render
ax.grid(True)
plt.show()

Fuel price vs. time

In [None]:
# Mean fuel price over all rows that share a date.
df_grouped_fuel_price = df.groupby('Date', as_index=False)['Fuel_Price'].mean()

# Draw the series on its own figure using the explicit Axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    df_grouped_fuel_price['Date'],
    df_grouped_fuel_price['Fuel_Price'],
    marker='o',
    linestyle='-',
)

# Title and axis labels
ax.set_title('Fuel Price Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Fuel Price')

# Grid on, then render
ax.grid(True)
plt.show()

In [None]:
# Impute missing CPI and Unemployment values with the column median.
# The filled Series is assigned back to the frame: calling
# fillna(..., inplace=True) on df['col'] is chained assignment, which emits a
# FutureWarning in pandas 2.x and no longer updates df under Copy-on-Write.
df['CPI'] = df['CPI'].fillna(df['CPI'].median())
df['Unemployment'] = df['Unemployment'].fillna(df['Unemployment'].median())

In [None]:
# Clean MarkDown1..MarkDown5: negative values become 0 and missing values are
# replaced by 0.
# clip(lower=0) is the vectorized form of the per-element lambda and leaves
# NaN untouched, so fillna(0) still handles the gaps. The result is assigned
# back to the frame because fillna(inplace=True) on df[col] is chained
# assignment and does not reliably modify df (no-op under Copy-on-Write).
for i in range(1, 6):
    markdown_col = "MarkDown" + str(i)
    df[markdown_col] = df[markdown_col].clip(lower=0).fillna(0)

In [None]:
# Display the features table again to inspect it after the imputation and
# MarkDown clean-up cells.
df

In [None]:
from sklearn.preprocessing import LabelEncoder
# One encoder instance is refit for each column in turn, so after this cell
# `le` only remembers the classes of the last column (StoreType).
le = LabelEncoder()

for categorical_col in ('IsHoliday', 'StoreType'):
    df[categorical_col] = le.fit_transform(df[categorical_col])
df

In [None]:
# Number of distinct values in the encoded IsHoliday column.
df['IsHoliday'].nunique()

In [None]:
# Number of distinct values in the encoded StoreType column.
df['StoreType'].nunique()

In [None]:
# Load the training table from the working directory and display it.
# It is joined to the features table on Store and Date further down.
df_train = pd.read_csv('train.csv')
df_train

In [438]:
# Bring the join keys of both tables into the same shape before merging:
#   Store -> string with leading/trailing whitespace removed
#   Date  -> datetime
for frame in (df, df_train):
    frame['Store'] = frame['Store'].astype(str).str.strip()
    frame['Date'] = pd.to_datetime(frame['Date'])


In [None]:
# Inner join: keep only the (Store, Date) pairs present in both tables.
# Columns that exist in both frames get pandas' default _x / _y suffixes
# (_x from the features table, _y from the training table).
final_df = df.merge(df_train, how='inner', on=['Store', 'Date'])
final_df

In [None]:
# Check that the holiday flag from the features table (IsHoliday_x) agrees with
# the one from the training table (IsHoliday_y) on every row.
# Series.isin() only tests membership in the set of values of the other
# column, so it can report True even when individual rows disagree; compare
# the two columns element-wise instead.
# NOTE(review): IsHoliday_x was label-encoded to integers earlier, while
# IsHoliday_y is presumably still boolean; 0/1 compare equal to False/True,
# but confirm the dtype of IsHoliday_y.
final_df['IsHoliday_x'].eq(final_df['IsHoliday_y']).all()


In [None]:
# Summary statistics of Weekly_Sales for every (Store, Dept) pair,
# flattened back into ordinary columns.
agg_data = (
    final_df
    .groupby(['Store', 'Dept'])['Weekly_Sales']
    .agg(['max', 'min', 'mean', 'median', 'std'])
    .reset_index()
)
agg_data.head()

In [442]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Re-encode the categorical columns of the merged table in place.
final_df['IsHoliday_x'] = le.fit_transform(final_df['IsHoliday_x'])
# Write the result back to 'StoreType'. The previous key 'Storeype' was a typo
# that added a stray duplicate column, which then became a redundant model
# feature, while leaving 'StoreType' itself untouched.
final_df['StoreType'] = le.fit_transform(final_df['StoreType'])

Plotting Stores by Weekly Sales

In [None]:
# One bar per store; bar height is seaborn's default estimate (the mean) of
# Weekly_Sales for that store.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=final_df, x='Store', y='Weekly_Sales', ax=ax)
plt.show()

In [None]:
# Store was cast to str for the merge; cast it back to int.
final_df['Store'] = final_df['Store'].astype(int)

# Sum Weekly_Sales per store and order the stores from highest to lowest total
total_sales = (
    final_df
    .groupby('Store')['Weekly_Sales']
    .sum()
    .reset_index()
    .sort_values(by='Weekly_Sales', ascending=False)
)

total_sales

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# This cell used X_train / y_train before any cell had created them, so it
# raised NameError on a fresh kernel (Restart & Run All). Build the split here
# with the same columns, test_size and random_state as the later
# linear-regression cell, so both cells train on identical data.
features = final_df.drop(columns=['Weekly_Sales', 'Date'])
target = final_df['Weekly_Sales']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)


In [None]:
# Optimized visualization: randomly sample a subset of rows for the scatter plots.
# The sample size is capped at the number of available rows, because
# DataFrame.sample raises ValueError when n exceeds the frame length
# (sampling is without replacement by default).
sample_size = min(10000, len(final_df))
sampled_df = final_df.sample(n=sample_size, random_state=42)  # fixed seed -> same sample on every run

# Pairplot for key features vs Weekly_Sales (sampled data); kind='reg' overlays
# a fitted regression line on each scatter panel.
key_features = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
sns.pairplot(sampled_df, x_vars=key_features, y_vars='Weekly_Sales', height=4, aspect=0.8, kind='reg')
plt.suptitle('Relationships Between Key Features and Weekly Sales (Sampled Data)', y=1.02)
plt.show()

In [None]:
# Function to plot weekly sales trend for a given store
def plot_weekly_sales(store_id):
    """Plot the total weekly sales of one store over time.

    Reads the notebook-level ``final_df``. That frame has a ``Dept`` column,
    so a store can have several rows per date; the rows are summed per date so
    that each date contributes a single point instead of the line zigzagging
    through every department's value.

    Parameters
    ----------
    store_id
        Value compared with ``==`` against ``final_df['Store']``. It must have
        the same dtype as that column (int once the column has been cast back
        from str), otherwise no rows match and the plot is empty.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Filter data for the selected store
    store_data = final_df[final_df['Store'] == store_id]

    # Total sales across all rows of the store for each date; groupby returns
    # the dates in ascending order, so no separate sort is needed.
    weekly_totals = store_data.groupby('Date')['Weekly_Sales'].sum()

    # Line plot of the per-date totals over time
    plt.figure(figsize=(12, 6))
    plt.plot(weekly_totals.index, weekly_totals.values, marker='o', linestyle='-', label=f'Store {store_id}')
    plt.title(f'Weekly Sales Trend Over Time (Store {store_id})')
    plt.xlabel('Date')
    plt.ylabel('Weekly Sales')
    plt.grid()
    plt.legend()
    plt.tight_layout()
    plt.show()

# Plot the sales trend for Stores 1, 2 and 3, one figure each
for store_id in (1, 2, 3):
    plot_weekly_sales(store_id=store_id)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Predictors: every column of final_df except the target and the raw date
features = final_df.drop(['Weekly_Sales', 'Date'], axis=1)

# Response variable
target = final_df['Weekly_Sales']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42, test_size=0.2
)

# Fit the linear model (fit() returns the estimator itself) and predict on
# the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report mean squared error, then R^2, one value per line
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(mse, r2, sep='\n')

Linear Regression does not yield good results because the data does not exhibit a linear relationship.

Random Forest Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Refit the linear model on the current training split
# (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Root-mean-squared error on the held-out split
y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
print(f'RMSE: {rmse}')


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Work on the merged table (alias only, no copy is made)
data = final_df

# This model uses just three predictors
features = data[['CPI', 'Fuel_Price', 'Unemployment']]
target = data['Weekly_Sales']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42, test_size=0.2
)

# Build a 100-tree forest and train it (fit() returns the estimator itself)
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Report the mean squared error of those predictions
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


In [None]:
import matplotlib.pyplot as plt

# Overlay the held-out targets and the model's predictions, both plotted
# against their position in the test split
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Sales')
ax.plot(y_pred, label='Predicted Sales')
ax.set_title('Random Forest Regression: Actual vs Predicted Sales')
ax.set_xlabel('Index')
ax.set_ylabel('Sales')
ax.legend()
plt.show()
