<a href="https://colab.research.google.com/github/habrev/Rossmann-Pharmaceuticals/blob/task-2/notebooks/prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [1]:
import pandas as pd
import matplotlib as plt

# Preprocessing

# Load the test data

In [8]:
# Read test.csv (path is relative to the notebook's working directory) into `df`;
# the preview printed by the next cell shows the columns that were loaded.
df = pd.read_csv('test.csv')

# One-hot encoding for non-numerical columns

In [9]:
from sklearn.preprocessing import LabelEncoder

# Preview the raw frame so the effect of the encoding can be compared afterwards.
print("Original Data:", df.head(), sep="\n")

# Destination of the encoded frame.
output_path = 'one_hot_encoded_test.csv'

# Every object/category column is treated as categorical. Casting to str first
# gives each of those columns one uniform value type before dummies are built.
# This assignment modifies `df` itself.
non_numerical_columns = df.select_dtypes(include=['object', 'category']).columns
df[non_numerical_columns] = df[non_numerical_columns].astype(str)

# Replace each categorical column with one indicator column per distinct value.
# NOTE(review): `Date` is loaded as text, so it is expanded into one indicator
# column per calendar date — confirm that this is the feature layout the model
# was trained with.
df_encoded = pd.get_dummies(df, columns=non_numerical_columns)

# Persist the encoded frame without the index column and report where it went.
df_encoded.to_csv(output_path, index=False)
print(f"\nEncoded file saved to {output_path}")


Original Data:
   Id  Store  DayOfWeek        Date  Open  Promo StateHoliday  SchoolHoliday
0   1      1          4  2015-09-17   1.0      1            0              0
1   2      3          4  2015-09-17   1.0      1            0              0
2   3      7          4  2015-09-17   1.0      1            0              0
3   4      8          4  2015-09-17   1.0      1            0              0
4   5      9          4  2015-09-17   1.0      1            0              0

Encoded file saved to one_hot_encoded_test.csv


In [10]:
# Reload the one-hot encoded CSV from disk. This rebinds `df`, so every later
# cell that uses `df` operates on the encoded frame rather than the raw one.
df = pd.read_csv('one_hot_encoded_test.csv')

In [11]:
from sklearn.preprocessing import StandardScaler
# Destination of the standardised frame.
scaled_output_path = 'scaled_dataset_test.csv'

# Only int64/float64 columns are standardised; columns of any other dtype
# (for example boolean indicator columns) are carried over unchanged.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

# NOTE(review): the scaler is fitted on this frame itself, and every int64/float64
# column is included — identifier-like columns such as `Id` and `Store` are
# standardised too. Confirm this matches how the training data was scaled.
scaler = StandardScaler()

# Write the scaled values into a copy so `df` keeps its unscaled values.
df_scaled = df.copy()
df_scaled[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Persist the result without the index column.
df_scaled.to_csv(scaled_output_path, index=False)

# Report the output location and show the first rows of the scaled frame.
print(f"Scaled Data saved to {scaled_output_path}", df_scaled.head(), sep="\n")

Scaled Data saved to scaled_dataset_test.csv
         Id     Store  DayOfWeek      Open     Promo  SchoolHoliday  \
0 -1.732009 -1.732596   0.010337  0.412939  1.235442      -0.892695   
1 -1.731924 -1.726351   0.010337  0.412939  1.235442      -0.892695   
2 -1.731840 -1.713862   0.010337  0.412939  1.235442      -0.892695   
3 -1.731756 -1.710739   0.010337  0.412939  1.235442      -0.892695   
4 -1.731671 -1.707617   0.010337  0.412939  1.235442      -0.892695   

   Date_2015-08-01  Date_2015-08-02  Date_2015-08-03  Date_2015-08-04  ...  \
0            False            False            False            False  ...   
1            False            False            False            False  ...   
2            False            False            False            False  ...   
3            False            False            False            False  ...   
4            False            False            False            False  ...   

   Date_2015-09-10  Date_2015-09-11  Date_2015-09-12  Date_

# Model building with sklearn pipelines

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Training data for the model.
# NOTE(review): no cell visible in this notebook writes 'scaled_dataset.csv'
# (the preprocessing cells produce 'scaled_dataset_test.csv'), so this file
# presumably comes from a separate preprocessing run — confirm it is present.
file_path = "scaled_dataset.csv"
df = pd.read_csv(file_path)

# `Sales` is the regression target; every other column in the file is a feature.
target = 'Sales'
features = [name for name in df.columns if name != target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Two-step pipeline: standardise the features, then fit a random forest.
# NOTE(review): the input file name suggests the data is already scaled, in
# which case the 'scaler' step standardises it a second time — verify intent.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(random_state=42)),
])
pipeline.fit(X_train, y_train)

# Fit quality on the rows the model was trained on.
y_train_pred = pipeline.predict(X_train)
mse_train, r2_train = (
    mean_squared_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)
print(f"Training Performance:\nMean Squared Error: {mse_train:.4f}\nR^2 Score: {r2_train:.4f}")

# Same metrics on the held-out rows.
y_test_pred = pipeline.predict(X_test)
mse_test, r2_test = (
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)
print(f"\nTesting Performance:\nMean Squared Error: {mse_test:.4f}\nR^2 Score: {r2_test:.4f}")

# Serialise the whole fitted pipeline (scaler + forest) so it can be reloaded later.
pipeline_path = 'random_forest_pipeline.pkl'
joblib.dump(pipeline, pipeline_path)
print(f"\nPipeline saved to {pipeline_path}")

# Rank the features by the forest's importance scores, largest first.
# The importances are aligned with `features` by position.
feature_importances = pipeline.named_steps['model'].feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance_df)
