<a href="https://colab.research.google.com/github/kshitijagarwal183/Codsoft/blob/main/SALES%20PREDICTION%20USING%20PYTHON%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Linear regression model

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
file_path = 'advertising.csv'  # Update with the correct path
data = pd.read_csv(file_path)

# Split the data into features (X) and target (y)
X = data[['TV', 'Radio', 'Newspaper']]
y = data['Sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the linear regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

# Select a few rows from the test set for prediction
sample_data = X_test.head()
sample_actual_sales = y_test.head()

# Predict sales using the trained model
sample_predictions = model.predict(sample_data)

# Create a DataFrame to compare actual sales and predicted sales
comparison_df = pd.DataFrame({
    'TV': sample_data['TV'],
    'Radio': sample_data['Radio'],
    'Newspaper': sample_data['Newspaper'],
    'Actual Sales': sample_actual_sales,
    'Predicted Sales': sample_predictions
})

print(comparison_df)


Mean Squared Error (MSE): 2.9077569102710896
R-squared (R²): 0.9059011844150826
        TV  Radio  Newspaper  Actual Sales  Predicted Sales
95   163.3   31.6       52.9          16.9        17.034772
15   195.4   47.7       52.9          22.4        20.409740
30   292.9   28.3       43.2          21.4        23.723989
158   11.7   36.9       45.2           7.3         9.272785
128  220.3   49.0        3.2          24.7        21.682719


Polynomial Regression Model

In [5]:
from sklearn.preprocessing import PolynomialFeatures

# Transform the features into polynomial features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

Mean Squared Error (MSE): 1.44254263690159
R-squared (R²): 0.9533174341074723


Regularization (Ridge and Lasso)

In [6]:
from sklearn.linear_model import Ridge, Lasso

# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
ridge_pred = ridge_model.predict(X_test)
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_r2 = r2_score(y_test, ridge_pred)
print(f"Ridge - MSE: {ridge_mse}, R²: {ridge_r2}")

# Lasso Regression
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
lasso_pred = lasso_model.predict(X_test)
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)
print(f"Lasso - MSE: {lasso_mse}, R²: {lasso_r2}")


Ridge - MSE: 1.4425528162458723, R²: 0.9533171046905815
Lasso - MSE: 1.453996925440471, R²: 0.9529467583535709


Tree-based Methods

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Train a random forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model
rf_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)
print(f"Random Forest - MSE: {rf_mse}, R²: {rf_r2}")


Random Forest - MSE: 1.2804824000000004, R²: 0.9585619152716246


Cross-validation

In [8]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation on the linear regression model
cv_scores = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print(f"Cross-validated R² scores: {cv_scores}")
print(f"Mean Cross-validated R²: {cv_scores.mean()}")


Cross-validated R² scores: [0.87556263 0.93177791 0.92150403 0.84554586 0.90247132]
Mean Cross-validated R²: 0.8953723525274103
