In [3]:
# Multiple Linear Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [4]:
# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values


In [11]:
dataset['State'].value_counts()

California    17
New York      17
Florida       16
Name: State, dtype: int64

In [12]:
# Print the categorical columns
cols = dataset.columns
num_cols = dataset._get_numeric_data().columns
categorical_cols = list(set(cols) - set(num_cols))


In [16]:
for cols in categorical_cols:
    print(dataset[cols].value_counts())

California    17
New York      17
Florida       16
Name: State, dtype: int64


In [17]:
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [18]:
## Convert catergorical data into dummy values
dataset = pd.get_dummies(dataset, columns = categorical_cols, drop_first=True)
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit',
       'State_Florida', 'State_New York'],
      dtype='object')

In [20]:
## Split X and Y
# Dependent variable 'traffic_volumne' in dataset - y
y = dataset.Profit.values

# Other Independent variables in dataset - X
dataset.drop(['Profit'], inplace=True, axis=1)
X = dataset.values

In [19]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [22]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

In [23]:
## Evaluating the model performance
from sklearn import metrics

In [24]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 7437.241858373783
MSE: 86972050.78508586
RMSE: 9325.880697558052


## using OLS method

In [27]:
from statsmodels import api as sm

In [28]:
model = sm.OLS(y_train, X_train).fit()

In [31]:
y_pred1 = model.predict(X_test)

In [32]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred1))
print('MSE:', metrics.mean_squared_error(y_test, y_pred1))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred1)))

MAE: 13100.541134057956
MSE: 215541387.79562777
RMSE: 14681.327862139302
