In [127]:
# Importing required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
%matplotlib inline


In [128]:
# Load the companies dataset from the CSV file (path is relative to the notebook) into a DataFrame
companies = pd.read_csv(filepath_or_buffer='../data/1000_companies_data.csv')

In [129]:
# Extracting the independent and dependent variables
# Every column except the last becomes the feature matrix `x`; the last column
# becomes the target vector `y`. DataFrame.to_numpy() is the accessor pandas
# recommends over the legacy `.values` attribute and yields the same arrays here.
x = companies.iloc[:, :-1].to_numpy()
y = companies.iloc[:, -1].to_numpy()

In [130]:
# Preview the first five rows of the loaded frame
companies.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [131]:
# Encoding the categorical data
# One-hot encode the categorical column at position 3 of the feature matrix and
# pass every other column through unchanged. In the transformer's output the
# indicator columns come first, followed by the passthrough columns.
column_transformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [3])
    ],
    remainder='passthrough'
)
# Encode straight from the source frame (all columns but the last, i.e. the same
# features extracted into `x` earlier) instead of from `x` itself. That keeps
# this cell idempotent: re-running it never tries to encode an already-encoded
# matrix.
x = column_transformer.fit_transform(companies.iloc[:, :-1].to_numpy())
print(x[:5])
print(y[:5])


[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]]
[192261.83 191792.06 191050.39 182901.99 166187.94]


In [132]:
# Avoiding the dummy variable trap
# Drop the first column of x, which is the first of the one-hot indicator
# columns. The indicators of one category set always sum to 1 per row, so
# keeping all of them would be perfectly collinear with the model's intercept.
# NOTE: x is reassigned from itself, so every re-run of this cell removes one
# more column; rebuild x from the data before running this cell again.
x = x[:, 1:]

In [133]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [134]:
# Fitting Multiple Linear Regression to the Training set
# LinearRegression is imported with the rest of the dependencies in the
# notebook's import cell, so this cell only builds and fits the model.
regressor = LinearRegression()
regressor.fit(x_train, y_train)


In [136]:
# Run the fitted model on the held-out features and display the predictions
y_pred = regressor.predict(X=x_test)
y_pred

array([ 89790.61532916,  88427.07187362,  94894.67836972, 175680.86725609,
        83411.7304209 , 110571.90200074, 132145.22936439,  91473.37719687,
       164597.05380605,  53222.82667403,  66950.19050991, 150566.43987004,
       126915.20858596,  59337.85971052, 177513.9105306 ,  75316.28143052,
       118248.14406603, 164574.40699901, 170937.28981069, 182069.11645082,
       118845.0325269 ,  85669.95112229, 180992.59396142,  84145.08220146,
       105005.83769214, 101233.56772747,  53831.07669094,  56881.41475226,
        68896.39346906, 210040.00765881, 120778.72270894, 111724.87157655,
       101487.90541518, 137959.02649623,  63969.95996745, 108857.91214127,
       186014.72531986, 171442.64130746, 174644.26529203, 117671.49128195,
        96731.37857434, 165452.25779408, 107724.34331255,  50194.54176915,
       116513.89532179,  58632.48986822, 158416.46827608,  78541.48521611,
       159727.66671742, 131137.87699643, 184880.70924514, 174609.08266878,
        93745.66352059,  

In [137]:
# Print the coefficients of the fitted model, one per column of the training
# matrix (presumably the two remaining indicator columns first, then the three
# numeric columns, following the column order of x; confirm against the
# encoder output).
print(regressor.coef_)

[-8.80536598e+02 -6.98169073e+02  5.25845857e-01  8.44390881e-01
  1.07574255e-01]


In [138]:
# Print the intercept (constant term) of the fitted linear model
print(regressor.intercept_)

-51035.229724003526


In [139]:
# Calculating the R squared value
# Coefficient of determination of the predictions against the held-out targets.
# r2_score is imported with the rest of the dependencies in the notebook's
# import cell.
r2_score(y_true=y_test, y_pred=y_pred)

0.9112695892268797

In [142]:
# Calculating the Adjusted R squared value
def adjusted_r_squared(x, y, y_pred):
    """Return the adjusted R-squared of ``y_pred`` measured against ``y``.

    Adjusted R-squared corrects plain R-squared for the number of predictors:
    ``1 - (1 - R2) * (n - 1) / (n - p - 1)``, where ``n`` is the number of
    samples and ``p`` the number of predictors.

    Parameters
    ----------
    x : array-like with a ``shape`` attribute
        Feature matrix the predictions were made from. Only ``x.shape[1]`` is
        read, as the predictor count ``p``.
    y : sequence
        True target values; ``len(y)`` is the sample count ``n``.
    y_pred : sequence
        Predicted target values, scored against ``y`` with ``r2_score``.

    Returns
    -------
    float
        The adjusted R-squared value.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``. The statistic is undefined without positive
        residual degrees of freedom; previously this case surfaced as a
        ZeroDivisionError or as a meaningless number.
    """
    r2 = r2_score(y, y_pred)
    n = len(y)
    p = x.shape[1]
    residual_dof = n - p - 1
    if residual_dof <= 0:
        raise ValueError(
            "adjusted R-squared needs more samples than predictors + 1 "
            f"(got n={n}, p={p})"
        )
    return 1 - (1 - r2) * (n - 1) / residual_dof


print(adjusted_r_squared(x_test, y_test, y_pred))


0.9089827229698405
