# Multilinear Regression

### Importing the libraries

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

### Importing the dataset

In [2]:
# Load the startup data; the relative path is resolved against the current
# working directory, so the CSV must be present there.
dataset = pd.read_csv('50_Startups.csv')

FileNotFoundError: [Errno 2] File 50_Startups.csv does not exist: '50_Startups.csv'

### Let's look at the dataset

In [None]:
# Preview the first 8 rows.
dataset.head(8)

In [None]:
# Count how many rows fall under each distinct State value.
dataset.State.value_counts()

### Encoding Categorical Data

The `State` column contains categorical values. These need to be converted into dummy variables.

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Keep a handle on the raw categorical column; later cells reuse `state`
# to derive the names of the dummy-variable columns.
state = dataset['State']

# Map every distinct state label to an integer code and store the codes
# next to the original columns.
labelencoder = LabelEncoder()
dataset['State_Encoded'] = labelencoder.fit_transform(state.values)

In [None]:
# Confirm that the State_Encoded column has been added.
dataset.head()

In [None]:
# One-hot encode the integer state codes produced by the label encoder.
onehotencoder = OneHotEncoder(categories='auto')
# The encoder takes a 2-D input, so the 1-D code column is reshaped into a
# single-column matrix; the encoded result is converted to a dense array.
ohe = onehotencoder.fit_transform(
    dataset['State_Encoded'].values.reshape(-1, 1)
).toarray()

In [None]:
# Inspect the dense one-hot matrix.
ohe

In [None]:
# Sanity check: the second state name in sorted order, i.e. the label that the
# next cell assigns to one-hot column 1.
list(state.sort_values().unique())[1]

In [None]:
# Adding the dummy variables to the dataset
# The sorted unique state names become the new column names; this assumes the
# one-hot columns are in the same sorted order (LabelEncoder assigns its
# integer codes in sorted label order).
# NOTE: `new_columns`, `index` and `column` are deleted by name in a later
# cell, so they must keep these exact names.
new_columns = list(state.sort_values().unique())
for index, column in enumerate(new_columns):
    dataset[column] = ohe[:,index]

In [None]:
# Check that the dummy columns have been appended.
dataset.head()

In [None]:
# Re-arranging the required columns
# Keep positions 0-2, then positions 6-8 (the dummy columns appended by the
# loop above), and move position 4 to the end so that the target is the last
# column; the following cells slice X and y on that basis. Positions 3 and 5
# are dropped.
# NOTE(review): presumably 3 is the raw `State` column, 5 is `State_Encoded`
# and 4 is the target column — confirm against the CSV layout.
dataset = dataset.iloc[:, [0,1,2,6,7,8,4]]  

# Removing the intermediate variables (Optional)
# NOTE: this makes the cell non-re-runnable; the names are gone after one run.
del ohe, state, column, index, new_columns

### After pre-processing, the dataset looks like this:

In [None]:
# Preview the first 7 rows of the re-arranged frame.
dataset.head(7)

### Splitting the Independent and Dependent Variables

In [None]:
# The dependent variable is the last column of the re-arranged frame ...
y = dataset.iloc[:, -1].values
# ... and every column before it is an independent variable.
X = dataset.iloc[:, :-1].values

### Avoiding the Dummy Variable Trap (Optional, the library already does this)

In [None]:
# Drop the last feature column (one of the dummy variables) so the remaining
# dummies are not perfectly collinear with an intercept.
# NOTE: re-running this cell removes one more column each time.
X = X[:, :-1]

### Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

### Fitting Multiple Linear Regression to the Training set
The library is the same one that was used for Simple Linear Regression.

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression with default settings, fitted on the training split only.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
# Predict the target for the held-out test rows.
y_pred = regressor.predict(X_test)

### R-Squared value to evaluate Model performance

In [None]:
# score() is evaluated on the test split, so this is the out-of-sample R-squared.
print("R-Sq Value = {}".format(regressor.score(X_test, y_test)))

In [None]:
# Fitted intercept of the linear model.
regressor.intercept_

In [None]:
# Fitted coefficients, one per column of X_train and in the same order.
regressor.coef_

##  Backward Elimination of unnecessary columns

### Adding an intercept at the beginning

In [None]:
# Prepend a column of ones so the statsmodels OLS fits below include an
# intercept term (it ends up at column index 0 of X).
# The row count is read from X itself instead of being hard-coded, so the cell
# keeps working if the dataset has a different number of rows.
# NOTE: re-running this cell prepends another column of ones each time.
X = np.append(arr=np.ones((X.shape[0], 1), dtype=int), values=X, axis=1)

In [None]:
# Inspect the first row; its first entry should now be the intercept term 1.
X[0]

### Building the optimal model using Backward Elimination

In [None]:
import statsmodels.regression.linear_model as sm

# Start from the full model: the intercept (column 0) plus X columns 1-5.
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
# The summary lists a p-value per column; the largest one is eliminated next.
print(regressor_OLS.summary())

#### As we can see in the above summary, the biggest p-value is for column with index number `4`. We will remove this column and run the model again

In [None]:
# Refit without X column 4; the intercept and X columns 1, 2, 3 and 5 remain.
X_opt = X[:, [0, 1, 2, 3, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

#### As we can see in the above summary, the biggest p-value is for the column at position `4` of `X_opt` (which is column `5` of `X`). We will remove this column and run the model again

In [None]:
# Refit without X column 5 (it sat at position 4 of the previous X_opt);
# the intercept and X columns 1, 2 and 3 remain.
X_opt = X[:, [0, 1, 2, 3]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

#### As we can see in the above summary, the biggest p-value is for column with index number `2`. We will remove this column and run the model again

In [None]:
# Refit without X column 2; the intercept and X columns 1 and 3 remain.
X_opt = X[:, [0, 1, 3]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

#### As we can see in the above summary, the biggest p-value is for the column at position `2` of `X_opt` (which is column `3` of `X`). This is still above the 5% significance level. Hence we need to remove this column too.

In [None]:
# Refit without X column 3, leaving only the intercept and X column 1.
X_opt = X[:, [0, 1]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

### All the columns in X_opt now have p-values below 5%. Hence, apart from the intercept, we will consider only the one remaining column to be actually helpful in building the model.

## Hence, only the R&D Spends column is actually of use.