In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('50_startups.csv')
df.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df.shape

(50, 5)

In [4]:
df.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [5]:
df.ndim

2

In [6]:
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


## Data Cleaning

In [7]:
df['State'].value_counts()

California    17
New York      17
Florida       16
Name: State, dtype: int64

In [8]:
df.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [9]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

## EDA on Dataset

In [10]:
#Histgram on Profit
sns.distplot(dataset['Profit'],bins=5,kde=True)

NameError: name 'dataset' is not defined

In [None]:
#Correlation  chart on different variables for comparision 
# Profit Vs R & Spend is very linear and almost same for Marketing spend
# Profit spend vs Administration distribution is very scattered 
sns.pairplot(dataset)

In [None]:
# profit split in State level - Looks Florida has the maximum Profit
sns.barplot(x='State',y='Profit',data=dataset, palette="Blues_d")
#sns.lineplot(x='State',y='Profit',data=dataset)

In [None]:
#gives positive & negative relation between categories
sns.heatmap(dataset.corr(), annot=True)

In [None]:
# spread of profit against state 
g=sns.FacetGrid(dataset, col='State')
g=g.map(sns.kdeplot,'Profit')

In [None]:
#Separating the Independent and the dependent variables
X = df.iloc[ : , :-1].values
y = df.iloc[ : , 4].values

In [None]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer

labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])

# Country column
columnTransformer = ColumnTransformer([('State', OneHotEncoder(), [3])],     remainder='passthrough')
X=columnTransformer.fit_transform(X)

# Here index column 3 has categorical variable

In [None]:
# Avoiding the dummy variable trap
X = X[:, 1:]
X

In [None]:
X

## Data Partitioning

In [None]:
#Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0 )

In [None]:
X_train,y_train

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
#Fitting Multiple Linear Regression to the training set
from sklearn.linear_model import LinearRegression
linearregression = LinearRegression()
linearregression.fit(X_train_scaled,y_train)

#print('Coefficients: \n', lr.coef_)
linearregression.score(X_train_scaled, y_train)

#predicting the test set results
y_pred = linearregression.predict(X_test)

print(linearregression.score(X_train_scaled, y_train))
print(linearregression.score(X_test_scaled, y_test))

In [None]:
import statsmodels.api as sm
# (bydefault it's not take constant(thetas 0 ,we have to put theta_0 * X0 = 1
# that's why we are creating col. of 1's and trying to put in the starting of X)
X_one = np.append(arr  = np.ones((40,1)).astype(int),values = X_train_scaled, axis = 1)# we are adding 1 extra col. in the starting  of X
print(X_one)

#np.append(values = X, np.ones((50,1)), axis = 1) # it will add col. at the last of X dataset

In [None]:
X_opt =X_one[:, [0,1,2,3,4,5]].astype(float)
regressor_OLS = sm.OLS(y_train,X_train_scaled)
regressor_OLS=regressor_OLS.fit()  
regressor_OLS.summary()

In [None]:
regressor_OLS.predict(X_test_scaled)

In [None]:
plt.plot(df.iloc[:,0], df.iloc[:, 4], color = 'green')
plt.xlabel('R&D Spends')
plt.ylabel('Profits')
plt.title('Relation b/w the R&D spend and Profits')
plt.grid()

In [None]:
plt.scatter(X_train, y_train, color = 'red')
plt.plot(X_train, regressor_OLS.predict(X_train), color = 'blue')
plt.title('R&D spend vs Profit (Training set)')
plt.xlabel('R&D spend')
plt.ylabel('Profit')
plt.grid(color='gold', linestyle='-.', linewidth=0.7)
plt.show()