# Librerías

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

# Seed handed to train_test_split so the train/test partition is reproducible.
random_state = 49

# Datasets

In [2]:
# Load the dataset and preview its first 5 rows (the default for head()).
datas = pd.read_csv("50_Startups.csv")

# Every column except the last is a feature; the last column is the target.
feature_frame = datas.iloc[:, :-1]
target_column = datas.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_column.to_numpy()

# Each preview is followed by an empty line to separate it from the next one.
blank_line = '\n\n'

print("Dataframe:")
print(datas.head(), end=blank_line)
print("5 primero valores de X:")
print(X[:5], end=blank_line)
print("5 primero valores de y:")
print(y[:5])

Dataframe:
   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94

5 primero valores de X:
[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']]

5 primero valores de y:
[192261.83 191792.06 191050.39 182901.99 166187.94]


R&D Spend, Administration, Marketing Spend y State son las variables independientes.
Profit es la variable dependiente.
State es una variable categórica, por lo que habrá que codificarla antes de entrenar el modelo.
Además, faltan algunos datos, por lo que habrá que tenerlo en cuenta.

In [3]:
## One-hot encode the categorical State column (column index 3)
# Columns not named in the transformer list are passed through unchanged;
# the encoded State columns are placed first in the output, followed by the
# passthrough columns.
transf = [('encoder', OneHotEncoder(), [3])]
ct = ColumnTransformer(transformers=transf, remainder='passthrough')
# Encode from the raw dataframe rather than from X itself: overwriting X with
# its own transform made this cell non-idempotent (a second run would one-hot
# encode column 3 of the already-encoded matrix, which is a numeric column).
X = ct.fit_transform(datas.iloc[:, :-1].values)

# Split dataset

In [4]:
# Split the dataset into a training part and a test part (20% held out).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

# Report the shape of each of the four resulting arrays.
for label, subset in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Dimensión de {label}:")
    print(subset.shape)

Dimensión de X_train:
(40, 6)
Dimensión de X_test:
(10, 6)
Dimensión de y_train:
(40,)
Dimensión de y_test:
(10,)


# Entrenamiento del modelo

In [5]:
# Fit a linear regression on the training split.
# fit() returns the fitted estimator, which is what `model` is bound to.
regressor = LinearRegression()
model = regressor.fit(X_train, y_train)

# Predicciones

In [6]:
# Show floats with two decimals in every NumPy array printed from here on.
np.set_printoptions(precision=2)
# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)

In [7]:
# Put predictions and true values side by side as two columns
# (left: predicted, right: actual) so that each row is one test sample.
comparison = np.column_stack((y_pred, y_test))
print(comparison)

[[101644.61 107404.34]
 [ 99418.22  97483.56]
 [ 81896.34  81005.76]
 [116162.83 126992.93]
 [151662.3  132602.65]
 [115787.69 118474.03]
 [ 45620.    42559.73]
 [132138.67 125370.37]
 [194553.65 192261.83]
 [116169.55 105008.31]]


## Making a single prediction (for example the profit of a startup with R&D Spend = 160000, Administration Spend = 130000, Marketing Spend = 300000 and State = 'California')

In [8]:
# Describe the new startup in the raw column layout the transformer was
# fitted on (R&D Spend, Administration, Marketing Spend, State) and let the
# fitted ColumnTransformer produce the one-hot columns. Writing the encoded
# vector by hand ([1, 0, 0, ...]) silently depends on the category order the
# encoder happened to learn.
new_startup = np.array([[160000, 130000, 300000, 'California']], dtype=object)
print(model.predict(ct.transform(new_startup)))

[183040.14]


## Getting the final linear regression equation with the values of the coefficients

In [9]:
# Print the fitted coefficients (one per column of the encoded feature
# matrix) and, on the next line, the intercept of the regression equation.
print(model.coef_, model.intercept_, sep='\n')

[-1.42e+03  7.17e+02  6.98e+02  8.03e-01 -1.95e-03  2.98e-02]
47300.11651571962
