## Regresión lineal paso a paso

####  Statsmodels

In [None]:
import pandas as pd
#data

In [1]:
import statsmodels.formula.api as smf

# Fit an ordinary-least-squares model from a formula (y explained by x).
lm = smf.ols("y~x", data=data).fit()

# Attributes of the fitted model
lm.params        # estimated parameters (Intercept and x)
lm.pvalues       # p-value of each parameter
lm.rsquared      # R^2
lm.rsquared_adj  # adjusted R^2
lm.summary()     # summary of all the coefficients

In [None]:
# The predictor vector has to be handed over as a DataFrame, hence the
# double brackets (a one-column DataFrame instead of a Series).
y_pred = lm.predict(data[["column"]])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Scatter plot of the observed values.
# NOTE(review): the scatter uses "x_value" while y_pred was computed from
# "column" -- confirm both placeholders stand for the same predictor.
data.plot(kind="scatter", x="x_value", y="y_act")
# Overlay the fitted line. pyplot.plot takes the x and y data positionally;
# kind=/x=/y= are DataFrame.plot keywords and are not valid here.
plt.plot(data["column"], y_pred, c="red", linewidth=2)

In [None]:
import numpy as np  # np is used below but had not been imported up to this cell

# Prediction computed by hand from an intercept and a slope.
# NOTE(review): both coefficients are hard-coded; presumably copied from
# lm.params of an earlier fit -- confirm they match the current model.
data["y_pred"] = 7.032594 + 0.047537*data["column"]
# Despite its name, the "RSE" column stores the squared residual of each row.
data["RSE"] = (data["y_act"]-data["y_pred"])**2
SSD = sum(data["RSE"])  # sum of squared differences
# Residual standard error: the denominator is the row count minus 2.
RSE = np.sqrt(SSD/(len(data)-2))
sales_m = np.mean(data["y_act"])
error = RSE/sales_m  # share of the mean response the model cannot explain
plt.hist(data["y_act"]-data["y_pred"])  # histogram of the errors; should look Gaussian

### Regresión lineal múltiple

####  Statsmodels

In [None]:
# Multiple regression: same process as the single-predictor case.
lm_mult = smf.ols("y~x1+x2+x3", data=data).fit()
# RSE = np.sqrt(SSD/(len(data)-n-1)), where n is the number of predictors
# Multicollinearity: can be addressed with PCA (principal component analysis)

### Validación del modelo

In [None]:
import numpy as np  # needed for np.sqrt / np.mean below
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf

# Split the data, keeping 20% of the rows for testing. The two parts are named
# training/testing because that is what the rest of the cell refers to
# (they used to be unpacked into train/test, leaving those names undefined).
training, testing = train_test_split(data, test_size = 0.2)

# Fit on the training part only.
lm = smf.ols(formula="y~x1+x2", data=training).fit()

# Predict on the held-out part.
y_pred = lm.predict(testing)

# Error figures computed on the testing set
SSD = sum((testing["y"]-y_pred)**2)  # sum of squared differences
# Denominator: rows - 2 predictors (x1, x2) - 1
RSE = np.sqrt(SSD/(len(testing)-2-1))
sales_mean = np.mean(testing["y"])
error = RSE/sales_mean

## Linear regression - Scikit-learn

In [None]:
#Automatización completa de la regresión

from sklearn.feature_selection import RFE 
from sklearn.svm import SVR #support vector machine
import pandas as pd
import numpy as np

data  # bare reference; presumably a reminder that the DataFrame must already be loaded
feature_cols = ["x1", "x2", "x3"]

# Split the dataset into the predictor columns and the target column
X = data.loc[:, feature_cols]
Y = data.loc[:, "y_act"]

In [None]:
# Build the model: recursive feature elimination around a linear-kernel SVR
estimator = SVR(kernel="linear")
# Keep 2 features, with step=1; fit() returns the fitted selector itself
selector = RFE(estimator, n_features_to_select=2, step=1).fit(X, Y)

In [None]:
selector.support_ # boolean mask: True marks the selected predictors
selector.ranking_ # rank of each predictor; per the scikit-learn docs the selected ones get rank 1

### Sklearn.linear_model

In [None]:
# This import was commented out, which left LinearRegression undefined below.
from sklearn.linear_model import LinearRegression

# Linear model fitted on predictors x1 and x2
X_pred = X[["x1", "x2"]]
lm = LinearRegression()
lm.fit(X_pred, Y)

# Fitted parameters and score
lm.intercept_
lm.coef_
lm.score(X_pred, Y)  # score evaluated on the same data used for the fit

#y_pred = lm.predict(testing)

## LM con variables categóricas

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# df is a second name bound to the same object as data (plain assignment, no copy)
df=data

In [None]:
# Turn the categorical variables into dummy columns
dummy_gender = pd.get_dummies(
    df["Gender"],
    prefix="Gender",
)
dummy_city_tier = pd.get_dummies(
    df["City Tier"],
    prefix="City",
)

### Eliminar variables dummies redundantes

In [None]:
# .iloc[:, 1:] keeps every column except the 0th one, removing the
# redundant dummy of each variable
dummy_gender = (
    pd.get_dummies(df["Gender"], prefix="Gender")
    .iloc[:, 1:]
)
dummy_city_tier = (
    pd.get_dummies(df["City Tier"], prefix="City")
    .iloc[:, 1:]
)

In [None]:
# Append the dummy columns to the original DataFrame, one join per variable
column_names = df.columns.tolist()
df_new = df[column_names].join(dummy_gender)
# Refresh the column list so it includes the gender dummies before the second join
column_names = df_new.columns.tolist()
df_new = df_new[column_names].join(dummy_city_tier)

In [None]:
# Split the predictors from the target.
# Only dummy columns that survive the .iloc[:, 1:] step are listed. The first
# dummy of each variable was dropped before the join, so (assuming the levels
# are Female/Male and Tier 1/2/3, as the previous list implied) "Gender_Female"
# and "City_Tier 1" are not in df_new and selecting them raises a KeyError.
feature_cols = ["Monthly Income", "Transaction Time",
                "Gender_Male",
                "City_Tier 2", "City_Tier 3",
                "Record"]

X = df_new[feature_cols]
Y = df_new["Total Spend"]

In [None]:
# fit() returns the estimator, so creation and fitting can be chained
lm = LinearRegression().fit(X, Y)

# Coefficient list: zip pairs each predictor name with its coefficient
list(zip(feature_cols, lm.coef_))
lm.score(X, Y)

#df_new["prediction"] = lm.predict(pd.DataFrame(df_new[feature_cols]))

### Conversión a No lineal

In [None]:
# Polynomial model
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# Fill the gaps of each column with that column's mean. X is turned into a
# NumPy array (np.asarray leaves an existing array untouched); Y stays as
# whatever fillna returns.
X = np.asarray(data_auto["x1"].fillna(data_auto["x1"].mean()).to_numpy())
Y = data_auto["y_act"].fillna(data_auto["y_act"].mean())

## Quadratic model by hand: use X = X**2
## X_data = X[:, np.newaxis] adds a second axis to the vector, which the lm needs

# Build the degree-n polynomial terms of the predictor.
poly = PolynomialFeatures(degree=2)
# X is a vector; indexing with None (the value of np.newaxis) adds the extra axis
X_data = poly.fit_transform(X[:, None])

# Fit the linear model on the generated polynomial terms
lm = linear_model.LinearRegression().fit(X_data, Y)

## Función: validación de regresión

In [None]:
# Devuelve los parámetros del ajuste
def regresion_validation(X_data, Y, Y_pred, n_predictors=0):
    """Print and return the goodness-of-fit figures of a regression.

    Args:
        X_data: Data the model was fitted on; only its length (number of
            observations) is used.
        Y: Observed target values.
        Y_pred: Predicted target values, element-wise aligned with ``Y``.
        n_predictors: Additional amount subtracted from the RSE denominator.
            The default ``0`` keeps the ``len(X_data) - 1`` denominator; pass
            the number of predictors ``p`` to use ``len(X_data) - p - 1``.

    Returns:
        The tuple ``(SSD, RSE, y_mean, error)``, where ``error`` is
        ``RSE / y_mean`` as a fraction (the printed figure is multiplied
        by 100).
    """
    SSD = np.sum((Y - Y_pred)**2)  # sum of squared differences
    RSE = np.sqrt(SSD/(len(X_data)-n_predictors-1))
    y_mean = np.mean(Y)
    error = RSE/y_mean
    print("SSD: "+str(SSD)+", RSE: " +str(RSE) + ", Y_mean: " +str(y_mean) +", error: " + str(error*100)+ "%")
    # Hand the figures back as well, so callers can use them and not only read them.
    return SSD, RSE, y_mean, error

In [None]:
# Fit one polynomial model per degree from 2 to 11 and report each fit
for d in range(2, 12):
    poly = PolynomialFeatures(degree=d)
    # None is the value of np.newaxis: adds the second axis to the vector
    X_data = poly.fit_transform(X[:, None])
    lm = linear_model.LinearRegression().fit(X_data, Y)
    print(f"Regresión de grado {d}")
    print("R2:" + str(lm.score(X_data, Y)))
    print(lm.intercept_)
    print(lm.coef_)
    regresion_validation(X_data, Y, lm.predict(X_data))