In [1]:
# Misc
import warnings

# Core
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Evitamos los molestos warnings
warnings.simplefilter("ignore")

In [3]:
# Location of the raw e-commerce expense dataset (fetched over HTTP)
DATA_URL = 'https://raw.githubusercontent.com/jorge-robledo11/Datasets/main/Datasets/ecom-expense/Ecom%20Expense.csv'

# Load the CSV into a DataFrame and preview the first rows
data = pd.read_csv(DATA_URL)
data.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [4]:
# Replace the column labels with clean names. The assignment is positional,
# so this list must stay in the same order as the columns of the loaded file.
header = [
    'Transaction ID',
    'Age',
    'Items',
    'Monthly Income',
    'Transaction Time',
    'Record',
    'Gender',
    'City Tier',
    'Total Spend',
]
data.columns = header

In [5]:
# Count missing values per column (isna is the canonical name for isnull)
data.isna().sum()

Transaction ID      0
Age                 0
Items               0
Monthly Income      0
Transaction Time    0
Record              0
Gender              0
City Tier           0
Total Spend         0
dtype: int64

In [6]:
# One-hot encode Gender, dropping the first level so a single indicator column
# remains ('Gender_Male'), then give that column the plain name 'Gender'
data = (
    pd.get_dummies(data, columns=["Gender"], drop_first=True)
    .rename(columns={'Gender_Male': 'Gender'})
)

In [7]:
# Cast Age and the Gender indicator to plain integers in one pass
data = data.astype({'Age': int, 'Gender': int})

In [8]:
# Segment ages into ordinal groups and label them
#
#   Age range (years, inclusive)    Label
#     0 ~ 11                        "1"
#    12 ~ 18                        "2"
#    19 ~ 35                        "3"
#    36 ~ 60                        "4"
#    61 ~ 100                       "5"
#
# pd.cut builds right-closed intervals (lo, hi] by default, so every edge
# below is the LAST age that belongs to its group. The first inner edge is
# therefore 11 (not 12): with 12, twelve-year-olds would land in group "1",
# contradicting the table above.
# include_lowest=True closes the first interval on the left so that age 0 is
# labelled instead of silently becoming NaN.
# Ages above 100 still fall outside every bin and become NaN.

bins = [0, 11, 18, 35, 60, 100]
names = ["1", "2", "3", "4", "5"]

data["Age"] = pd.cut(data["Age"], bins=bins, labels=names, include_lowest=True)

In [9]:
# Preview the first five rows after encoding Gender and binning Age
data.head(n=5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,City Tier,Total Spend,Gender
0,TXN001,4,10,7313,627.668127,5,Tier 1,4198.385084,0
1,TXN002,3,8,17747,126.904567,3,Tier 2,4134.976648,0
2,TXN003,4,11,22845,873.469701,2,Tier 2,5166.614455,1
3,TXN004,4,11,18552,380.219428,7,Tier 1,7784.447676,0
4,TXN005,4,2,14439,403.374223,2,Tier 2,3254.160485,0


In [10]:
# Count how many rows fall into each City Tier category

data['City Tier'].value_counts()

Tier 1    815
Tier 2    782
Tier 3    765
Name: City Tier, dtype: int64

In [11]:
# Map each tier label to its integer code ('Tier 1' -> 1, ...).
# Series.map yields NaN for values that are not keys of the dict, so re-running
# this cell on already-mapped data would blank the column.

tiers = {f'Tier {level}': level for level in (1, 2, 3)}
data['City Tier'] = data['City Tier'].map(tiers)

In [12]:
# Re-count the categories to confirm the mapping kept the same distribution
data['City Tier'].value_counts()

1    815
2    782
3    765
Name: City Tier, dtype: int64

In [13]:
# Build a separate DataFrame of indicator columns (City_1, City_2, City_3)
# NOTE(review): every level is kept here (no drop_first), unlike the Gender
# encoding above. The three indicators always sum to 1 per row, which is
# collinear with the regression intercept; consider drop_first=True.

city_tier_codes = data['City Tier']
dummy_city_tier = pd.get_dummies(city_tier_codes, prefix='City')
dummy_city_tier.shape

(2362, 3)

In [14]:
# Append the indicator columns to the original frame, side by side

data_new = pd.concat(objs=[data, dummy_city_tier], axis='columns')

In [15]:
# Remove the original 'City Tier' column now that its indicators are in place

data_new = data_new.drop(columns=['City Tier'])
data_new

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Total Spend,Gender,City_1,City_2,City_3
0,TXN001,4,10,7313,627.668127,5,4198.385084,0,1,0,0
1,TXN002,3,8,17747,126.904567,3,4134.976648,0,0,1,0
2,TXN003,4,11,22845,873.469701,2,5166.614455,1,0,1,0
3,TXN004,4,11,18552,380.219428,7,7784.447676,0,1,0,0
4,TXN005,4,2,14439,403.374223,2,3254.160485,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
2357,TXN2358,4,7,5705,460.157207,3,2909.619546,1,0,1,0
2358,TXN2359,3,11,11202,851.924751,8,7968.633136,1,0,1,0
2359,TXN2360,3,5,21335,435.145358,8,8816.406448,0,0,0,1
2360,TXN2361,4,12,19294,658.439838,7,7915.595856,0,1,0,0


In [16]:
# Downcast numeric columns to the smallest dtype that can hold their values

# Integer-valued columns
for int_col in ('Gender', 'Record', 'Items', 'Monthly Income'):
    data_new[int_col] = pd.to_numeric(data_new[int_col], downcast='integer')

# Float-valued columns (float32 keeps fewer significant digits than float64)
for float_col in ('Total Spend', 'Transaction Time'):
    data_new[float_col] = pd.to_numeric(data_new[float_col], downcast='float')

In [17]:
# Downcasting reduces the memory used by each column's dtype;
# inspect the resulting dtypes to confirm the change

data_new.dtypes

Transaction ID        object
Age                 category
Items                   int8
Monthly Income         int16
Transaction Time     float32
Record                  int8
Total Spend          float32
Gender                  int8
City_1                 uint8
City_2                 uint8
City_3                 uint8
dtype: object

In [18]:
# Display the frame after downcasting (float columns now show float32 rounding)
data_new

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Total Spend,Gender,City_1,City_2,City_3
0,TXN001,4,10,7313,627.668152,5,4198.385254,0,1,0,0
1,TXN002,3,8,17747,126.904564,3,4134.976562,0,0,1,0
2,TXN003,4,11,22845,873.469727,2,5166.614258,1,0,1,0
3,TXN004,4,11,18552,380.219421,7,7784.447754,0,1,0,0
4,TXN005,4,2,14439,403.374237,2,3254.160400,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
2357,TXN2358,4,7,5705,460.157196,3,2909.619629,1,0,1,0
2358,TXN2359,3,11,11202,851.924744,8,7968.633301,1,0,1,0
2359,TXN2360,3,5,21335,435.145355,8,8816.406250,0,0,0,1
2360,TXN2361,4,12,19294,658.439819,7,7915.595703,0,1,0,0


In [19]:
# Define the target and the feature matrix.
# The transaction identifier and the target itself are excluded from the features.

y = data_new['Total Spend']
X = data_new.drop(columns=['Transaction ID', 'Total Spend'])

In [20]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
)

In [21]:
# Show the dimensions of the feature matrix and of the target

display(X.shape, y.shape)

(2362, 9)

(2362,)

In [22]:
# List the feature names, in the order the model will receive them
X.columns

Index(['Age', 'Items', 'Monthly Income', 'Transaction Time', 'Record',
       'Gender', 'City_1', 'City_2', 'City_3'],
      dtype='object')

In [23]:
# Create the estimator, then train it on the training split
# (fit returns the estimator itself, so the rebinding keeps the same object)

linear_model = LinearRegression()
linear_model = linear_model.fit(X_train, y_train)

In [24]:
# Report the fitted intercept and the coefficient vector

intercept_text = f'Intercepto: {linear_model.intercept_}'
coef_text = f'Parámetros: {linear_model.coef_}'

display(intercept_text)
print(coef_text)

'Intercepto: -947.6857440628928'

Parámetros: [ 1.06571835e+02  3.91097121e+01  1.49872211e-01  1.98283868e-01
  7.69427964e+02  2.82004302e+02  8.18814866e+01  3.89346563e+01
 -1.20816143e+02]


In [25]:
# Pair each feature name with its fitted coefficient

{feature: weight for feature, weight in zip(X.columns, linear_model.coef_)}

{'Age': 106.57183466665157,
 'Items': 39.109712125038065,
 'Monthly Income': 0.14987221136720308,
 'Transaction Time': 0.19828386843195078,
 'Record': 769.4279636080081,
 'Gender': 282.0043017490621,
 'City_1': 81.8814866327437,
 'City_2': 38.93465627390436,
 'City_3': -120.81614290664788}

In [26]:
# Coefficient of determination (R²) on the held-out test split

R2 = linear_model.score(X=X_test, y=y_test)
display(R2)

0.9212324545732028

In [44]:
# Predict on the test split and wrap the result in a one-column DataFrame.
# The frame gets a fresh 0..n-1 index, not the row labels of X_test.

y_pred = pd.DataFrame({'Predicción': linear_model.predict(X_test)})
y_pred.head()

Unnamed: 0,Predicción
0,5401.317321
1,10031.972994
2,7061.526782
3,4730.308818
4,7281.684032


In [28]:
# Second feature set: same target, but Monthly Income and Transaction Time are
# also left out of the features

y2 = data_new['Total Spend']
X2 = data_new.drop(
    columns=['Transaction ID', 'Total Spend', 'Monthly Income', 'Transaction Time']
)

In [29]:
# Split the reduced feature set with the same test fraction and seed as before

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=25,
)

In [30]:
# Create the second estimator, then train it on the reduced training split
# (fit returns the estimator itself, so the rebinding keeps the same object)

linear_model2 = LinearRegression()
linear_model2 = linear_model2.fit(X_train2, y_train2)

In [31]:
# Report the second model's fitted intercept and coefficient vector

intercept_text2 = f'Intercepto: {linear_model2.intercept_}'
coef_text2 = f'Parámetros: {linear_model2.coef_}'

display(intercept_text2)
print(coef_text2)

'Intercepto: 1279.864961528403'

Parámetros: [162.67346714  37.31190794 776.59940158 334.70871459  81.68850998
   6.47638218 -88.16489217]


In [32]:
# Pair each feature name of the reduced set with its fitted coefficient

{feature: weight for feature, weight in zip(X2.columns, linear_model2.coef_)}

{'Age': 162.67346714037313,
 'Items': 37.31190793642588,
 'Record': 776.5994015787551,
 'Gender': 334.70871459019105,
 'City_1': 81.68850998476947,
 'City_2': 6.476382184027997,
 'City_3': -88.164892168797}

In [33]:
# Coefficient of determination (R²) of the second model on its test split

R2_2 = linear_model2.score(X=X_test2, y=y_test2)
display(R2_2)

0.741795808369287

In [45]:
# Predict with the second model and wrap the result in a one-column DataFrame.
# The frame gets a fresh 0..n-1 index, not the row labels of X_test2.

y_pred2 = pd.DataFrame({'Predicción 2': linear_model2.predict(X_test2)})
y_pred2.head()

Unnamed: 0,Predicción 2
0,5345.796696
1,9224.92509
2,8598.161632
3,5769.882789
4,5975.720372


In [61]:
# Row counts: full targets on the first line, test-set predictions on the second

print(*(len(series) for series in (y, y2)))
print(*(len(frame) for frame in (y_pred, y_pred2)))

2362 2362
473 473


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4ec84517-f553-446b-9032-1da3132bd62a' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>