# Regresión logística para predecir el OS que usa un usuario que visita un sitio web

In [1]:
# Importar librerias
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sb

In [3]:
# Leer el archivo CSV y guardarlo en un dataframe de pandas
data = pd.read_csv('usuarios_win_mac_lin.csv')

In [4]:
data.head()

Unnamed: 0,duracion,paginas,acciones,valor,clase
0,7.0,2,4,8,2
1,21.0,2,6,6,2
2,57.0,2,4,4,2
3,101.0,3,6,12,2
4,109.0,2,6,12,2


In [5]:
data.describe()

Unnamed: 0,duracion,paginas,acciones,valor,clase
count,170.0,170.0,170.0,170.0,170.0
mean,111.075729,2.041176,8.723529,32.676471,0.752941
std,202.4532,1.500911,9.136054,44.751993,0.841327
min,1.0,1.0,1.0,1.0,0.0
25%,11.0,1.0,3.0,8.0,0.0
50%,13.0,2.0,6.0,20.0,0.0
75%,108.0,2.0,10.0,36.0,2.0
max,898.0,9.0,63.0,378.0,2.0


In [7]:
# Ver cuantos renglones tengo de cada sistema operativo que esos valores están guardados en la columna "clase"
data.groupby('clase').size()

clase
0    86
1    40
2    44
dtype: int64

In [8]:
# Separando en matriz de entrenamiento (X) y vector de valores correctos o labels (y)
X = np.array(data.drop(['clase'], axis=1))

In [9]:
y = np.array(data['clase'])

In [10]:
X.shape

(170, 4)

## Creando el modelo predictivo

In [11]:
model = linear_model.LogisticRegression()

In [12]:
model.fit(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Predicciones

In [13]:
predictions = model.predict(X)

In [15]:
predictions[:5]

array([2, 2, 2, 2, 2])

In [16]:
y[:5]

array([2, 2, 2, 2, 2])

In [17]:
model.score(X,y)

0.7823529411764706

## Ahora lo haremos de nuevo de la manera correcta

In [19]:
# Separando el dataset en conjuntos de entrenamiento y conjuntos de validación
validation_size = 0.20
seed = 7
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(X, y, test_size=validation_size, random_state=seed)

In [23]:
# Entrenar modelo
name = 'Logistic Regression'
kfold = model_selection.KFold(n_splits=10)
cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
msg = name, ": ", cv_results.mean(), "(", cv_results.std()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [24]:
print(msg)

('Logistic Regression', ': ', 0.7285714285714284, '(', 0.09418550477897196)


## Predicciones con el conjunto de validaciòn

In [25]:
predictions = model.predict(X_validation)

## Evaluación del modelo

In [26]:
accuracy_score(y_validation, predictions)

0.8529411764705882

In [27]:
confusion_matrix(y_validation, predictions)

array([[16,  0,  2],
       [ 3,  3,  0],
       [ 0,  0, 10]])

In [29]:
print(classification_report(y_validation, predictions))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86        18
           1       1.00      0.50      0.67         6
           2       0.83      1.00      0.91        10

    accuracy                           0.85        34
   macro avg       0.89      0.80      0.81        34
weighted avg       0.87      0.85      0.84        34



## Predicciòn de un escenario simulado de alguien que entrò a mi sitio web

In [31]:
X_new = pd.DataFrame({'duracion':[10],'paginas':[3],'acciones':[5],'valor':[9]})

In [32]:
model.predict(X_new)



array([2])