# Logistic Regression - Beer Analysis

## Importing the Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the consumo-cerveja dataset (renamed to `beer_data.csv` for easier access). The dataset can be found at https://github.com/joshtrivedi/Beer-Consumption/blob/main/beer_data.csv
## Cleaning the data: inspecting the dataset and removing the NaN values

In [None]:
# Load the raw beer-consumption CSV (expected in the notebook's working directory)
dataset = pd.read_csv(filepath_or_buffer='beer_data.csv')

In [None]:
# Preview the first five rows to check the columns and the raw value formats
dataset.head(n=5)

Unnamed: 0,Data,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
0,2015-01-01,273,239,325,0,0.0,25.461
1,2015-01-02,2702,245,335,0,0.0,28.972
2,2015-01-03,2482,224,299,0,1.0,30.814
3,2015-01-04,2398,215,286,12,1.0,29.799
4,2015-01-05,2382,21,283,0,0.0,28.9


In [None]:
# Remove every row that contains at least one NaN value
dataset = dataset.dropna(axis=0, how='any')

In [None]:
# (rows, columns) of the frame at this point in the notebook
dataset.shape

(365, 7)

In [None]:
# Summary statistics. By default describe() only summarises numeric columns.
# NOTE(review): the recorded output lists just two columns, which suggests the
# temperature and precipitation columns were not parsed as numbers -- confirm
# with dataset.dtypes before using them numerically.
dataset.describe()

Unnamed: 0,Final de Semana,Consumo de cerveja (litros)
count,365.0,365.0
mean,0.284932,25.401367
std,0.452001,4.399143
min,0.0,14.343
25%,0.0,22.008
50%,0.0,24.867
75%,1.0,28.631
max,1.0,37.937


## Segregating the data into input features and target variables

In [None]:
# Model input: beer consumption in litres, one value per dated row.
# Selected by column label instead of by position (iloc[:, -1]) so the cell
# still picks the right column if the CSV column order ever changes.
input_features = dataset['Consumo de cerveja (litros)']
input_features

0      25.461
1      28.972
2      30.814
3      29.799
4      28.900
        ...  
360    32.307
361    26.095
362    22.309
363    20.467
364    22.446
Name: Consumo de cerveja (litros), Length: 365, dtype: float64

In [None]:
# Class label: 1.0 for weekend rows, 0.0 otherwise.
# Selected by column label instead of by position (iloc[:, -2]) so the cell
# still picks the right column if the CSV column order ever changes.
target_variable = dataset['Final de Semana']
target_variable

0      0.0
1      0.0
2      1.0
3      1.0
4      0.0
      ... 
360    1.0
361    0.0
362    0.0
363    0.0
364    0.0
Name: Final de Semana, Length: 365, dtype: float64

As the dataset has no dedicated classification column, we use the **Final de Semana** column as the class label: it indicates whether a day falls on a weekend (1.0) or not (0.0).

In [None]:
# Split the rows into two frames according to the weekend flag
isWeekend = dataset[target_variable == 1.0]
notWeekend = dataset[target_variable == 0.0]

In [None]:
# Scatter of column 1 against column 0 (mean temperature against date in the
# recorded preview), one colour per class.
# Both columns are parsed before plotting: raw strings would be treated by
# matplotlib as categories, which scrambles the axes.
# NOTE(review): the temperature values appear to use a comma as the decimal
# separator -- confirm against the CSV. The conversion below is a no-op if the
# column is already numeric.
fig, ax = plt.subplots()
for subset, label in ((isWeekend, 'is weekend'), (notWeekend, 'is not weekend')):
    dates = pd.to_datetime(subset.iloc[:, 0])
    values = pd.to_numeric(
        subset.iloc[:, 1].astype(str).str.replace(',', '.', regex=False),
        errors='coerce',
    )
    ax.scatter(dates, values, label=label)
# Label the axes with the actual column names so the figure stands alone
ax.set_xlabel(isWeekend.columns[0])
ax.set_ylabel(isWeekend.columns[1])
ax.set_title('Weekend vs. non-weekend days')
ax.legend()
plt.show()

## Splitting into testing and training dataset using sklearn library

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Convert the feature and label containers to NumPy arrays before splitting
input_features, target_variable = (
    np.array(column) for column in (input_features, target_variable)
)

In [None]:
# Hold out 5% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    input_features,
    target_variable,
    test_size=0.05,
    random_state=23,
)

In [None]:
# Turn the 1-D feature vectors into single-column 2-D matrices
# (one row per sample, one column for the single feature)
X_train, X_test = (features.reshape(-1, 1) for features in (X_train, X_test))
# Show the shape of every split, one per line
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep='\n')

(346, 1)
(19, 1)
(346,)
(19,)


## Applying the Logistic Regression model using the sklearn library

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier with the library's default hyper-parameters
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split; the fitted estimator is displayed
model.fit(X=X_train, y=Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Predicted class labels for the held-out test rows
Prediction = model.predict(X=X_test)

Confusion Matrix

In [None]:
from sklearn import metrics

In [None]:
# Compare the true test labels with the model's predictions
confusion = metrics.confusion_matrix(y_true=Y_test, y_pred=Prediction)

In [None]:
# Display the matrix: in scikit-learn's layout rows are the true classes and
# columns are the predicted classes (class 0.0 first, then class 1.0).
confusion

array([[13,  0],
       [ 3,  3]])

In [None]:
# Report the headline classification metrics on the test split
for label, score_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, score_fn(Y_test, Prediction))

Accuracy: 0.8421052631578947
Precision: 1.0
Recall: 0.5
