In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv(f"C:\Programowanie\codecademy_projects\data_sets_codecademy\WineQT.csv")
print(df.columns)
y = df['quality']
features = df.drop(columns = ['quality'])


## 1. Data transformation
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(features)

X = scaler.transform(features)

print(X)


## 2. Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)
## 3. Fit a logistic regression classifier without regularization
from sklearn.linear_model import LogisticRegression

clf_no_reg = LogisticRegression(penalty='none')

clf_no_reg.fit(X, y)
## 4. Plot the coefficients
predictors = features.columns
coefficients = clf_no_reg.coef_.ravel()
coef = pd.Series(coefficients,predictors).sort_values()
coef.plot(kind='bar', title = 'Coefficients (no regularization)')
plt.tight_layout()
plt.show()
plt.clf()


## 5. Training and test performance
from sklearn.metrics import f1_score

y_pred_train = clf_no_reg.predict(X_train)
y_pred_test = clf_no_reg.predict(X_test)

print('Training score', f1_score(y_train ,y_pred_train))
print('Test score', f1_score(y_test, y_pred_test))




## 6. Default Implementation (L2-regularized!)

clf_default = LogisticRegression()
clf_default.fit(X_train, y_train)


## 7. Ridge Scores
y_pred_train_d = clf_default.predict(X_train)
y_pred_test_d = clf_default.predict(X_test)

print('Training score_d', f1_score(y_train ,y_pred_train_d))
print('Test score_d', f1_score(y_test, y_pred_test_d))

## 8. Coarse-grained hyperparameter tuning
training_array = []
test_array = []
C_array = [0.0001, 0.001, 0.01, 0.1, 1]

for i in C_array:
  model = LogisticRegression(C = i)
  model.fit(X_train, y_train)
  y_pred_train_model = model.predict(X_train)
  y_pred_test_model = model.predict(X_test)
  f1_train = f1_score(y_train, y_pred_train_model)
  f1_test = f1_score(y_test, y_pred_test_model)
  training_array.append(f1_train)
  test_array.append(f1_test)

print(test_array)


## 9. Plot training and test scores as a function of C
plt.plot(C_array,training_array)
plt.plot(C_array,test_array)
plt.xscale('log')
plt.show()
plt.clf()

## 10. Making a parameter grid for GridSearchCV
C_array_log = np.logspace(-4, -2, 100)
C = {'C': C_array_log}


## 11. Implementing GridSearchCV with l2 penalty
from sklearn.model_selection import GridSearchCV

model_grid = LogisticRegression()

grid_search = GridSearchCV(
  model_grid,
  param_grid = C,
  scoring = 'f1',
  cv = 5)

grid_search.fit(X_train, y_train)


## 12. Optimal C value and the score corresponding to it
print(grid_search.best_params_)
print(grid_search.best_score_)

## 13. Validating the "best classifier"

clf_best = LogisticRegression(C = grid_search.best_params_['C'])
clf_best.fit(X_train,y_train)
y_pred_best = clf_best.predict(X_test)
print(f1_score(y_test,y_pred_best))

## 14. Implement L1 hyperparameter tuning with LogisticRegressionCV
from sklearn.linear_model import LogisticRegressionCV

clf_l1 = LogisticRegressionCV(Cs=C_array_log, cv=5, penalty='l1', solver='liblinear', scoring='f1')

clf_l1.fit(X, y)
## 15. Optimal C value and corresponding coefficients
print(clf_l1.coef_)
print(clf_l1.C_)


## 16. Plotting the tuned L1 coefficients
coefficients = clf_l1.coef_.ravel()
coef = pd.Series(coefficients,predictors).sort_values()

plt.figure(figsize = (12,8))
coef.plot(kind='bar', title = 'Coefficients for tuned L1')
plt.tight_layout()
plt.show()
plt.clf()


Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'Id'],
      dtype='object')
[[-0.52157961  0.93933222 -1.36502663 ... -0.57365783 -0.96338181
  -1.73561799]
 [-0.29259344  1.94181282 -1.36502663 ...  0.1308811  -0.59360107
  -1.73346186]
 [-0.29259344  1.27349242 -1.16156762 ... -0.04525363 -0.59360107
  -1.73130573]
 ...
 [-1.20853813  0.38239855 -0.9581086  ... -0.45623467  0.05351522
   1.70125196]
 [-1.38027776  0.10393172 -0.8563791  ...  0.60057372  0.70063152
   1.70340809]
 [-1.38027776  0.6330187  -0.75464959 ...  0.30701583 -0.22382033
   1.70772035]]


InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'elasticnet', 'l2'} or None. Got 'none' instead.

In [None]:
!pip install ucimlrepo