In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV

In [2]:
df = pd.read_csv(f"C:\Programowanie\codecademy_projects\data_sets_codecademy\WineQT.csv")
print(df.columns)
y = df['quality']
features = df.drop(columns = ['quality'])

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'Id'],
      dtype='object')


In [3]:
scaler = StandardScaler().fit(features)

X = scaler.transform(features)
print(X)

[[-0.52157961  0.93933222 -1.36502663 ... -0.57365783 -0.96338181
  -1.73561799]
 [-0.29259344  1.94181282 -1.36502663 ...  0.1308811  -0.59360107
  -1.73346186]
 [-0.29259344  1.27349242 -1.16156762 ... -0.04525363 -0.59360107
  -1.73130573]
 ...
 [-1.20853813  0.38239855 -0.9581086  ... -0.45623467  0.05351522
   1.70125196]
 [-1.38027776  0.10393172 -0.8563791  ...  0.60057372  0.70063152
   1.70340809]
 [-1.38027776  0.6330187  -0.75464959 ...  0.30701583 -0.22382033
   1.70772035]]


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

In [11]:
clf_no_reg = LogisticRegression()

clf_no_reg.fit(X, y)
y_pred = clf_no_reg.predict(X_test)
print(f1_score(y_test, y_pred, average=None))

[0.         0.         0.64       0.54545455 0.4        0.        ]


In [None]:
#Evaluate the model above
#The F1 score is 0.0. This is a very low score and indicates that the model is not performing well.

In [12]:
predictors = features.columns
coefficients = clf_no_reg.coef_.ravel()
coef = pd.Series(coefficients, predictors).sort_values()
coef.plot(kind='bar', title = 'Coefficients (no regularization)')
plt.tight_layout()
plt.show()
plt.clf()

ValueError: Length of values (72) does not match length of index (12)

In [None]:
y_pred_train = clf_no_reg.predict(X_train)
y_pred_test = clf_no_reg.predict(X_test)
print('Training score', f1_score(y_train ,y_pred_train))
print('Test score', f1_score(y_test, y_pred_test))

In [None]:
clf_default = LogisticRegression()
clf_default.fit(X_train, y_train)


## 7. Ridge Scores
y_pred_train_d = clf_default.predict(X_train)
y_pred_test_d = clf_default.predict(X_test)

print('Training score_d', f1_score(y_train ,y_pred_train_d))
print('Test score_d', f1_score(y_test, y_pred_test_d))

In [None]:
training_array = []
test_array = []
C_array = [0.0001, 0.001, 0.01, 0.1, 1]


In [None]:
for i in C_array:
  model = LogisticRegression(C = i)
  model.fit(X_train, y_train)
  y_pred_train_model = model.predict(X_train)
  y_pred_test_model = model.predict(X_test)
  f1_train = f1_score(y_train, y_pred_train_model)
  f1_test = f1_score(y_test, y_pred_test_model)
  training_array.append(f1_train)
  test_array.append(f1_test)

print(test_array)

In [None]:
plt.plot(C_array,training_array)
plt.plot(C_array,test_array)
plt.xscale('log')
plt.show()
plt.clf()


In [None]:
C_array_log = np.logspace(-4, -2, 100)
C = {'C': C_array_log}


In [None]:
model_grid = LogisticRegression()

grid_search = GridSearchCV(
  model_grid,
  param_grid = C,
  scoring = 'f1',
  cv = 5)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
clf_best = LogisticRegression(C = grid_search.best_params_['C'])
clf_best.fit(X_train,y_train)
y_pred_best = clf_best.predict(X_test)
print(f1_score(y_test,y_pred_best))


In [None]:
clf_l1 = LogisticRegressionCV(Cs=C_array_log, cv=5, penalty='l1', solver='liblinear', scoring='f1')

clf_l1.fit(X, y)

print(clf_l1.coef_)
print(clf_l1.C_)


In [None]:
coefficients = clf_l1.coef_.ravel()
coef = pd.Series(coefficients,predictors).sort_values()

plt.figure(figsize = (12,8))
coef.plot(kind='bar', title = 'Coefficients for tuned L1')
plt.tight_layout()
plt.show()
plt.clf()
