In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pointbiserialr
from scipy.stats import skew, kurtosis
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, classification_report, confusion_matrix
from imblearn.metrics import specificity_score



Wczytanie danych

In [2]:
# Read the two semicolon-delimited CSV files and tag each frame with its wine colour.
r_wine = pd.read_csv('data/winequality-red.csv', sep=';').assign(type='red')
w_wine = pd.read_csv('data/winequality-white.csv', sep=';').assign(type='white')

In [3]:
# Stack the red and white samples into one frame with a fresh 0..n-1 index.
wine_frames = [r_wine, w_wine]
data = pd.concat(wine_frames, ignore_index=True)
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [4]:
# Number of samples per quality score.
data['quality'].value_counts()

quality
6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: count, dtype: int64

OPIS I WSTĘPNA ANALIZA DANYCH

Typy danych w zbiorze

In [5]:
# Column dtypes of the combined frame.
data.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
type                     object
dtype: object

Rozmiar danych

In [6]:
# (rows, columns) of the combined frame.
data.shape

(6497, 13)

Statystyki opisowe dla zmiennych liczbowych

In [7]:
# Summary statistics restricted to the float-typed columns.
data_float_column = data.select_dtypes(include=[float]).columns
data.loc[:, data_float_column].describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,7.215307,0.339666,0.318633,5.443235,0.056034,30.525319,115.744574,0.994697,3.218501,0.531268,10.491801
std,1.296434,0.164636,0.145318,4.757804,0.035034,17.7494,56.521855,0.002999,0.160787,0.148806,1.192712
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9


In [None]:
# Bar chart of how many samples fall into each quality score.
# A title and an explicit plt.show() keep this cell consistent with the other
# plotting cells and stop the Axes repr from being echoed as cell output.
sns.countplot(x='quality', data=data)
plt.title('Liczność klas zmiennej quality')
plt.show()

Analiza Wpływu Zmiennych na Zmienną Prognozowaną (Y)


In [None]:
# Correlation heatmap (original author's note: not sure this is the right method).
# numeric_only=True skips the string-valued 'type' column, which
# DataFrame.corr() cannot convert to float.
plt.figure(figsize=(10,8))
sns.heatmap(data.corr(numeric_only=True), annot=True, fmt='.2f')
plt.show()

In [None]:
# Compute the point-biserial correlation between each feature and 'quality'.
# Only numeric columns are iterated: the string-valued 'type' column cannot be
# passed to pointbiserialr.
# NOTE(review): the original comment assumed 'quality' is a binary variable, but
# in this frame it takes several integer values - confirm the intended target.
numeric_features = data.select_dtypes(include='number').columns.drop('quality')
for column in numeric_features:
    pbc = pointbiserialr(data[column], data['quality'])
    print(f"Korelacja punktowo-biserialna między {column} a quality: {pbc.correlation:.3f}, p-wartość: {pbc.pvalue:.4f}")


Ocena zbalansowania zbioru danych

In [None]:
# Histogram of the target with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data['quality'], kde=True, ax=ax)
ax.set_title('Histogram zmiennej quality')
plt.show()

In [None]:
# Box plot of the target variable.
fig, ax = plt.subplots(figsize=(6, 7))
sns.boxplot(y=data['quality'], ax=ax)
ax.set_title('Boxplot zmiennej quality')
plt.show()

In [None]:
# Skewness and kurtosis of the target distribution.
quality_scores = data['quality']
print("Skośność: ", skew(quality_scores))
print("Kurtoza: ", kurtosis(quality_scores))

PRZYGOTOWANIE DANYCH

Wartości brakujące

In [None]:
# Count of missing values in each column.
data.isna().sum()

Sprawdzenie wartości odstających

In [None]:
# NOTE(review): disabled draft of the outlier box plots, kept for reference.
# It cannot be re-enabled as-is: `axs` is created as a 3x3 grid, so `axs[1]`
# and `axs[2]` below are whole rows of axes rather than a single Axes (only
# `axs[0][0]` addresses one subplot).
# fig, axs = plt.subplots(ncols=3, nrows = 3, figsize=(15,15))
# 
# sns.boxplot(ax=axs[0][0], y=data['fixed acidity'])
# axs[0][0].set_title('fixed acidity')
# 
# sns.boxplot(ax=axs[1], y=data['volatile acidity'])
# axs[1].set_title('volatile acidity')
# 
# sns.boxplot(ax=axs[2], y=data['citric acid'])
# axs[2].set_title('citric acid')
# 
# sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
# plt.tight_layout()
# plt.show()

PODZIAŁ DANYCH NA ZBIÓR UCZĄCY I TESTOWY


In [None]:
# Feature matrix: every column except the target. The string-valued 'type'
# column is one-hot encoded into a single 0/1 'type_white' column so that the
# numeric-only StandardScaler / SVC steps later in the notebook can consume it.
X = pd.get_dummies(data.drop('quality', axis=1), columns=['type'], drop_first=True, dtype=float)
# Target: the wine quality score.
y = data['quality']

In [None]:
# Split into training and test sets (80% / 20%, fixed seed for reproducibility).
# stratify=y keeps the class proportions of the heavily imbalanced 'quality'
# target the same in both splits, so rare scores are not left out of one of them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Keep copies of the splits: X_train / X_test are overwritten by the scaled
# arrays in the standardisation cell further down.
X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

Statystyki opisowe dla zbiorów

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# Summary statistics of the test features.
X_test.describe()


ALGORYTM SVM

Standaryzacja zmiennych

In [None]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
def svm_algorithm(kernel, average='macro'):
    """Train an SVC with the given kernel and print train/test metrics.

    Fits ``svm.SVC(kernel=kernel)`` on the notebook-level ``X_train`` /
    ``y_train`` and prints accuracy, recall (sensitivity) and specificity for
    the training split and for the ``X_test`` / ``y_test`` split.

    Parameters
    ----------
    kernel : str
        Kernel name forwarded to ``svm.SVC``.
    average : str, default 'macro'
        Averaging mode passed to ``recall_score`` and ``specificity_score``.
        It has to produce a single number per metric: the target has more
        than two classes, so the 'binary' default of ``specificity_score``
        raises, and ``average=None`` yields a per-class array that the
        fixed-width table below cannot format.

    Returns
    -------
    None
        The results are only printed.
    """
    model = svm.SVC(kernel=kernel)
    model.fit(X_train, y_train)

    # Same three metrics for both splits; the labels are padded to width 10 below.
    rows = []
    for label, features, target in (('Treningowy', X_train, y_train),
                                    ('Testowy', X_test, y_test)):
        predicted = model.predict(features)
        rows.append((
            label,
            accuracy_score(target, predicted),
            recall_score(target, predicted, average=average),
            specificity_score(target, predicted, average=average),
        ))

    measures = ['', 'Dokładność', 'Czułość', 'Specyficzność']
    print(f'Jądro: {kernel}')
    print(f'{measures[0]: <10} {measures[1]: <15} {measures[2]: <20} {measures[3]: <25}')
    for label, accuracy, recall, specificity in rows:
        print(f'{label: <10} {round(accuracy, 3): <15} {round(recall, 3): <20} {round(specificity, 3): <25}')
    print()

    
    

In [None]:
# Train and evaluate the SVM with the RBF kernel.
svm_algorithm('rbf')