In [None]:
# Libraries importing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import sklearn
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from tqdm.notebook import tqdm
from sklearn import metrics
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import warnings

# Ignore warnings
# NOTE(review): this blanket filter silences every warning raised by later
# code as well; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Data importing
# The input location is kept in one named constant so it is easy to repoint.
DATA_PATH = r'C:\Users\Vamshi Krishna\Downloads\pd_speech_features.csv\pd_speech_features.csv'
df = pd.read_csv(DATA_PATH)

# Data exploration
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# Data cleaning
# Collapse all rows sharing an 'id' into one row holding the per-column mean.
# The 'id' key becomes the index after the groupby and is discarded by
# reset_index(drop=True).
df = df.groupby('id').mean().reset_index(drop=True)

# Remove highly correlated features
# A feature is kept only if its Pearson correlation with every feature kept
# before it is at most CORR_THRESHOLD. The 'class' target is never part of the
# comparison, so it can neither be dropped nor cause a feature to be dropped.
# NOTE(review): the comparison is signed, so strongly *negative* correlations
# (< -CORR_THRESHOLD) are not treated as redundant — confirm this is intended.
CORR_THRESHOLD = 0.7

feature_cols = [name for name in df.columns if name != 'class']

# One vectorised pairwise-correlation computation instead of one Series.corr
# call per column pair.
corr_matrix = df[feature_cols].corr().to_numpy()

# kept[i] is True once feature i has been accepted.
kept = np.zeros(len(feature_cols), dtype=bool)
for idx, corr_row in enumerate(corr_matrix):
    # NaN correlations (e.g. from a constant column) compare as False, so such
    # a feature is never considered redundant.
    if (corr_row[kept] > CORR_THRESHOLD).any():
        continue
    kept[idx] = True

# Surviving features in their original order, followed by the target.
columns = [name for name, keep in zip(feature_cols, kept) if keep] + ['class']
df = df[columns]

print(df.shape)

# Feature selection
# Keep the (at most) 30 features with the highest chi-squared score against
# the 'class' target.
# NOTE(review): the scaler and selector are fitted on the full dataset, before
# the train/validation split further down, so validation rows influence which
# features are chosen — consider fitting them on the training split only.
X = df.drop('class', axis=1)

# The features are rescaled with MinMaxScaler before being passed to chi2;
# the scaled matrix is used for scoring only.
X_norm = MinMaxScaler().fit_transform(X)

# Cap k at the number of available features so the selector still works when
# fewer than 30 features survived the earlier filtering.
selector = SelectKBest(chi2, k=min(30, X.shape[1]))
selector.fit(X_norm, df['class'])

# Boolean mask over X's columns marking the selected features.
filtered_columns = selector.get_support()

# Take the selected columns from the *unscaled* data; copy explicitly so that
# adding the target column below does not write into a slice of X.
filtered_data = X.loc[:, filtered_columns].copy()
filtered_data['class'] = df['class']
df = filtered_data
print(df.shape)

# Data visualization
# Pie chart of how many rows fall into each 'class' value, labelled with the
# class value and its percentage share.
x = df['class'].value_counts()
class_labels = x.index
class_sizes = x.values
plt.pie(class_sizes, labels=class_labels, autopct='%1.1f%%')
plt.show()

# Splitting data into training and validation sets
features = df.drop('class', axis=1)
target = df['class']

# Hold out 20% of the rows for validation. The split is stratified on the
# target so both splits keep the overall class proportions; this prevents the
# validation set from ending up with a single class, which would make ROC AUC
# undefined.
X_train, X_val, Y_train, Y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=10,
    stratify=target,
)

# Balancing data using RandomOverSampler
# Only the training split is resampled; the validation split is left as is.
ros = RandomOverSampler(sampling_strategy='minority', random_state=0)
X, Y = ros.fit_resample(X_train, Y_train)

# Model training and evaluation
# Each model is fitted on the oversampled training data (X, Y) and scored with
# ROC AUC on both that data and the validation split.
# SVC's probability estimates depend on random shuffling, so it is seeded like
# the other random steps in this script to keep the reported scores repeatable.
# NOTE(review): the models receive the features unscaled — confirm that is
# intended for LogisticRegression and the RBF-kernel SVC.
models = [
    LogisticRegression(),
    XGBClassifier(),
    SVC(kernel='rbf', probability=True, random_state=0),
]

for model in models:
    model.fit(X, Y)

    # Training ROC AUC, computed from the probability of the second class
    # (column 1 of predict_proba).
    train_preds = model.predict_proba(X)[:, 1]
    print(f'{model}:')
    print('Training ROC AUC Score:', metrics.roc_auc_score(Y, train_preds))

    # Validation ROC AUC on the held-out, non-resampled rows.
    val_preds = model.predict_proba(X_val)[:, 1]
    print('Validation ROC AUC Score:', metrics.roc_auc_score(Y_val, val_preds))
    print()

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the first entry in `models`, evaluated on the
# validation split.
val_labels = models[0].predict(X_val)
cm = confusion_matrix(Y_val, val_labels)

# Draw the matrix as an annotated heatmap with readable tick labels.
predicted_ticks = ['Predicted Negative', 'Predicted Positive']
actual_ticks = ['Actual Negative', 'Actual Positive']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    cmap='Blues',
    fmt='g',
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
