In [1]:
# Required Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier




In [2]:
# Read the dating-app reviews CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='DatingAppReviewsDataset.csv')



In [3]:
# Turn the free-text 'Review' column into bag-of-words count features.
# Missing reviews become empty strings so every row can be tokenized.
data['Review'] = data['Review'].fillna('')

# 'Rating' will be our target variable
y = data['Rating']

# Split the RAW text first (same test_size / random_state as before), so that
# the vectorizer below never sees the held-out reviews.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    data['Review'], y, test_size=0.2, random_state=1
)

# Learn the vocabulary from the training reviews only, then reuse it for the
# test reviews. Fitting on the full corpus would leak test-set vocabulary
# into the feature space used for training.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

# Full-dataset feature matrix, kept for any later cell that still refers to X.
# It is encoded with the training vocabulary.
X = vectorizer.transform(data['Review'])




In [4]:
# List of (display name, estimator) pairs evaluated below.
# Estimators with internal randomness get a fixed seed so that a
# Restart & Run All reproduces the same scores.
models = [
    ('Decision Tree', DecisionTreeClassifier(random_state=1)),
    # probability=True is required for predict_proba on an SVC
    ('Support Vector Classifier', SVC(probability=True, random_state=1)),
    # the default max_iter=100 can stop before convergence on wide count features
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('KNN', KNeighborsClassifier()),
]



In [6]:
import numpy as np

# Show which rating labels occur in the training and test targets
for split_labels in (y_train, y_test):
    print(np.unique(split_labels))

[0 1 2 3 4 5]
[1 2 3 4 5]


In [None]:
# Loop through models, train, predict, and get performance.
# 'Rating' has more than two classes; calling roc_curve / roc_auc_score with
# the raw labels and a single probability column raises ValueError for a
# multi-class target. Each model is therefore summarised by a micro-averaged
# one-vs-rest ROC, where every (sample, class) pair counts as one binary
# decision. The "area" in the legend is that micro-averaged AUC.
for name, model in models:
    model.fit(X_train, y_train)  # Train model
    # Class-membership probabilities; columns are ordered like model.classes_
    y_pred_prob = model.predict_proba(X_test)
    # One-hot encode y_test using the same column order as the probabilities
    y_test_onehot = (np.asarray(y_test)[:, None] == model.classes_[None, :]).astype(int)
    # Flatten both matrices so the binary ROC utilities can be applied
    y_true_flat = y_test_onehot.ravel()
    y_score_flat = y_pred_prob.ravel()
    fpr, tpr, _ = roc_curve(y_true_flat, y_score_flat)  # Get ROC curve
    roc_auc = roc_auc_score(y_true_flat, y_score_flat)  # Get ROC AUC
    plt.plot(fpr, tpr, label=f'{name} (area = {roc_auc:.2f})')  # Plot ROC curve



In [None]:
# Decorate the current ROC axes: chance diagonal, limits, labels, legend
ax = plt.gca()
ax.plot([0, 1], [0, 1], 'k--')  # reference line for a random classifier
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
import numpy as np
from collections import Counter

# Euclidean (L2) distance helper
def euclidean_distance(x1, x2):
    """Return the square root of the summed squared differences of x1 and x2."""
    delta = x1 - x2
    squared = delta ** 2
    return np.sqrt(np.sum(squared))

class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and a majority vote.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k

    def fit(self, X, y):
        """Store the training samples and their labels.

        Both inputs are converted to NumPy arrays so that neighbours are later
        looked up by POSITION. A pandas Series with a shuffled index (such as
        the output of ``train_test_split``) would otherwise be indexed by
        label and return the wrong rows or raise ``KeyError``.

        Raises
        ------
        ValueError
            If the training set is empty or X and y differ in length.
        """
        # NOTE(review): scipy sparse matrices (e.g. CountVectorizer output) are
        # not handled here; densify before calling fit — confirm with callers.
        features = np.asarray(X, dtype=float)
        labels = np.asarray(y)
        if features.shape[0] == 0:
            raise ValueError("cannot fit KNN on an empty training set")
        if features.shape[0] != labels.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        self.X_train = features
        self.y_train = labels

    def predict(self, X):
        """Return an array with one predicted label per sample in X."""
        predicted_labels = [self._predict(x) for x in np.asarray(X, dtype=float)]
        return np.array(predicted_labels)

    def _predict(self, x):
        """Predict the label of a single sample by majority vote of its k nearest neighbours."""
        # Distances to all training samples at once; each sample is flattened
        # so the reduction always runs over that sample's own features.
        diffs = (self.X_train - x).reshape(self.X_train.shape[0], -1)
        distances = np.sqrt(np.sum(diffs ** 2, axis=1))
        # Stable sort: equally distant samples keep their training order
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_nearest_labels = self.y_train[k_indices].tolist()
        # Majority vote; on a tie Counter keeps the label seen first (the nearer one)
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]


In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from itertools import cycle

# Class labels come from the training target instead of a hard-coded list
classes = np.unique(y_train)

# Binarize the output: one indicator column per class
y_train_bin = label_binarize(y_train, classes=classes)
y_test_bin = label_binarize(y_test, classes=classes)

n_classes = y_train_bin.shape[1]

# Learn to predict each class against the others using OneVsRestClassifier.
# The base estimator is created here explicitly rather than reusing whatever
# `model` was left over from an earlier loop; it must expose
# decision_function, which LogisticRegression does.
base_estimator = LogisticRegression(max_iter=1000)
classifier = OneVsRestClassifier(base_estimator)
y_score = classifier.fit(X_train, y_train_bin).decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    n_positive = y_test_bin[:, i].sum()
    # A ROC curve needs both positive and negative test samples; skip classes
    # that never (or always) occur in y_test instead of plotting NaN curves.
    if n_positive == 0 or n_positive == y_test_bin.shape[0]:
        continue
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot the ROC curve for each class that could be evaluated
fig, ax = plt.subplots()
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red', 'purple'])
for i, color in zip(range(n_classes), colors):
    if i not in roc_auc:
        continue
    ax.plot(fpr[i], tpr[i], color=color,
            label='ROC curve of class {0} (area = {1:0.2f})'.format(classes[i], roc_auc[i]))

ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic for Multi-Class Data')
ax.legend(loc="lower right")
plt.show()
