In [81]:

import pandas as pd
import numpy as np

In [82]:
# Load the dataset
def load_csv(file_path):
    return pd.read_csv(file_path)

In [83]:
# Function to clean the text data
def clean_text(text):
    return text.replace(',', '').replace('.', '').lower()

In [84]:
# Function to split data into training and testing sets
def split_train_test(data, test_size):
    mask = np.random.rand(len(data)) < (1 - test_size)
    X_train = data[mask]
    X_test = data[~mask]
    y_train = X_train['Label'].values  # Use .values for correct indexing
    y_test = X_test['Label'].values      # Use .values for correct indexing
    return X_train['Text'], y_train, X_test['Text'], y_test

In [85]:
# Function to get word frequency from the training data
def get_words_frequency(X_train):
    words = ' '.join(X_train).split()
    bags = set(words)
    word_count = {word: words.count(word) for word in bags}
    return word_count, bags

In [86]:
# Function to transform the text data into a frequency vector
def transform(X, bags):
    vectors = []
    for text in X:
        vector = [text.split().count(word) for word in bags]
        vectors.append(vector)
    return np.array(vectors)

In [87]:
# KNN Implementation
class KNNText:
    def __init__(self, k=3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for test_point in X:
            distances = np.linalg.norm(self.X_train - test_point, axis=1)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = [self.y_train[i] for i in k_indices]
            predictions.append(max(set(k_nearest_labels), key=k_nearest_labels.count))
        return np.array(predictions)

In [88]:
# Load and clean the data
data = load_csv('/content/Education.csv')
data['Text'] = data['Text'].apply(clean_text)

In [89]:
# Split the data into train and test sets
X_train, y_train, X_test, y_test = split_train_test(data, 0.25)

In [90]:
# Get word frequencies and bags
words_train_fre, bags = get_words_frequency(X_train)

In [91]:
# Transform the train and test data
words_train_fre = transform(X_train, bags)
words_test_fre = transform(X_test, bags)

In [92]:
# Initialize and fit KNN
knn = KNNText(k=3)
knn.fit(words_train_fre, y_train)

# Make predictions
predictions = knn.predict(words_test_fre)

# Create a DataFrame for predictions
pred_df = pd.DataFrame(predictions, columns=['Predict'])
pred_df.index = range(1, len(pred_df) + 1)

# Prepare the actual labels DataFrame, ensuring the index aligns properly
y_test_df = pd.DataFrame(y_test, columns=['Actual'])
y_test_df.index = range(1, len(y_test_df) + 1)

# Concatenate predictions and actual labels
result = pd.concat([pred_df, y_test_df], axis=1)

# Display the result
print(result)

     Predict    Actual
1   negative  positive
2   positive  positive
3   positive  positive
4   negative  negative
5   positive  negative
6   negative  negative
7   positive  positive
8   positive  positive
9   negative  negative
10  positive  negative
11  positive  negative
12  positive  positive
13  positive  negative


     Predict    Actual
1   negative  negative
2   positive  positive
3   positive  negative
4   positive  positive
5   positive  positive
6   positive  positive
7   negative  positive
8   negative  negative
9   negative  negative
10  positive  positive
11  negative  positive
12  positive  positive
13  positive  negative
14  negative  negative
15  positive  negative
