In [7]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('sentimentdataset.csv')

# Drop unnecessary columns
data = data.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1)

# Preprocess the data
# Encode categorical variables
label_encoder = LabelEncoder()
data['Platform'] = label_encoder.fit_transform(data['Platform'])
data['Country'] = label_encoder.fit_transform(data['Country'])

# Separate the features (X) and target variable (y)
X = data[['Platform', 'Retweets', 'Likes', 'Country', 'Month', 'Day', 'Hour']]
y = data['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Combine features and target variable for training and testing
train_data = list(zip(y_train, X_train.values.tolist()))
test_data = list(zip(y_test, X_test.values.tolist()))

# Euclidean distance calculation
def euclidean_distance(v1, v2):
    squared_diff = sum((x1 - x2) ** 2 for x1, x2 in zip(v1, v2))
    return math.sqrt(squared_diff)

# KNN algorithm with weighted voting
def knn(train_data, test_sample, k):
    distances = [(euclidean_distance(sample[1], test_sample[1]), sample[0]) for sample in train_data]
    distances.sort()
    k_neighbors = distances[:k]

    class_counts = {}
    for distance, label in k_neighbors:
        weight = 1 / (distance + 1e-10)  # Avoid division by zero
        class_counts[label] = class_counts.get(label, 0) + weight

    predicted_class = max(class_counts.items(), key=lambda x: x[1])[0]
    return predicted_class

# Choose the value of k
k = 18
print(f"K= {k}")

num_correct = 0
num_misclassified = 0

for test_sample in test_data:
    desired_class = test_sample[0]
    computed_class = knn(train_data, test_sample, k)
    print(f"Desired class: {desired_class}, Computed class: {computed_class}")

    if desired_class == computed_class:
        num_correct += 1
    else:
        num_misclassified += 1

accuracy = num_correct / len(test_data) * 100
print(f"Accuracy rate: {accuracy:.1f}%")
print(f"Number of misclassified test samples: {num_misclassified}")
print(f"Total number of test samples: {len(test_data)}")

K= 18
Desired class:  Curiosity , Computed class:  Joy 
Desired class:  Positive  , Computed class:  Positive  
Desired class:  Fearful , Computed class:  Hopeful 
Desired class:  Heartbreak , Computed class:  Confident 
Desired class:  Joy , Computed class:  Excitement 
Desired class:  Miscalculation , Computed class:  Playful 
Desired class:  Positive  , Computed class:  Positive  
Desired class:  Reflection    , Computed class:  Wonderment    
Desired class:  Happy , Computed class:  Joy 
Desired class:  Joy , Computed class:  Positive  
Desired class:  Sorrow      , Computed class:  Inspiration 
Desired class:  Anticipation  , Computed class:  Determination 
Desired class:  Grief           , Computed class:  Envy            
Desired class:  Hate , Computed class:  Joy 
Desired class:  Acceptance   , Computed class:  Positive  
Desired class:  Sadness      , Computed class:  Positive  
Desired class:  Excitement   , Computed class:  Joy 
Desired class:  Euphoria     , Computed class