## KNN (neutral, positive, negative)

### Not used in project

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

In [2]:
# Read the train and test splits from CSV files (paths are relative to the working directory)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Preprocessing function to clean text

# Cache for the stop-word set so the NLTK corpus is read once instead of once per token
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalize one headline before vectorization.

    Lowercases the text, replaces every non-word character (regex ``\\W``)
    with a space, tokenizes with ``word_tokenize`` and drops English stop words.

    Args:
        text: The raw headline string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _english_stopwords()
    text = re.sub(r'\W', ' ', text.lower())
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'Headline' column of both splits in place
for frame in (train_df, test_df):
    frame['Headline'] = frame['Headline'].apply(preprocess_text)

# Fixed seed so the undersampled training set (and the reported accuracy) is reproducible
RANDOM_STATE = 42

# Class counts, looked up by label. value_counts() orders its result by
# frequency, so unpacking it positionally only matches the variable names
# when the classes happen to be sized in exactly that order.
class_counts = train_df['Sentiment'].value_counts()
count_neutral = int(class_counts['NEUTRAL'])
count_positive = int(class_counts['POSITIVE'])
count_negative = int(class_counts['NEGATIVE'])
print(count_negative)

# Divide by class
dataset_neutral = train_df[train_df['Sentiment'] == 'NEUTRAL']
dataset_positive = train_df[train_df['Sentiment'] == 'POSITIVE']
dataset_negative = train_df[train_df['Sentiment'] == 'NEGATIVE']

# Undersample NEUTRAL and POSITIVE down to the NEGATIVE class size.
# NOTE(review): assumes NEGATIVE is the smallest class; sample() raises a
# ValueError if asked for more rows than a class contains.
dataset_neutral_under = dataset_neutral.sample(n=count_negative, random_state=RANDOM_STATE)
dataset_positive_under = dataset_positive.sample(n=count_negative, random_state=RANDOM_STATE)

# Balanced training set: equal number of rows per class
dataset_balance = pd.concat([dataset_neutral_under, dataset_positive_under, dataset_negative], axis=0)

# Bag-of-words features: the vocabulary is learned from the balanced training
# headlines only and then reused to encode the test headlines
vectorizer = CountVectorizer()
train_headlines, test_headlines = dataset_balance['Headline'], test_df['Headline']
X_train = vectorizer.fit_transform(train_headlines)
X_test = vectorizer.transform(test_headlines)

# Target labels for each split
y_train, y_test = dataset_balance['Sentiment'], test_df['Sentiment']

# k-nearest-neighbours model; n_neighbors is a tunable hyperparameter (5 is used here).
# fit() returns the estimator itself, so construction and training are chained.
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the test headlines and report the accuracy
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


4709
Accuracy: 0.3551784669397308


In [3]:
# NOTE(review): bare literal with no surrounding code — presumably an accuracy pasted from an earlier run; confirm or remove
0.3165593914569924

0.3165593914569924

## KNN (positive, negative)

In [4]:
# Read the positive/negative train and test splits from CSV files (paths are relative to the working directory)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train_pos_neg.csv', 'test_pos_neg.csv'))

# Preprocessing function to clean text

# Cache for the stop-word set so the NLTK corpus is read once instead of once per token
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalize one headline before vectorization.

    Lowercases the text, replaces every non-word character (regex ``\\W``)
    with a space, tokenizes with ``word_tokenize`` and drops English stop words.

    Args:
        text: The raw headline string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _english_stopwords()
    text = re.sub(r'\W', ' ', text.lower())
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'Headline' column of both splits in place
for frame in (train_df, test_df):
    frame['Headline'] = frame['Headline'].apply(preprocess_text)

# Fixed seed so the undersampled training set (and the reported accuracy) is reproducible
RANDOM_STATE = 42

# Class counts, looked up by label. value_counts() orders its result by
# frequency, so unpacking it positionally only matches the variable names
# when POSITIVE happens to be the more frequent class.
class_counts = train_df['Sentiment'].value_counts()
count_positive = int(class_counts['POSITIVE'])
count_negative = int(class_counts['NEGATIVE'])
print(count_negative)

# Divide by class
dataset_positive = train_df[train_df['Sentiment'] == 'POSITIVE']
dataset_negative = train_df[train_df['Sentiment'] == 'NEGATIVE']

# Undersample POSITIVE down to the NEGATIVE class size.
# NOTE(review): assumes NEGATIVE is the smaller class; sample() raises a
# ValueError if asked for more rows than the class contains.
dataset_positive_under = dataset_positive.sample(n=count_negative, random_state=RANDOM_STATE)

# Balanced training set: equal number of rows per class
dataset_balance = pd.concat([dataset_positive_under, dataset_negative], axis=0)

# Bag-of-words features: the vocabulary is learned from the balanced training
# headlines only and then reused to encode the test headlines
vectorizer = CountVectorizer()
train_headlines, test_headlines = dataset_balance['Headline'], test_df['Headline']
X_train = vectorizer.fit_transform(train_headlines)
X_test = vectorizer.transform(test_headlines)

# Target labels for each split
y_train, y_test = dataset_balance['Sentiment'], test_df['Sentiment']

# k-nearest-neighbours model; n_neighbors is a tunable hyperparameter (5 is used here).
# fit() returns the estimator itself, so construction and training are chained.
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the test headlines and report the accuracy
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


4709
Accuracy: 0.5045492142266336


In [5]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Classification report (per-class precision / recall / F1) for the KNN predictions.
# The features are bag-of-words counts from CountVectorizer, not TF-IDF.
knn_report = classification_report(y_test, y_pred)
print(knn_report)

              precision    recall  f1-score   support

    NEGATIVE       0.50      0.23      0.32       601
    POSITIVE       0.50      0.77      0.61       608

    accuracy                           0.50      1209
   macro avg       0.50      0.50      0.46      1209
weighted avg       0.50      0.50      0.46      1209



## KNN (neutral, opinionated)

In [6]:
# Read the neutral/opinionated train and test splits from CSV files (paths are relative to the working directory)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train_neu_opi.csv', 'test_neu_opi.csv'))

# Preprocessing function to clean text

# Cache for the stop-word set so the NLTK corpus is read once instead of once per token
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalize one headline before vectorization.

    Lowercases the text, replaces every non-word character (regex ``\\W``)
    with a space, tokenizes with ``word_tokenize`` and drops English stop words.

    Args:
        text: The raw headline string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _english_stopwords()
    text = re.sub(r'\W', ' ', text.lower())
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'Headline' column of both splits in place
for frame in (train_df, test_df):
    frame['Headline'] = frame['Headline'].apply(preprocess_text)

# Fixed seed so the undersampled training set (and the reported accuracy) is reproducible
RANDOM_STATE = 42

# Class counts, looked up by label. value_counts() orders its result by
# frequency, so unpacking it positionally only matches the variable names
# when OPINIONATED happens to be the more frequent class.
class_counts = train_df['Sentiment'].value_counts()
count_opinionated = int(class_counts['OPINIONATED'])
count_neutral = int(class_counts['NEUTRAL'])
print(count_neutral)

# Divide by class
dataset_opinionated = train_df[train_df['Sentiment'] == 'OPINIONATED']
dataset_neutral = train_df[train_df['Sentiment'] == 'NEUTRAL']

# Undersample OPINIONATED down to the NEUTRAL class size.
# NOTE(review): assumes NEUTRAL is the smaller class; sample() raises a
# ValueError if asked for more rows than the class contains.
dataset_opinionated_under = dataset_opinionated.sample(n=count_neutral, random_state=RANDOM_STATE)

# Balanced training set: equal number of rows per class
dataset_balance = pd.concat([dataset_opinionated_under, dataset_neutral], axis=0)

# Bag-of-words features: the vocabulary is learned from the balanced training
# headlines only and then reused to encode the test headlines
vectorizer = CountVectorizer()
train_headlines, test_headlines = dataset_balance['Headline'], test_df['Headline']
X_train = vectorizer.fit_transform(train_headlines)
X_test = vectorizer.transform(test_headlines)

# Target labels for each split
y_train, y_test = dataset_balance['Sentiment'], test_df['Sentiment']

# k-nearest-neighbours model; n_neighbors is a tunable hyperparameter (5 is used here).
# fit() returns the estimator itself, so construction and training are chained.
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the test headlines and report the accuracy
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


10617
Accuracy: 0.41720304271503805


In [7]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Classification report (per-class precision / recall / F1) for the KNN predictions.
# The features are bag-of-words counts from CountVectorizer, not TF-IDF.
knn_report = classification_report(y_test, y_pred)
print(knn_report)

              precision    recall  f1-score   support

     NEUTRAL       0.32      0.85      0.46       500
 OPINIONATED       0.79      0.24      0.37      1209

    accuracy                           0.42      1709
   macro avg       0.55      0.54      0.41      1709
weighted avg       0.65      0.42      0.39      1709

