In [1]:
import sys
import os

# Directory added to sys.path so the project-local `services` package used by
# this notebook can be imported.
# The PROPAGANDA_ETL_PATH environment variable overrides the location so the
# notebook also runs on machines where the repository lives elsewhere; when it
# is unset, the original hard-coded path is used unchanged.
module_path = os.environ.get(
    'PROPAGANDA_ETL_PATH',
    '/home/kinfi4/python/Propaganda-Analyzer/src/ETL',
)
# Membership check keeps sys.path free of duplicates when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)

from services.text_preprocessor import TextPreprocessor

In [2]:
import pickle

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
# Column labels are passed explicitly to read_csv via `names`.
column_names = ['channel', 'text', 'date', 'type', 'sentiment']
df = pd.read_csv('../data/news-posts-truncated/data.csv', names=column_names)

# Project-local text preprocessor; reused by later cells for training data
# and for ad-hoc predictions.
processor = TextPreprocessor()

In [5]:
# Run the project's preprocess_and_lemmatize over every post.
# NOTE: this overwrites df['text'] in place, so re-running the cell feeds
# already-processed text through the preprocessor again; reload the CSV first
# if the cell has to be re-executed.
df['text'] = df['text'].apply(processor.preprocess_and_lemmatize)

# Fixed seed so the 80/20 split, and every score computed from it, is
# reproducible across kernel restarts.
RANDOM_STATE = 42

x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['sentiment'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [6]:
# The TF-IDF vocabulary and weights are learned from the training texts only;
# the test texts are then projected with the already-fitted vectorizer.
# Tuple elements are evaluated left to right, so the fit happens before the
# test transform.
vectorizer = TfidfVectorizer()
train_vectors, test_vectors = (
    vectorizer.fit_transform(x_train),
    vectorizer.transform(x_test),
)

In [22]:
# Persist the fitted vectorizer so the same vocabulary/weights can be reused
# outside this notebook.
# Create the output directory first so a fresh checkout does not fail on save.
os.makedirs('./trained-models', exist_ok=True)
# `with` guarantees the file is flushed and closed even if pickling raises.
with open('./trained-models/vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [7]:
# Sweep k = 2..11 for k-nearest-neighbours and keep the best-scoring model.
# NOTE(review): k is chosen using the same test split whose score is reported,
# so the "best" score is optimistic; a separate validation split or
# cross-validation would give an unbiased estimate.
best_result, best_k = 0, 0
best_knn_model = None

for k in range(2, 12):
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit on the sparse TF-IDF matrix directly: KNeighborsClassifier accepts
    # sparse input (it uses brute-force search), which matches the sparse
    # matrix used for scoring and avoids building a dense copy of the whole
    # training matrix on every iteration.
    knn.fit(train_vectors, y_train)

    evaluation_result = round(knn.score(test_vectors, y_test), 3)
    print(f'The score for KNN with {k=} is: {evaluation_result}')

    # Strict '>' on the rounded score means the smallest k wins ties.
    if best_knn_model is None or evaluation_result > best_result:
        best_result = evaluation_result
        best_k = k
        best_knn_model = knn

print('-' * 50)
print(f'The best result was gained with k={best_k} and is equal to: {best_result}')

The score for KNN with k=2 is: 0.722
The score for KNN with k=3 is: 0.685
The score for KNN with k=4 is: 0.685
The score for KNN with k=5 is: 0.667
The score for KNN with k=6 is: 0.685
The score for KNN with k=7 is: 0.667
The score for KNN with k=8 is: 0.63
The score for KNN with k=9 is: 0.593
The score for KNN with k=10 is: 0.63
The score for KNN with k=11 is: 0.63
--------------------------------------------------
The best result was gained with k=2 and is equal to: 0.722


In [8]:
# Support-vector classifier with default hyper-parameters, trained on the same
# TF-IDF features as the KNN models; fit() returns the estimator itself, so
# construction and training are chained.
svc = SVC().fit(train_vectors, y_train)

# Mean accuracy on the held-out split.
print(f'The score for SVM is: {svc.score(test_vectors, y_test)}')

The score for SVM is: 0.6666666666666666


In [19]:
# Smoke test: push one hand-written phrase through the same preprocessing and
# vectorization steps used for training, then classify it.
lemas = processor.preprocess_and_lemmatize("люди эвакуация")

# transform() expects an iterable of documents, hence the single-item list.
result = vectorizer.transform([lemas])

# Use the selected model rather than `knn`, which is only the leftover loop
# variable from the last sweep iteration (k=11) and is not the model that
# gets saved.
best_knn_model.predict(result)[0]

1

In [None]:
# Persist both trained classifiers next to the saved vectorizer.
# Create the output directory first so a fresh checkout does not fail on save.
os.makedirs('./trained-models', exist_ok=True)

# `with` guarantees each file is flushed and closed even if pickling raises.
with open('./trained-models/svc.sav', 'wb') as svc_file:
    pickle.dump(svc, svc_file)

with open('./trained-models/knn.sav', 'wb') as knn_file:
    pickle.dump(best_knn_model, knn_file)