In [15]:
import pickle
import sys

import joblib
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_predict, train_test_split, KFold
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Make the project's ETL package importable from this notebook so that
# `services.text_preprocessor` can be resolved by the import below.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not find the package on another checkout unless the path is adjusted.
module_path = '/home/kinfi4/python/Propaganda-Analyzer/src/ETL'
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

from services.text_preprocessor import TextPreprocessor

## Data preprocessing

In [16]:
# Load the labelled news dataset. Column names are supplied explicitly via
# `names`, so pandas does not take them from the file itself.
column_names = ['channel', 'text', 'date', 'type', 'sent']
df = pd.read_csv('../../data/training-data/news-for-training.csv', names=column_names)

# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,channel,text,date,type,sent
0,раньше всех. ну почти.,президент эстонии алар карис признал удастся п...,2022-05-17 18:20:01,economic,-1
1,раньше всех. ну почти.,россияне 24 февраля стали тратить раза новости...,2022-05-17 18:11:38,political,-1
2,раньше всех. ну почти.,суд приговорил эксполковника захарченко совоку...,2022-05-17 17:53:12,shelling,-1
3,раньше всех. ну почти.,евросоюз допустит украине закончилось оружие в...,2022-05-17 17:44:46,political,-1
4,раньше всех. ну почти.,сша активно привлекают участия боевых действия...,2022-05-17 17:43:05,political,-1


In [17]:
# Class balance check: number of samples per news type.
type_counts = df['type'].value_counts()
type_counts

political       235
shelling        208
economic        197
humanitarian    123
Name: type, dtype: int64

In [18]:
# Run every message through the project's preprocessing + lemmatization step
# and overwrite the raw text column with the result.
# NOTE(review): this mutates `df['text']` in place, so re-running the cell
# feeds already-processed text back into the preprocessor.
preprocessor = TextPreprocessor()
lemmatized_text = df['text'].apply(preprocessor.preprocess_and_lemmatize)
df['text'] = lemmatized_text

In [19]:
# Learn the TF-IDF vocabulary/weights from the preprocessed texts and turn the
# whole corpus into the feature matrix used by every model below.
vectorizer = TfidfVectorizer()
corpus = df['text']
train_vectors = vectorizer.fit_transform(corpus)

In [20]:
# Persist the fitted TF-IDF vectorizer so the same vocabulary can be reused
# when the saved classifiers are applied to new text.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open('../trained-models/vectorizer.pk', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [21]:
# Encode the string news types as integer codes.
# `types` is a (codes, uniques) pair: types[0] holds one integer per row and
# types[1] holds the label each integer stands for.
# NOTE(review): no cell visible in this notebook saves types[1], so the
# code -> label mapping has to be reconstructed when the models are used.
type_column = df['type']
types = pd.factorize(type_column)

# Show the label order, i.e. which label each integer code maps to.
types[1]

Index(['economic', 'political', 'shelling', 'humanitarian'], dtype='object')

In [22]:
# Replace the string labels with their integer codes; this column is the
# target `y` for the classifiers trained below.
type_codes = types[0]
df['type'] = type_codes

In [23]:
# 5-fold cross-validation splitter shared by all models in this notebook.
# A fixed `random_state` makes the shuffled folds reproducible across runs and
# identical on every `split` call, so the different models (and different
# hyper-parameters) are scored on the same train/test partitions.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

## SVC Model

In [24]:
# Linear-kernel SVM evaluated with the shared k-fold splitter.
# (Use SVC(kernel='rbf') here instead to try the RBF variant.)
svc = SVC(kernel='linear')

# `return_estimator=True` keeps the per-fold fitted models in `results`,
# which the next cell reads.
results = cross_validate(svc, train_vectors, df['type'], cv=kfold, return_estimator=True)

mean_test_score = results["test_score"].mean()
print(f'The mean test score is: {mean_test_score}')

The mean test score is: 0.8086429308565531


In [25]:
# Train the model that will be saved on the *entire* dataset.
# Picking the per-fold estimator with the highest held-out score would keep a
# model that saw only 4/5 of the data and was chosen on fold-to-fold noise;
# cross-validation is used above purely to estimate performance.
# `fit` returns the estimator itself, so `svc` stays bound to a fitted SVC.
svc = svc.fit(train_vectors, df['type'])

In [26]:
# Save the selected SVC to disk; the call's return value (the list of written
# file names) is left as the cell output.
svc_model_path = '../trained-models/svc-news-type-prediction.sav'
joblib.dump(svc, svc_model_path)

['../trained-models/svc-news-type-prediction.sav']

## KNN Model

In [27]:
# Search n_neighbors in [2, 9] for the KNN classifier and keep the best k.
best_knn_model = None
best_knn_score, best_knn_k = 0, 0

# Materialise the folds once so every k is scored on exactly the same
# train/test partitions. Asking a shuffling splitter without a fixed
# random_state for new splits per k would reshuffle each time, and the
# scores for different k would not be comparable.
knn_folds = list(kfold.split(train_vectors))

for k in range(2, 10):
    knn_model = KNeighborsClassifier(n_neighbors=k)

    results = cross_validate(
        knn_model,
        X=train_vectors,
        y=df['type'],
        cv=knn_folds,
    )

    # Scores are compared at 4-decimal precision, matching what is printed.
    mean_score = round(results['test_score'].mean(), 4)

    print(f'KNN model with {k=} has score: {mean_score}')

    if mean_score > best_knn_score:
        best_knn_score = mean_score
        best_knn_k = k

# Refit the winning configuration on the full dataset instead of keeping the
# single fold estimator with the highest held-out score (that one is trained
# on 4/5 of the data and selected on fold noise).
# `best_knn_k` stays 0 only if no k achieved a positive score; in that case
# `best_knn_model` remains None, as before.
if best_knn_k:
    best_knn_model = KNeighborsClassifier(n_neighbors=best_knn_k).fit(train_vectors, df['type'])

print('-' * 30)
print(f'The best KNN model was trained with k={best_knn_k} with score: {best_knn_score}')

KNN model with k=2 has score: 0.734
KNN model with k=3 has score: 0.7431
KNN model with k=4 has score: 0.7418
KNN model with k=5 has score: 0.7654
KNN model with k=6 has score: 0.7471
KNN model with k=7 has score: 0.751
KNN model with k=8 has score: 0.7353
KNN model with k=9 has score: 0.7523
------------------------------
The best KNN model was trained with k=5 with score: 0.7654


In [29]:
# Save the best KNN model. Passing the path (rather than an inline `open(...)`
# whose handle was never closed) lets joblib open, write and close the file
# itself, and matches how the SVC model is saved in this notebook.
joblib.dump(best_knn_model, '../trained-models/knn-news-type-prediction.sav')