In [1]:
import sys
import pickle

import joblib
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_predict, train_test_split, KFold
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

module_path = '/home/kinfi4/python/Propaganda-Analyzer/src/ETL'
if module_path not in sys.path:
    sys.path.append(module_path)

from services.domain.text_preprocessor import TextPreprocessor

## Data preprocessing

In [17]:
df = pd.read_csv(
    '../data/training-data/news-for-training.csv', 
    names=['channel', 'text', 'date', 'type', 'sent']
)
df.head()

Unnamed: 0,channel,text,date,type,sent
0,раньше всех. ну почти.,президент эстонии алар карис признал удастся п...,2022-05-17 18:20:01,economic,-1.0
1,раньше всех. ну почти.,россияне 24 февраля стали тратить раза новости...,2022-05-17 18:11:38,political,-1.0
2,раньше всех. ну почти.,суд приговорил эксполковника захарченко совоку...,2022-05-17 17:53:12,shelling,-1.0
3,раньше всех. ну почти.,евросоюз допустит украине закончилось оружие в...,2022-05-17 17:44:46,political,-1.0
4,раньше всех. ну почти.,сша активно привлекают участия боевых действия...,2022-05-17 17:43:05,political,-1.0


In [18]:
df['type'].value_counts()

political       579
shelling        376
economic        298
humanitarian    271
Name: type, dtype: int64

In [19]:
preprocessor = TextPreprocessor()
df['text'] = df['text'].apply(preprocessor.preprocess_and_lemmatize)

In [20]:
df['text'].head()

0    президент эстония алар карис признать удаться ...
1    россиянин февраль стать тратить раз новость ин...
2    суд приговорить эксполковник захарченко совоку...
3    евросоюз допустить украина закончиться оружие ...
4    сша активно привлекать участие боевой действие...
Name: text, dtype: object

In [21]:
vectorizer = TfidfVectorizer()

texts = df['text']
train_vectors = vectorizer.fit_transform(texts)  # тренерує наш об'єкт та повертає список векторів, що можна буде юзати для навчання

In [22]:
pickle.dump(vectorizer, open('./trained-models/vectorizer.pk', 'wb'))

In [23]:
df['type'].head(3)

0     economic
1    political
2     shelling
Name: type, dtype: object

In [24]:
types = pd.factorize(df['type'])
types[1]

Index(['economic', 'political', 'shelling', 'humanitarian'], dtype='object')

In [25]:
df['type'] = types[0]

In [26]:
df['type'].head(3)

0    0
1    1
2    2
Name: type, dtype: int64

In [28]:
kfold = KFold(n_splits=5, shuffle=True)

## SVC Model

In [31]:
svc = SVC(kernel='linear')
# svc = SVC(kernel='rbf')

results = cross_validate(svc, X=train_vectors, y=df['type'], cv=kfold, return_estimator=True)

print(f'The mean test score is: {results["test_score"].max()}')

The mean test score is: 0.7993421052631579


In [33]:
svc = results['estimator'][results['test_score'].argmax()]

In [34]:
joblib.dump(svc, open('./trained-models/svc-news-type-prediction.sav', 'wb'))

# Gaussian

In [35]:
nb = GaussianNB()

results = cross_validate(nb, X=train_vectors.toarray(), y=df['type'], cv=kfold, return_estimator=True)

print(f'The mean test score is: {results["test_score"].max()}')

The mean test score is: 0.760655737704918


In [36]:
nb = results['estimator'][results['test_score'].argmax()]

In [37]:
joblib.dump(nb, open('./trained-models/nb-news-type-prediction.sav', 'wb'))

# Decision tree

In [46]:
tree = DecisionTreeClassifier(max_depth=20)

results = cross_validate(tree, X=train_vectors.toarray(), y=df['type'], cv=kfold, return_estimator=True)

print(f'The mean test score is: {results["test_score"].max()}')

The mean test score is: 0.5967213114754099


In [43]:
best_tree = results['estimator'][results['test_score'].argmax()]

In [44]:
joblib.dump(best_tree, open('./trained-models/tree-news-type-prediction.sav', 'wb'))

## KNN Model

In [48]:
best_knn_model = None
best_knn_score, best_knn_k = 0, 0

for k in range(2, 10):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    
    results = cross_validate(
        knn_model,
        X=train_vectors,
        y=df['type'], 
        cv=kfold, 
        return_estimator=True
    )
    
    max_score = round(results['test_score'].max(), 4)
    
    print(f'KNN model with {k=} has score: {max_score}')
    
    if max_score > best_knn_score:
        best_knn_score = max_score
        best_knn_k = k
        best_knn_model = results['estimator'][results['test_score'].argmax()]
        
print('-' * 30)
print(f'The best KNN model was trained with k={best_knn_k} with score: {best_knn_score}')

KNN model with k=2 has score: 0.7508
KNN model with k=3 has score: 0.7368
KNN model with k=4 has score: 0.777
KNN model with k=5 has score: 0.7672
KNN model with k=6 has score: 0.7508
KNN model with k=7 has score: 0.7632
KNN model with k=8 has score: 0.7541
KNN model with k=9 has score: 0.7664
------------------------------
The best KNN model was trained with k=4 with score: 0.777


In [49]:
joblib.dump(best_knn_model, open('./trained-models/knn-news-type-prediction.sav', 'wb'))