# Word Vectors

In [33]:
import os
import gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn import naive_bayes, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, accuracy_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [34]:
# Load the news dataset; later cells read its 'Total' and 'Tag' columns.
allnews_set = pd.read_csv('./kenyan-news/all-news.csv')

### Creating word vectors

In [35]:
# Fit a Word2Vec model on the 'Total' column: 350-dimensional vectors,
# context window of 10, tokens seen fewer than 2 times dropped, 20 iterations.
# NOTE(review): gensim expects every sentence to be a list of tokens. If 'Total'
# holds raw strings (likely, since it comes from a CSV), each string is iterated
# character by character and the vocabulary becomes single characters --
# TODO confirm, and tokenize here and in generate_wordvectors if so.
# NOTE(review): training uses gensim's default worker threads, so the vectors
# may vary slightly between runs -- confirm whether exact reproducibility matters.
w2v = gensim.models.Word2Vec(
    sentences=allnews_set['Total'].tolist(),
    size=350,
    window=10,
    min_count=2,
    iter=20,
)

In [36]:
# Echo the model object so its repr appears in the cell output.
w2v

<gensim.models.word2vec.Word2Vec at 0x7fb0283f8640>

### Generating Word Vectors

In [37]:
def generate_wordvectors(sentence, model=None):
    """Return the mean embedding of the in-vocabulary tokens of ``sentence``.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to embed. NOTE(review): a raw ``str`` is iterated character by
        character, so pass pre-tokenized text if word-level vectors are
        wanted -- confirm what the 'Total' column actually holds.
    model : optional
        Object exposing gensim-style keyed vectors as ``model.wv``. Defaults
        to the notebook-level ``w2v`` model.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.wv.vector_size``: the element-wise mean of
        the vectors of the known tokens, or all zeros when none of the tokens
        is in the vocabulary.
    """
    keyed_vectors = (w2v if model is None else model).wv
    known_tokens = [token for token in sentence if token in keyed_vectors]
    if not known_tokens:
        # Nothing to average; return a zero vector so every row still gets a
        # fixed-length feature instead of an error or NaNs.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    # Index through `.wv`: indexing the model object itself is deprecated.
    return np.mean(keyed_vectors[known_tokens], axis=0)

In [38]:
# Store one vector per row of 'Total' in a new 'word_vectors' column.
allnews_set['word_vectors'] = allnews_set['Total'].apply(generate_wordvectors)

In [39]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    allnews_set['word_vectors'],
    allnews_set['Tag'],
    test_size=0.20,
    random_state=0,
)

In [40]:
# Peek at the training features (pandas truncates the printed Series).
x_train

1815    [0.0847119, 0.18145373, 0.0349021, 0.04074825,...
704     [0.102782816, 0.18864474, 0.019293843, 0.05326...
1136    [0.09616085, 0.20544122, 0.033075374, 0.033583...
828     [0.09298691, 0.21591176, 0.038729552, 0.028182...
159     [0.10379068, 0.22416198, 0.0380395, 0.03347276...
                              ...                        
835     [0.04337563, 0.16793858, 0.039883986, 0.073793...
1216    [0.09344817, 0.18964249, 0.037496675, 0.040140...
1653    [0.052056603, 0.17909586, 0.05970552, 0.015416...
559     [0.086692594, 0.23544388, 0.03457702, 0.035829...
684     [0.10867344, 0.20623219, 0.020832809, 0.040218...
Name: word_vectors, Length: 1604, dtype: object

In [41]:
# Materialize both feature Series as plain lists of vectors before fitting.
x_train, x_test = list(x_train), list(x_test)

# Training models

### Logistic Regression

In [42]:
# Logistic regression with C=1e5 (very weak regularization), scored on the
# held-out split.
clf = LogisticRegression(C=1e5).fit(x_train, y_train)
predicted = clf.predict(x_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

         0.0       0.77      0.82      0.79       200
         1.0       0.81      0.75      0.78       201

    accuracy                           0.79       401
   macro avg       0.79      0.79      0.79       401
weighted avg       0.79      0.79      0.79       401



### SVM

In [43]:
# Linear-kernel SVM with C=1e2, scored on the held-out split.
svm_mdl = svm.SVC(kernel="linear", C=1e2).fit(x_train, y_train)
predicted_svm = svm_mdl.predict(x_test)
print(classification_report(y_test, predicted_svm))

              precision    recall  f1-score   support

         0.0       0.76      0.83      0.79       200
         1.0       0.81      0.74      0.78       201

    accuracy                           0.79       401
   macro avg       0.79      0.79      0.79       401
weighted avg       0.79      0.79      0.79       401



### Naive Bayes

In [44]:
# Gaussian naive Bayes on the averaged vectors, scored on the held-out split.
naive = naive_bayes.GaussianNB().fit(x_train, y_train)
predicted_naive = naive.predict(x_test)
print(classification_report(y_test, predicted_naive))

              precision    recall  f1-score   support

         0.0       0.67      0.80      0.72       200
         1.0       0.75      0.60      0.67       201

    accuracy                           0.70       401
   macro avg       0.71      0.70      0.70       401
weighted avg       0.71      0.70      0.70       401



### Passive Aggressive

In [45]:
# Passive-aggressive linear classifier, at most 80 passes over the training data.
# random_state fixes the per-epoch shuffling so the reported scores are
# reproducible across runs (same seed as the train/test split).
passagg = PassiveAggressiveClassifier(max_iter=80, random_state=0)
passagg.fit(x_train, y_train)
predicted_pass = passagg.predict(x_test)
print(classification_report(y_test, predicted_pass))

              precision    recall  f1-score   support

         0.0       0.74      0.78      0.76       200
         1.0       0.77      0.73      0.75       201

    accuracy                           0.76       401
   macro avg       0.76      0.76      0.76       401
weighted avg       0.76      0.76      0.76       401

