# 1. Import necessary libraries and load the dataset

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Seed NumPy's global RNG once, up front, so stochastic steps that fall back
# to the global generator repeat on a fresh Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


In [None]:
# Read the news corpus from the Excel workbook that sits next to the notebook.
news_path = "news.xlsx"
data = pd.read_excel(news_path)

# 2. Data preprocessing

#### Here we extract features from the news articles using bag-of-words and TF-IDF representations.

In [None]:
# Preview the first five rows to check the columns before selecting features.
data.head(n=5)

Unnamed: 0,Category,Title,News_Article
0,Maraqlı,Naviforce Sport Saat 2016 ilə zövqlərin ahəngi,Naviforce Sport Saat 2016 Yapon Mexanizmi Yapo...
1,Maraqlı,"Sinir ,oynaq , sinir bel ağrılarına 3 gündə son !","ŞOK ! ŞOK ! ŞOK ! Xanımlar və bəylər , bel və ..."
2,Maraqlı,Dəyərindən qat-qat aşağı qiymətə Mənzil,Dəyərindən qat-qat Aşağı Qiymətə. Həzi Aslanov...
3,İdman,2024 və 2028-ci il olimpiadalarının keçiriləcə...,2028-ci il Yay Olimpiya və Paralimpiya Oyunla...
4,Dünya,Türkiyədə zəlzələ,Türkiyədə daha bir zəlzələ meydana gəlib. L...


In [None]:
# Inputs are the article bodies; the label to predict is the news category.
x, y = data["News_Article"], data["Category"]

In [None]:
# Hold out 20% of the articles for evaluation.
# random_state is pinned so this cell is idempotent: re-running it on its own
# yields the same split instead of drawing from NumPy's global RNG, whose state
# depends on which cells were executed before.
# NOTE(review): the recorded reports show imbalanced class supports
# (e.g. 304 vs. 3172 test rows) — consider adding stratify=y.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Bag-of-words counts with the vocabulary capped at 3000 terms.
# The vocabulary is learned from the training articles only and then applied
# unchanged to the test articles.
cv = CountVectorizer(max_features=3000)
train_cv = cv.fit_transform(raw_documents=x_train)
test_cv = cv.transform(raw_documents=x_test)

In [None]:
# TF-IDF weights with the vocabulary capped at 3000 terms.
# Vocabulary and IDF statistics are fitted on the training articles only; the
# test articles are transformed with the fitted vectorizer.
tfidf = TfidfVectorizer(max_features=3000)
train_tf = tfidf.fit_transform(raw_documents=x_train)
test_tf = tfidf.transform(raw_documents=x_test)

# 3. Modeling

#### For each of the following models — a) logistic regression, b) naïve Bayes, c) stochastic gradient descent classifier, and d) random forest classifier:
- Train it on the training data with TF-IDF representations
- Test the model using the test set
- Achieve an accuracy rate of ~80%
- Evaluate precision, recall, and F1 scores
- Construct a confusion matrix


In [None]:
# Multinomial naive Bayes on the bag-of-words counts.
# The sparse count matrix is passed as-is: MultinomialNB accepts sparse input,
# so densifying it with .toarray() only costs memory without changing the fit.
mnb_cv = MultinomialNB()
mnb_cv.fit(train_cv, y_train)

In [None]:
# Predict test categories from the bag-of-words counts. The sparse matrix is
# used directly; predict() accepts it, so no dense copy is needed.
pred_mnb_cv = mnb_cv.predict(test_cv)

In [None]:
# Per-category precision / recall / F1 for naive Bayes on bag-of-words counts.
report_mnb_cv = classification_report(y_true=y_test, y_pred=pred_mnb_cv)
print(report_mnb_cv)

              precision    recall  f1-score   support

       Dünya       0.73      0.80      0.76      2745
     Maraqlı       0.80      0.64      0.71      3172
  Mədəniyyət       0.58      0.73      0.65       304
     Siyasət       0.77      0.78      0.78      1269
       İdman       0.89      0.93      0.91      1264
İqtisadiyyat       0.74      0.88      0.80      1246

    accuracy                           0.77     10000
   macro avg       0.75      0.79      0.77     10000
weighted avg       0.78      0.77      0.77     10000



In [None]:
# Confusion matrix for naive Bayes on bag-of-words counts
# (rows: true category, columns: predicted category).
cm_mnb_cv = confusion_matrix(y_true=y_test, y_pred=pred_mnb_cv)
print(cm_mnb_cv)

[[2185  333   16   97   39   75]
 [ 661 2036   87  105   87  196]
 [   9   57  222   12    3    1]
 [  92   35   34  994   11  103]
 [  22   35   12    8 1177   10]
 [  28   37   11   77    0 1093]]


In [None]:
# Multinomial naive Bayes on the TF-IDF features, with smoothing alpha=1.0.
mnb_tf = MultinomialNB(alpha=1.0)
mnb_tf.fit(X=train_tf, y=y_train)

In [None]:
# Predict test categories with the TF-IDF naive Bayes model.
pred_mnb_tf = mnb_tf.predict(X=test_tf)

In [None]:
# Per-category precision / recall / F1 for naive Bayes on TF-IDF features.
report_mnb_tf = classification_report(y_true=y_test, y_pred=pred_mnb_tf)
print(report_mnb_tf)

              precision    recall  f1-score   support

       Dünya       0.75      0.76      0.76      2745
     Maraqlı       0.76      0.71      0.73      3172
  Mədəniyyət       0.79      0.51      0.62       304
     Siyasət       0.75      0.79      0.77      1269
       İdman       0.90      0.92      0.91      1264
İqtisadiyyat       0.76      0.88      0.81      1246

    accuracy                           0.78     10000
   macro avg       0.79      0.76      0.77     10000
weighted avg       0.78      0.78      0.77     10000



In [None]:
# Confusion matrix for naive Bayes on TF-IDF features
# (rows: true category, columns: predicted category).
cm_mnb_tf = confusion_matrix(y_true=y_test, y_pred=pred_mnb_tf)
print(cm_mnb_tf)

[[2093  438    5  111   34   64]
 [ 570 2239   14  100   79  170]
 [   7  107  154   28    3    5]
 [  82   64   13 1005   10   95]
 [  17   53    5    9 1169   11]
 [  22   46    5   80    0 1093]]


In [None]:
# Logistic regression on the TF-IDF features; the iteration cap is raised to
# 500 to give the solver more room to converge.
log = LogisticRegression(max_iter=500)
log.fit(X=train_tf, y=y_train)

In [None]:
# Predict test categories with the logistic regression model.
pred_log = log.predict(X=test_tf)

In [None]:
# Per-category precision / recall / F1 for logistic regression.
report_log = classification_report(y_true=y_test, y_pred=pred_log)
print(report_log)

              precision    recall  f1-score   support

       Dünya       0.81      0.82      0.82      2745
     Maraqlı       0.80      0.79      0.80      3172
  Mədəniyyət       0.81      0.60      0.69       304
     Siyasət       0.83      0.82      0.83      1269
       İdman       0.91      0.93      0.92      1264
İqtisadiyyat       0.83      0.88      0.85      1246

    accuracy                           0.83     10000
   macro avg       0.83      0.81      0.82     10000
weighted avg       0.83      0.83      0.83     10000



In [None]:
# Confusion matrix for logistic regression
# (rows: true category, columns: predicted category).
cm_log = confusion_matrix(y_true=y_test, y_pred=pred_log)
print(cm_log)

[[2258  353    6   52   34   42]
 [ 389 2507   17   78   67  114]
 [  17   79  182   20    0    6]
 [  83   66   12 1037   11   60]
 [  23   48    4    7 1177    5]
 [  26   68    5   49    0 1098]]


In [None]:
# Linear classifier trained with stochastic gradient descent on TF-IDF features.
# SGD shuffles the training data, so random_state is pinned: without it the fit
# depends on how much of NumPy's global RNG state earlier cells have consumed,
# and re-running this cell alone would give a different model.
sgd = SGDClassifier(random_state=42)
sgd.fit(train_tf, y_train)

In [None]:
# Predict test categories with the SGD classifier.
pred_sgd = sgd.predict(X=test_tf)

In [None]:
# Per-category precision / recall / F1 for the SGD classifier.
report_sgd = classification_report(y_true=y_test, y_pred=pred_sgd)
print(report_sgd)

              precision    recall  f1-score   support

       Dünya       0.81      0.81      0.81      2745
     Maraqlı       0.81      0.78      0.79      3172
  Mədəniyyət       0.81      0.56      0.66       304
     Siyasət       0.83      0.84      0.83      1269
       İdman       0.90      0.95      0.93      1264
İqtisadiyyat       0.82      0.90      0.86      1246

    accuracy                           0.83     10000
   macro avg       0.83      0.81      0.81     10000
weighted avg       0.83      0.83      0.82     10000



In [None]:
# Confusion matrix for the SGD classifier
# (rows: true category, columns: predicted category).
cm_sgd = confusion_matrix(y_true=y_test, y_pred=pred_sgd)
print(cm_sgd)

[[2236  364    7   57   37   44]
 [ 384 2467   15   86   82  138]
 [  17   91  170   21    2    3]
 [  83   44   11 1063   12   56]
 [  12   31    2    7 1205    7]
 [  24   39    5   54    1 1123]]


In [None]:
# Random forest on the TF-IDF features, using all available CPU cores.
# random_state is pinned because bootstrap sampling and feature selection are
# random: without it the forest depends on the global NumPy RNG state left by
# earlier cells, and re-running this cell alone would give a different model.
rnd = RandomForestClassifier(n_jobs=-1, random_state=42)
rnd.fit(train_tf, y_train)

In [None]:
# Predict test categories with the random forest.
pred_rnd = rnd.predict(X=test_tf)

In [None]:
# Per-category precision / recall / F1 for the random forest.
report_rnd = classification_report(y_true=y_test, y_pred=pred_rnd)
print(report_rnd)

              precision    recall  f1-score   support

       Dünya       0.79      0.82      0.81      2745
     Maraqlı       0.78      0.78      0.78      3172
  Mədəniyyət       0.87      0.45      0.60       304
     Siyasət       0.78      0.82      0.80      1269
       İdman       0.91      0.91      0.91      1264
İqtisadiyyat       0.83      0.81      0.82      1246

    accuracy                           0.81     10000
   macro avg       0.82      0.76      0.78     10000
weighted avg       0.81      0.81      0.80     10000



In [None]:
# Confusion matrix for the random forest
# (rows: true category, columns: predicted category).
cm_rnd = confusion_matrix(y_true=y_test, y_pred=pred_rnd)
print(cm_rnd)

[[2260  370    1   65   29   20]
 [ 397 2463   11   99   82  120]
 [  22   96  138   37    1   10]
 [  86   81    7 1035    7   53]
 [  32   61    1   10 1151    9]
 [  65   97    1   76    1 1006]]
