In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

# Load your dataset
df = pd.read_csv('data//news_data.csv',  encoding='latin-1', header=None)
df.columns = ['class', 'news']
label_mapping = {'positive': 1, 'negative': -1, 'neutral': 0}
df['class'] = df['class'].map(label_mapping)

vectorizer = TfidfVectorizer() #max_features=1000

X = vectorizer.fit_transform(df['news'])
y = df['class']

print(X.shape)


(4846, 10070)


In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                    test_size = 0.40, 
                                    random_state=1)
X_test.shape

(1939, 10070)

In [27]:
#--------------------------------------------------
## ----------- K-NN Classifier ------------------## 
#--------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

steps = [('scaler', StandardScaler(with_mean=False)),         
         ('knn', KNeighborsClassifier(n_neighbors = 4))]

knn_pipeline = Pipeline(steps)
knn_pipeline.fit(X_train, y_train)

ypred_test = knn_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

[[ 79   2 166]
 [ 67  79 984]
 [ 53   6 503]]
              precision    recall  f1-score   support

          -1       0.40      0.32      0.35       247
           0       0.91      0.07      0.13      1130
           1       0.30      0.90      0.45       562

    accuracy                           0.34      1939
   macro avg       0.54      0.43      0.31      1939
weighted avg       0.67      0.34      0.25      1939



In [20]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train.toarray(), y_train)

ypred_test = gnb.predict(X_test.toarray())
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

[[100  89  58]
 [ 77 787 266]
 [ 67 260 235]]
              precision    recall  f1-score   support

          -1       0.41      0.40      0.41       247
           0       0.69      0.70      0.69      1130
           1       0.42      0.42      0.42       562

    accuracy                           0.58      1939
   macro avg       0.51      0.51      0.51      1939
weighted avg       0.58      0.58      0.58      1939



In [21]:
#--------------------------------------------------
## ------------Logistic Regresion----------------##
#--------------------------------------------------

from sklearn.linear_model import LogisticRegression

steps = [('scaler', StandardScaler()),
         ('logReg', LogisticRegression(penalty = "l2", C = 1.0))]

LR_pipeline = Pipeline(steps)
LR_pipeline.fit(X_train.toarray(), y_train)

ypred_test = LR_pipeline.predict(X_test.toarray())
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

#ypred_testP = LR_pipeline.predict_proba(X_test.toarray())
#auc = roc_auc_score(y_test, ypred_testP[:,1])
#print(auc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[126  83  38]
 [ 25 983 122]
 [ 19 244 299]]
              precision    recall  f1-score   support

          -1       0.74      0.51      0.60       247
           0       0.75      0.87      0.81      1130
           1       0.65      0.53      0.59       562

    accuracy                           0.73      1939
   macro avg       0.71      0.64      0.67      1939
weighted avg       0.72      0.73      0.72      1939



In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#--------------------------------------------------
## ------------ SVM Classifier ------------------## 
#--------------------------------------------------

from sklearn.svm import SVC

## Linear Kernel  ---------------
steps = [('scaler', StandardScaler()),         
         ('svc', SVC(kernel = 'linear',
                     class_weight='balanced'))]

svcL_pipeline = Pipeline(steps)
svcL_pipeline.fit(X_train.toarray(), y_train)

## Polynomial Kernel -----------------------
steps = [('scaler', StandardScaler()),         
         ('svc', SVC(kernel = 'poly', degree = 3, 
                     class_weight='balanced'))]

svcPoly_pipeline = Pipeline(steps)
svcPoly_pipeline.fit(X_train.toarray(), y_train)

## RBF Kernel -----------------------
steps = [('scaler', StandardScaler()),         
         ('svc', SVC(kernel = 'rbf', gamma = 'scale',
                     class_weight='balanced'))]

svcRBF_pipeline = Pipeline(steps)
svcRBF_pipeline.fit(X_train.toarray(), y_train)


#--------------------------------------------------
## Model Evaluation ##
#--------------------------------------------------
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


ypred_test = svcL_pipeline.predict(X_test.toarray())
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print("Linear")
print(mat_clf)
print(report_clf)

#ypred_test = svcPoly_pipeline.predict(X_test.toarray())
#mat_clf = confusion_matrix(y_test, ypred_test)
#report_clf = classification_report(y_test, ypred_test)

#print("svcPoly_pipeline")
#print(mat_clf)
#print(report_clf)



Linear
[[122  75  50]
 [ 43 879 208]
 [ 41 241 280]]
              precision    recall  f1-score   support

          -1       0.59      0.49      0.54       247
           0       0.74      0.78      0.76      1130
           1       0.52      0.50      0.51       562

    accuracy                           0.66      1939
   macro avg       0.62      0.59      0.60      1939
weighted avg       0.65      0.66      0.66      1939

svcPoly_pipeline
[[   5  241    1]
 [   2 1121    7]
 [   4  551    7]]
              precision    recall  f1-score   support

          -1       0.45      0.02      0.04       247
           0       0.59      0.99      0.74      1130
           1       0.47      0.01      0.02       562

    accuracy                           0.58      1939
   macro avg       0.50      0.34      0.27      1939
weighted avg       0.53      0.58      0.44      1939

svcRBF_pipeline
[[ 119  104   24]
 [  22 1046   62]
 [  34  309  219]]
              precision    recall  f1-scor

In [26]:
#ypred_test = svcRBF_pipeline.predict(X_test.toarray())
#mat_clf = confusion_matrix(y_test, ypred_test)
#report_clf = classification_report(y_test, ypred_test)

#print("svcRBF_pipeline")
#print(mat_clf)
#print(report_clf)