In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Load the prepared news dataset from the CSV next to the notebook and
# preview its first three rows as the cell output.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was saved into the CSV — confirm whether it is needed.
data = pd.read_csv(filepath_or_buffer='data2.csv')
data.head(n=3)

Unnamed: 0,title,text,subject,date,type,text_processed,text_length
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,2017-12-31,1,"['washington', 'reuter', 'head', 'conserv', 'r...",420
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,2017-12-29,1,"['washington', 'reuter', 'transgend', 'peopl',...",374
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,2017-12-31,1,"['washington', 'reuter', 'special', 'counsel',...",267


https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html   

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

In [3]:
def get_prediction(vectorizer, classifier, X_train, X_test, y_train, y_test):
    """Fit a vectorizer + classifier pipeline and report its test-set metrics.

    The vectorizer and classifier are chained in a scikit-learn ``Pipeline``
    (steps ``'vector'`` and ``'model'``), fitted on the training split and
    evaluated on the test split. The accuracy (as a percentage), the
    confusion matrix and the classification report are printed.

    Parameters
    ----------
    vectorizer : transformer used as the pipeline's ``'vector'`` step.
    classifier : estimator used as the pipeline's ``'model'`` step.
    X_train, X_test : inputs passed to ``fit`` and ``predict`` respectively.
    y_train, y_test : labels used for fitting and for scoring respectively.

    Returns
    -------
    float
        Test accuracy as a percentage rounded to two decimals (the same
        number that is printed), so callers can compare runs in code instead
        of reading the value off the printed output.

    Notes
    -----
    The pipeline fits the ``vectorizer`` and ``classifier`` objects that are
    passed in; it does not work on copies.
    """
    pipe = Pipeline([('vector', vectorizer), ('model', classifier)])
    # Pipeline.fit returns the pipeline itself, so no separate alias is needed.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    print("Accuracy: {}".format(accuracy))

    matrix = confusion_matrix(y_test, y_pred)

    print("Confusion Matrix: \n", matrix)
    print("Classification Report: \n", classification_report(y_test, y_pred))

    return accuracy

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html   

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [5]:
# One seed shared by the split and the stochastic models so that a
# Restart & Run All reproduces the same numbers.
RANDOM_STATE = 0

# Hold out 40% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data['text_processed'], data['type'], test_size=0.4, random_state=RANDOM_STATE
)

# Candidate classifiers. The tree-based models are seeded explicitly because
# their fitting involves randomness; without a seed their scores can change
# from run to run, which makes the comparison below unstable.
algorithms = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(max_depth=4, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=4, random_state=RANDOM_STATE),
]

# Banner label paired with the vectorizer class; a fresh vectorizer instance
# is created for every run so no fitted vocabulary is shared between runs.
vectorizers = [("Count", CountVectorizer), ("TFIDF", TfidfVectorizer)]

for classifier in algorithms:
    print("\n\n", classifier)
    for label, make_vectorizer in vectorizers:
        print("***********Using {} Vectorizer****************".format(label))
        get_prediction(make_vectorizer(), classifier, X_train, X_test, y_train, y_test)



 LogisticRegression()
***********Usng Count Vectorizer****************
Accuarcy: 99.45
Confusion Matrix: 
 [[9170   51]
 [  46 8353]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      9221
           1       0.99      0.99      0.99      8399

    accuracy                           0.99     17620
   macro avg       0.99      0.99      0.99     17620
weighted avg       0.99      0.99      0.99     17620

***********Usng TFIDF Vectorizer****************
Accuarcy: 98.37
Confusion Matrix: 
 [[9050  171]
 [ 116 8283]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      9221
           1       0.98      0.99      0.98      8399

    accuracy                           0.98     17620
   macro avg       0.98      0.98      0.98     17620
weighted avg       0.98      0.98      0.98     17620



 KNeighborsClassifier()
***********Usng Count 

Here we see that the **Decision Tree Classifier** algorithm gives the best results, with an accuracy score of **99.53%**, using the TF-IDF Vectorizer.