#**Text Classification**

## Import Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix 
from sklearn.metrics import plot_confusion_matrix
import seaborn as sns

## Read Dataset

In [2]:
df = pd.read_csv('dataset (1).csv')
df.fillna(' ')
df.shape
df.head()

Unnamed: 0,No.,Title,Text,Label,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,1,The Brazilian doctor offering bogus Covid reme...,A Brazilian state representative and doctor is...,Fake,,,,
1,2,France puzzled by mystery anti-Pfizer campaign...,Several French social media influencers say th...,Fake,,,,
2,3,Covid-19: YouTube launches vaccination ad camp...,YouTube has launched a multi-million-pound adv...,Fake,,,,
3,4,Israel-Palestinian conflict: False and mislead...,As the Israel-Palestinian conflict has escalat...,Fake,,,,
4,5,The misinformation bubble threatening Brazil's...,False information from the mouths of politicia...,Fake,,,,


In [3]:
df["Title"]=df["Title"].str.lower()
df["Text"]=df["Text"].str.lower()
df["Label"]=df["Label"].str.lower()

In [4]:
# remove punctuation
df['Text'] = df['Text'].apply(lambda x: re.sub('[^\w\s]', ' ', x))
df['Title'] = df['Title'].apply(lambda x: re.sub('[^\w\s]', ' ', x))

TypeError: ignored

In [None]:
# remove one and two character words
df['Text'] = df['Text'].apply(lambda x: re.sub(r'\b\w{1,3}\b', '', x))
df['Title'] = df['Title'].apply(lambda x: re.sub(r'\b\w{1,3}\b', '', x))

In [None]:
# remove numerical values
df['Text'] = df['Text'].apply(lambda x: re.sub(r'[0-9]+', '', x))
df['Title'] = df['Title'].apply(lambda x: re.sub(r'[0-9]+', '', x))

In [None]:
# \s+ means all empty space (\n, \r, \t)
df['Text'] = df['Text'].apply(lambda x: re.sub('\s+', ' ', x))
df['Title'] = df['Title'].apply(lambda x: re.sub('\s+', ' ', x))

In [None]:
labels = df.Label
labels.head()

## Split Dataset

In [None]:
x_train, x_test, y_train, y_test = train_test_split(df['Text'], labels, test_size=0.2, random_state=69)

## Initialize TF-IDF Vectorizer

In [None]:
tfidf_vectorizer=TfidfVectorizer(stop_words='english')

# Fit, and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

## Initialize Passive Aggresive Classifier

In [None]:
# Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=100)
pac.fit(tfidf_train,y_train)

# Predict on the test set
y_pred = pac.predict(tfidf_test)

## Evaluation/Accuracy


In [None]:
score_pac = accuracy_score(y_test,y_pred)
print(f'Passive Aggresive Classifier Accuracy: {round(score_pac*100,2)}%')

In [None]:
matrix = confusion_matrix(y_test,y_pred)
print (matrix)

In [None]:
group_names = ['True Negative','False Positive','False Negative','True Positive']
group_counts = ['{0:0.0f}'.format(value) for value in
                matrix.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in
                     matrix.flatten()/np.sum(matrix)]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(matrix, annot=labels, fmt='', cmap='Blues')

In [None]:
# fit the training dataset on the NB classifier
# from sklearn.naive_bayes import MultinomialNB
Naive = MultinomialNB()
Naive.fit(tfidf_train,y_train)
# predict the labels on validation dataset
predictions_NB = Naive.predict(tfidf_test)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(y_test, predictions_NB)*100)

In [None]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(tfidf_train,y_train)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(tfidf_test)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(y_test,predictions_SVM)*100)