In [1]:
import numpy as np 
import pandas as pd 
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression



In [2]:
dataframe = pd.read_csv("dataset.csv") 

dataframe.drop('Unnamed: 0',axis=1,inplace=True)

dataframe.dropna(inplace=True) 

dataframe['length'] = dataframe['text_'].apply(len) 

dataframe[dataframe['label']=='OR'][['text_','length']].sort_values(by='length',ascending=False).head().iloc[0].text_ 



"weak on current scienc after see twice i agre much posit five star review out respect read review i 'll repeat everyth i like present i found goofi over ear hairdo facial hair arrang daniel vitali describ `` wild food expert '' distract ugh ditto david wolf extrem goofi wild hairdo on hand jon gabriel describ `` author weight loss expert '' nice groom good present hi stori person transform fellow pound whew becom jock normal weight inspir christian northrup preserv rank one america 's cutest doctor a realli nice look woman present dr. mercola jason vale kri carr alejandro junger fine it disappoint jami oliv popular uk give babi cow growth fluid pas unscientif popular idea milk none present anyth zilch say work doctor t. colin campbel milk bodi bad it good see present take stand sugar they agre evil sugar refin carbohydr with respect dr. northrup `` it 's fat make fat 's sugar '' statement pas muster commun expert recogn evil sugar not mutual exclus recogn proven danger fat particularl

In [3]:
def convertmyTxt(rv): 
    np = [c for c in rv if c not in string.punctuation] 
    np = ''.join(np) 
    return [w for w in np.split() if w.lower() not in stopwords.words('english')] 


In [4]:

x_train, x_test, y_train, y_test = train_test_split(dataframe['text_'],dataframe['label'],test_size=0.25)
# Random Forest Classifier model 
RFC_model= Pipeline([
    ('bow',CountVectorizer(analyzer=convertmyTxt)),
    ('tfidf',TfidfTransformer()),
    ('classifier',RandomForestClassifier())
]) 

RFC_model.fit(x_train,y_train) 



Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function convertmyTxt at 0x7fdb7a257940>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', RandomForestClassifier())])

In [5]:
randomForestClassifier = RFC_model.predict(x_test) 
randomForestClassifier

array(['OR', 'CG', 'CG', ..., 'CG', 'OR', 'CG'], dtype=object)

In [9]:
print("Random Forest Classifier: ")
print(classification_report(y_test,randomForestClassifier,target_names=["OR","CG"]))

Random Forest Classifier: 
              precision    recall  f1-score   support

          OR       0.81      0.89      0.85      5056
          CG       0.88      0.80      0.84      5052

    accuracy                           0.84     10108
   macro avg       0.85      0.84      0.84     10108
weighted avg       0.85      0.84      0.84     10108



In [10]:
print('Random Forest Classifier - Accuracy of the model: ',str(np.round(accuracy_score(y_test,randomForestClassifier)*100,2)) + '%')


Random Forest Classifier - Accuracy of the model:  84.46%


In [11]:
# Support Vector Classifier model
SVC_model= Pipeline([
    ('bow',CountVectorizer(analyzer=convertmyTxt)),
    ('tfidf',TfidfTransformer()),
    ('classifier',SVC())
])

SVC_model.fit(x_train,y_train)

supportVectorClassifier = SVC_model.predict(x_test)
supportVectorClassifier


array(['OR', 'CG', 'CG', ..., 'CG', 'OR', 'CG'], dtype=object)

In [12]:
print("Support vector classifier : ")
print(classification_report(y_test,supportVectorClassifier,target_names=["OR","CG"]))

Support vector classifier : 
              precision    recall  f1-score   support

          OR       0.89      0.86      0.88      5056
          CG       0.87      0.90      0.88      5052

    accuracy                           0.88     10108
   macro avg       0.88      0.88      0.88     10108
weighted avg       0.88      0.88      0.88     10108



In [13]:
print('Support vector classifier - Accuracy of the model:',str(np.round(accuracy_score(y_test,supportVectorClassifier)*100,2)) + '%')


Support vector classifier - Accuracy of the model: 88.1%


In [14]:
LR_model = Pipeline([
    ('bow',CountVectorizer(analyzer=convertmyTxt)),
    ('tfidf',TfidfTransformer()),
    ('classifier',LogisticRegression(max_iter=1000))
])

LR_model.fit(x_train,y_train)


logisticRegression = LR_model.predict(x_test)
logisticRegression



Logistic regession -accuracy of the model: 86.61%


In [15]:
print("Logistic regession : ")
print(classification_report(y_test,logisticRegression,target_names=["OR","CG"]))

Logistic regession : 
              precision    recall  f1-score   support

          OR       0.87      0.86      0.87      5056
          CG       0.86      0.87      0.87      5052

    accuracy                           0.87     10108
   macro avg       0.87      0.87      0.87     10108
weighted avg       0.87      0.87      0.87     10108



In [16]:
print('Logistic regession -accuracy of the model:',str(np.round(accuracy_score(y_test,logisticRegression)*100,2)) + '%')


Logistic regession -accuracy of the model: 86.61%


In [17]:
import pickle
pickle.dump(LR_model,open('model.pkl', 'wb'))


In [18]:
# load the model from disk
loaded_model = pickle.load(open('model.pkl', 'rb'))

In [19]:

def fake_news_det(news):
    input_data = [news]
    prediction = loaded_model.predict(input_data)
    return prediction

In [20]:
fake_news_det("accur descript work great came time m ha")

array(['OR'], dtype=object)