In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Both CSV files are read from the current working directory.
FAKE_CSV = "fake.csv"
TRUE_CSV = "true.csv"

df_fake = pd.read_csv(FAKE_CSV)
df_true = pd.read_csv(TRUE_CSV)

In [3]:
# Tag every row of each source frame with its class label before the frames are merged.
for frame, label in ((df_fake, 'fake'), (df_true, 'true')):
    frame['target'] = label

In [4]:
# Report (rows, columns) for each source frame; both counts include the
# 'target' column that was added to the frames before this point.
print(f"The shape of fake news data: {df_fake.shape}")
print(f"The shape of true news data: {df_true.shape}")

The shape of fake news data: (23481, 5)
The shape of true news data: (21417, 5)


In [5]:
# Stack the fake rows on top of the true rows and renumber the index from 0.
frames = [df_fake, df_true]
df = pd.concat(frames, ignore_index=True)

In [6]:
# Preview the first five rows of the combined frame.
df.head()

Unnamed: 0,title,text,subject,date,target
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake


In [7]:
# Remove rows that are identical across every column, then renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)

In [8]:
# Restrict the frame to the headline text and its label.
kept_columns = ['title', 'target']
df = df[kept_columns]

In [9]:
# Preview the first five rows now that only 'title' and 'target' remain.
df.head()

Unnamed: 0,title,target
0,Donald Trump Sends Out Embarrassing New Year’...,fake
1,Drunk Bragging Trump Staffer Started Russian ...,fake
2,Sheriff David Clarke Becomes An Internet Joke...,fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,fake
4,Pope Francis Just Called Out Donald Trump Dur...,fake


In [10]:
# (rows, columns) after exact-duplicate removal and column selection.
df.shape

(44689, 2)

In [11]:
# Unpack the shape once so the message reads rows first, columns second.
n_rows, n_cols = df.shape
print("The dataset contains {} rows and {} columns".format(n_rows, n_cols))

The dataset contains 44689 rows and 2 columns


In [12]:
# Keep only the first row for each distinct title, then renumber the index from 0.
df = (
    df
    .drop_duplicates(subset='title', keep='first')
    .reset_index(drop=True)
)

In [13]:
import nltk

# text_preprocessing filters tokens with stopwords.words('english'); NLTK raises
# LookupError for that call when the 'stopwords' corpus has not been downloaded,
# so fetch it here to make the notebook runnable on a fresh environment.
nltk.download('stopwords')
# Kept as the last expression so the cell still displays this call's result.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jenil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
from nltk.stem import PorterStemmer,WordNetLemmatizer
import re
from nltk.corpus import stopwords

ps = PorterStemmer()


# Cache for the default stop-word set so the NLTK corpus is loaded only once.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE["english"]


def text_preprocessing(content, stop_words=None, stemmer=None):
    """Normalise one piece of text into a space-joined string of stemmed tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the text is
    lower-cased and split on whitespace, stop words are dropped, and each
    remaining token is stemmed. The stems are joined with single spaces.

    Parameters
    ----------
    content : str
        Raw text. ``None`` and float NaN (a missing value in a pandas column)
        are treated as empty text and yield ``""``.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used; it is fetched once and cached as a frozenset instead of
        being fetched again for every token.
    stemmer : object with a ``stem(word)`` method, optional
        When omitted, the module-level ``ps`` is used.

    Returns
    -------
    str
        The cleaned text; ``""`` when nothing survives the filtering.
    """
    # NaN is the only float that is not equal to itself.
    if content is None or (isinstance(content, float) and content != content):
        return ""
    if stop_words is None:
        stop_words = _english_stop_words()
    if stemmer is None:
        stemmer = ps

    tokens = re.sub("[^a-zA-Z]", " ", content).lower().split()
    # Stop words are matched against the lower-cased, unstemmed token.
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)
    

In [15]:
# Run every title through text_preprocessing and store the cleaned text back in place.
cleaned_titles = df['title'].apply(text_preprocessing)
df['title'] = cleaned_titles

In [16]:
from sklearn.preprocessing import LabelEncoder
# Replace the string labels in 'target' with integer codes.
# If the column is missing, only a message is printed and df is left untouched.
lb_enc = LabelEncoder()
if 'target' not in df.columns:
    print("'target' column not found in DataFrame.")
else:
    df['target'] = lb_enc.fit_transform(df['target'])

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Features are the cleaned titles; labels are the encoded targets.
X, y = df['title'], df['target']

In [27]:
# Spot-check the preprocessed title stored at row position 35350.
X.iloc[35350]

'vaccin begin bangladesh camp head cholera outbreak'

In [29]:
# Encoded label at the same row position (35350) as the title spot-check.
y.iloc[35350]

1

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=2)
X_train, X_test, y_train, y_test = splits

In [19]:
# Turn the cleaned titles into TF-IDF feature matrices.
# The vectorizer is fitted on the training split only and then reused, unchanged,
# to transform the test split, so nothing is learned from the test titles.
# NOTE(review): X_train / X_test are overwritten with the transformed matrices,
# so this cell presumably cannot be re-run by itself without re-running the split first.
vect = TfidfVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [20]:
# Report (rows, columns) of the transformed train and test matrices; both were
# produced by the same fitted vectorizer.
print(f"The shape of training data: {X_train.shape}")
print(f"The shape of testing data: {X_test.shape}")

The shape of training data: (30983, 12266)
The shape of testing data: (7746, 12266)


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Fit a logistic-regression classifier, constructed with no arguments (library
# defaults), on the transformed training titles and their encoded labels.
lr = LogisticRegression()
lr.fit(X_train,y_train)

In [22]:
# Score the fitted model on the held-out test split and report accuracy as a percentage.
y_pred = lr.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
test_accuracy_pct = acc_score * 100
print("The accuracy score on testing data is : {:.2f}%".format(test_accuracy_pct))

The accuracy score on testing data is : 93.85%


In [23]:
# Score the same model on the data it was trained on, for comparison with the test score.
y_pred_train = lr.predict(X_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
train_accuracy_pct = acc_score_train * 100
print("The accuracy score on training data is : {:.2f}%".format(train_accuracy_pct))

The accuracy score on training data is : 95.53%


In [24]:
import pickle
# Persist the fitted vectorizer and classifier to the current working directory.
# Each file is opened in a `with` block so it is flushed and closed even if
# pickling raises; the bare open(...) calls used before were never closed.
# NOTE(review): lb_enc is not saved here, so whoever loads model.pkl must get
# the integer-to-label mapping from elsewhere.
with open('vector.pkl', 'wb') as vector_file:
    pickle.dump(vect, vector_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)