In [None]:
# Author : Sagar Bapodara (this is my first submission to a Kaggle competition)

# Importing Dependencies

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
print('Dependencies Loaded Successfully')

Dependencies Loaded Successfully


In [2]:
# Make sure the NLTK stop-word corpus is available locally, then show the English list.
import nltk

nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
print(english_stopword_list)  # English stopwords

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\2543b\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Importing Data 

In [None]:
# Load the training CSV into a DataFrame.
# NOTE(review): the relative path looks like the Kaggle dataset mount point — confirm before running elsewhere.
news_data = pd.read_csv('../input/fake-news/train.csv')
# Preview the first ten rows.
news_data.head(10)

#### 1 : Fake News, 0 : Real News

# Basic Data Analysis

In [None]:
# Dimensions of the loaded frame as (rows, columns).
news_data.shape

In [None]:
# Class balance: number of rows carrying each label value.
label_counts = news_data['label'].value_counts()
print(label_counts)

In [None]:
# Count the missing entries in every column of the dataset.
missing_mask = news_data.isnull()
missing_mask.sum()

In [None]:
# Replace every null value with an empty string so the text columns can be concatenated later.
news_data = news_data.fillna(value='')

In [None]:
# Re-count the missing entries per column after the fill.
missing_mask_after_fill = news_data.isnull()
missing_mask_after_fill.sum()

# Classification Plan :
#### To use '*Title*' and '*Author*' data columns to make predictions

In [None]:
# Merge the author name and the news title into one space-separated text column.
author_column = news_data['author']
title_column = news_data['title']
news_data['content'] = author_column + ' ' + title_column

In [None]:
# Inspect the combined author + title text.
print(news_data['content'])

In [None]:
# Split the frame into the target column and everything else.
# (`columns=` already names the axis, so no separate `axis` argument is needed.)
Y = news_data['label']
X = news_data.drop(columns='label')

In [None]:
# Inspect the feature frame (every column except 'label').
print(X)

In [None]:
# Inspect the target labels.
print(Y)

# Stemming Process 
#### In short : Reducing each word to its root form by stripping suffixes (the Porter stemmer does not remove prefixes)

In [None]:
# Single PorterStemmer instance, reused by stemming() for every token.
port_stem = PorterStemmer()

#### Defining the stemmer function

In [None]:
# Build the stop-word lookup once. stopwords.words('english') returns a list, so
# calling it inside the token filter rebuilt that list and scanned it linearly
# for every single word; a frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a text string before vectorisation.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped, and each remaining token is stemmed with the module-level
    ``port_stem`` stemmer.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (an empty string when no
        token survives the filtering).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stemmed_tokens = [
        port_stem.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(stemmed_tokens)

In [None]:
# Clean and stem every row of the combined text column, overwriting it in place.
# NOTE(review): because 'content' is overwritten, re-running this cell feeds already-stemmed
# text through stemming() again — restart and run all rather than re-executing it.
news_data['content'] = news_data['content'].apply(stemming)

In [None]:
# Inspect the text after stop-word removal and stemming.
print(news_data['content'])

In [None]:
# Separating the data and the label: the processed text is the only model input.
Y = news_data['label'].values
X = news_data['content'].values

# Vectorizing the content data

In [None]:
# Convert the textual data into numerical TF-IDF features.
# fit_transform learns the vocabulary and IDF weights and builds the document-term
# matrix in a single pass; it is equivalent to fit() followed by transform() on the same data.
# NOTE(review): the vectorizer is fitted on the full dataset before the train/test split,
# so test-set statistics influence the features — consider fitting on the training rows only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

# Splitting the dataset into Training(80%) and Testing(20%)

In [None]:
# Hold out 20% of the rows for testing, stratified on the labels, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

# Training the Logistic Regression Model

In [None]:
# Logistic-regression classifier created with the library's default settings.
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
model.fit(X_train, Y_train)

# Model Evaluation

#### A. On Training Data

In [None]:
# Predicted labels for the same rows the model was trained on.
X_train_prediction = model.predict(X_train)

In [None]:
# accuracy_score's signature is (y_true, y_pred): pass the ground-truth labels first.
# Accuracy is symmetric, so the value is unchanged, but the conventional order avoids
# silently wrong results if this call is later swapped for an asymmetric metric.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
 # Report the accuracy measured on the training split.
 # NOTE(review): this line carries a stray leading space; IPython appears to tolerate it, but a plain .py export would not — confirm and remove.
 print('Accuracy score of training data :', training_data_accuracy)

#### B. On Testing Data

In [None]:
# Predict on the held-out rows and score them against the true labels.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first;
# accuracy is symmetric, so the reported value is unchanged.
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
 # Report the accuracy measured on the held-out testing split.
 # NOTE(review): this line carries a stray leading space; IPython appears to tolerate it, but a plain .py export would not — confirm and remove.
 print('Accuracy score of testing data :', testing_data_accuracy)

## If you found this useful, kindly upvote and comment your views :) 