In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv("train_news.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,id,headline,written_by,news,label
0,0,9653,Ethics Questions Dogged Agriculture Nominee as...,Eric Lipton and Steve Eder,"WASHINGTON — In Sonny Perdue’s telling, Geo...",0
1,1,10041,U.S. Must Dig Deep to Stop Argentina’s Lionel ...,David Waldstein,HOUSTON — Venezuela had a plan. It was a ta...,0
2,2,19113,Cotton to House: ’Do Not Walk the Plank and Vo...,Pam Key,"Sunday on ABC’s “This Week,” while discussing ...",0
3,3,6868,"Paul LePage, Besieged Maine Governor, Sends Co...",Jess Bidgood,"AUGUSTA, Me. — The beleaguered Republican g...",0
4,4,7596,A Digital 9/11 If Trump Wins,Finian Cunningham,Finian Cunningham has written extensively on...,1
5,5,3196,Whatever the Outcome on November 8th the US Wi...,,Taming the corporate media beast Whatever the ...,1
6,6,5134,Rapid Evolution Saved This Fish From Pollution...,JoAnna Klein,The State of New Jersey says you can’t eat the...,0
7,7,1504,Alabama Prison Officials Retaliate Against Pri...,Brian Sonenstein,Advocates say prison officials at the Kilby Co...,1
8,8,13559,,steventexas,People have made up their minds on president.\...,1
9,9,4203,Can We Live in a Constant State of Love?,Gillian,Leave a reply \nToni Emerson – When we fall in...,1


In [5]:
 df.shape

(20800, 6)

In [6]:
df.columns

Index(['Unnamed: 0', 'id', 'headline', 'written_by', 'news', 'label'], dtype='object')

In [7]:
df.drop(['Unnamed: 0'],inplace=True,axis=1)

In [8]:
df.head(10)

Unnamed: 0,id,headline,written_by,news,label
0,9653,Ethics Questions Dogged Agriculture Nominee as...,Eric Lipton and Steve Eder,"WASHINGTON — In Sonny Perdue’s telling, Geo...",0
1,10041,U.S. Must Dig Deep to Stop Argentina’s Lionel ...,David Waldstein,HOUSTON — Venezuela had a plan. It was a ta...,0
2,19113,Cotton to House: ’Do Not Walk the Plank and Vo...,Pam Key,"Sunday on ABC’s “This Week,” while discussing ...",0
3,6868,"Paul LePage, Besieged Maine Governor, Sends Co...",Jess Bidgood,"AUGUSTA, Me. — The beleaguered Republican g...",0
4,7596,A Digital 9/11 If Trump Wins,Finian Cunningham,Finian Cunningham has written extensively on...,1
5,3196,Whatever the Outcome on November 8th the US Wi...,,Taming the corporate media beast Whatever the ...,1
6,5134,Rapid Evolution Saved This Fish From Pollution...,JoAnna Klein,The State of New Jersey says you can’t eat the...,0
7,1504,Alabama Prison Officials Retaliate Against Pri...,Brian Sonenstein,Advocates say prison officials at the Kilby Co...,1
8,13559,,steventexas,People have made up their minds on president.\...,1
9,4203,Can We Live in a Constant State of Love?,Gillian,Leave a reply \nToni Emerson – When we fall in...,1


In [9]:
df.isnull().sum()

id               0
headline       558
written_by    1957
news            39
label            0
dtype: int64

In [11]:
#We can see that the features ‘headline’, ‘written_by’ and ‘news’ are important and all are in text form. 
#So, we can combine these features to make one final feature which we will use to train the model. 
#Let’s call the feature ‘total’.
# Firstly, fill all the null spaces with a space
df = df.fillna(' ')
df['total'] = df['headline'] + ' ' + df['written_by'] + ' ' + df['news']


In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [14]:
stop_words = stopwords.words('english')

In [18]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [20]:
lemmatizer = WordNetLemmatizer()

In [22]:
for index, row in df.iterrows():
    filter_sentence = ' '
    sentence = row['total']
    
    # Tokenization
    words = nltk.word_tokenize(sentence)
    # Stopwords removal
    words = [w for w in words if not w in stop_words]
    # Lemmatization
    for words in words:
        filter_sentence = filter_sentence  + ' ' + str(lemmatizer.lemmatize(words)).lower()
    df.loc[index, 'total'] = filter_sentence
df = df[['total', 'label']]

In [23]:
df.head(10)

Unnamed: 0,total,label
0,ethics questions dogged agriculture nominee ...,0
1,u.s. must dig deep stop argentina ’ lionel m...,0
2,cotton house : ’ do not walk plank vote bill...,0
3,"paul lepage , besieged maine governor , send...",0
4,a digital 9/11 if trump wins finian cunningh...,1
5,whatever outcome november 8th us will be col...,1
6,rapid evolution saved this fish from polluti...,0
7,alabama prison officials retaliate against p...,1
8,steventexas people made mind president . i ’...,1
9,can we live constant state love ? gillian le...,1


In [24]:
X_train = df['total']
Y_train = df['label']

In [25]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [26]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X_train)
freq_term_matrix = count_vectorizer.transform(X_train)
tfidf = TfidfTransformer(norm = "l2")
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_matrix,Y_train, random_state=0)

# Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Accuracy = logreg.score(X_test, y_test)
print(Accuracy)

0.9611538461538461


# Naive Bayes

In [34]:
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB()
NB.fit(X_train, y_train)
Accuracy = NB.score(X_test, y_test)
print(Accuracy)

0.8861538461538462


# Decision Tree Classifier

In [35]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
Accuracy = clf.score(X_test, y_test)
print(Accuracy)

0.9655769230769231


In [36]:
#As we can see, the decision tree classifier performed the best on the train set and gave an accuracy of 96.55%.