In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the pre-cleaned, stemmed training articles; the `id` column becomes the row index.
news_dataset = pd.read_csv(
    'cleaned_stemmed_news_train_data.csv',
    index_col='id',
)

In [3]:
# Preview the first rows to check the loaded columns.
news_dataset.head()

Unnamed: 0_level_0,title,author,text,label,content
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,darrel lucu hous dem aid even see comey letter...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,daniel j flynn flynn hillari clinton big woman...
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,consortiumnew com truth might get fire truth m...
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,jessica purkiss civilian kill singl us airstri...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,howard portnoy iranian woman jail fiction unpu...


In [4]:
# Split the frame into targets (`label`) and features (`content`).
# The text column is cast to str so every document is a plain string.
Y = news_dataset['label'].values
X = news_dataset['content'].values.astype(str)

In [5]:
# Sanity-check the text feature array.
print(X)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet hous dem aid even see comey letter jason chaffetz tweet darrel lucu octob subscrib jason chaffetz stump american fork utah imag courtesi michael jolley avail creativ common licens apolog keith olbermann doubt worst person world week fbi director jame comey accord hous democrat aid look like also know second worst person well turn comey sent infam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe hear comey found via tweet one republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi dir inform fbi learn exist email appear pertin investig case reopen jason chaffetz jasoninthehous octob cours know case comey actual say review email light unrel 

In [6]:
# Sanity-check the label array.
print(Y)

[1 0 1 ... 0 1 1]


In [7]:
# Number of labels (one per article).
Y.shape

(20800,)

In [8]:
# Convert the text documents into TF-IDF feature vectors.
# fit_transform learns the vocabulary/IDF weights and vectorizes in a single pass,
# instead of tokenizing the whole corpus twice with fit() followed by transform().
# NOTE(review): the vectorizer is fitted on the full dataset before the train/test
# split further down, so test documents influence the vocabulary and IDF weights;
# consider fitting on the training split only.
# NOTE: X is rebound from raw text to a sparse matrix, so this cell must not be
# re-run without first re-running the cell that builds X from the dataframe.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [9]:
# Inspect the TF-IDF matrix (prints the sparse representation).
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5144980 stored elements and shape (20800, 111501)>
  Coords	Values
  (0, 109752)	0.049158312425168854
  (0, 109697)	0.0190646711515277
  (0, 108742)	0.04416544119908134
  (0, 108738)	0.09477494042884232
  (0, 108695)	0.03758488097939004
  (0, 108658)	0.01130614774071694
  (0, 108007)	0.017092546683505856
  (0, 107190)	0.017105936674103112
  (0, 107099)	0.012543234221230963
  (0, 107013)	0.029126417104928328
  (0, 106934)	0.012863319680563097
  (0, 106734)	0.011771716334271506
  (0, 105884)	0.025727197929110487
  (0, 105848)	0.031296701378124764
  (0, 104837)	0.02153649554212262
  (0, 103422)	0.06544555398259812
  (0, 102736)	0.03314918847150756
  (0, 102485)	0.01639612818098454
  (0, 101717)	0.038071924979380216
  (0, 101077)	0.011082403436475742
  (0, 101067)	0.0432044670628921
  (0, 101014)	0.13602128375819167
  (0, 100866)	0.0713092337063475
  (0, 99577)	0.03944988916619374
  (0, 99009)	0.027120358929731154
  :	:
  (20799

Splitting the dataset into training and test data

In [10]:
# Hold out 20% of the samples for testing. Stratifying on Y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Training the Model: Logistic Regression

In [11]:
# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [12]:
# Fit the classifier on the training split.
model.fit(X=X_train, y=Y_train)

Evaluation

accuracy score

In [13]:
# Accuracy on the training data.
# accuracy_score's signature is (y_true, y_pred): pass the ground-truth labels first.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [14]:
# Report the fraction of training samples classified correctly.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9798677884615384


In [15]:
# Accuracy on the held-out test data.
# accuracy_score's signature is (y_true, y_pred): pass the ground-truth labels first.
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [16]:
# Report the fraction of held-out test samples classified correctly.
print('Accuracy score of the test data : ', testing_data_accuracy)

Accuracy score of the test data :  0.9543269230769231


Making a Predictive System

In [17]:
# Classify a single held-out article (the first row of the test split).
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)

# Label 0 is reported as real; any other label is reported as fake.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

[1]
The news is Fake


In [18]:
# True label of the same test sample, for comparison with the prediction above.
print(Y_test[0])

1


Predicting on the Data from test.csv and Checking the Predictions Against submit.csv

In [19]:
# Load only the `id` and `content` columns of the cleaned test set, indexed by `id`.
X_test_cases = pd.read_csv(
    'cleaned_stemmed_news_test_data.csv',
    index_col='id',
    usecols=['id', 'content'],
)

In [20]:
# Preview the first rows of the loaded test cases.
X_test_cases.head()

Unnamed: 0_level_0,content
id,Unnamed: 1_level_1
20800,specter trump loosen tongu purs string silicon...
20801,russian warship readi strike terrorist near al...
20802,nodapl nativ american leader vow stay winter f...
20803,tim tebow attempt anoth comeback time basebal ...
20804,keiser report meme war e truth broadcast netwo...


In [21]:
# Pull the test-case text out as an array of plain strings.
test_cases = X_test_cases['content'].values.astype(str)

In [22]:
# Vectorize the test-case text with the already-fitted TF-IDF vectorizer.
# NOTE: test_cases is rebound from raw text to a feature matrix, so this cell
# must not be re-run without first re-running the cell that extracts the text.
test_cases = vectorizer.transform(test_cases)

In [23]:
# Predict labels for the vectorized test cases (accuracy is computed in a later cell).
test_cases_prediction = model.predict(test_cases)

In [24]:
# Load the reference labels from submit.csv, indexed by `id`, for the accuracy check.
Y_test_results = pd.read_csv(
    'submit.csv',
    index_col='id',
)

In [25]:
# Extract the reference labels, ordered to match the rows of X_test_cases.
# Selecting by id (instead of relying on both files sharing the same row order)
# keeps each label aligned with its prediction; a missing id raises KeyError.
test_results = Y_test_results.loc[X_test_cases.index, 'label'].values

In [26]:
# Accuracy of the predictions against the labels in submit.csv.
# accuracy_score's signature is (y_true, y_pred): reference labels go first.
# NOTE(review): this number is only meaningful if submit.csv holds ground-truth
# labels (not, e.g., a sample-submission placeholder) — confirm its provenance.
model_accuracy = accuracy_score(test_results, test_cases_prediction)

In [27]:
# Report the accuracy measured against the labels from submit.csv.
print('Our Regression Model Accuracy Score is: ', model_accuracy)

Our Regression Model Accuracy Score is:  0.6336538461538461


In [28]:
# Print every prediction without NumPy's "..." truncation.
# The context manager limits the unlimited threshold to this print, instead of
# changing the global print options for the rest of the session.
with np.printoptions(threshold=np.inf):
    print(test_cases_prediction)

[0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 0 1 0 0 1 0 1 0 0 1 0 0
 1 1 0 0 0 1 0 0 1 0 0 1 1 0 1 0 0 1 0 1 0 0 0 1 1 1 0 1 1 1 1 1 0 1 1 1 0
 0 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1
 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1
 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
 1 1 0 1 0 1 0 0 0 1 1 1 0 1 0 0 1 0 1 0 1 1 1 0 1 0 1 0 0 1 1 1 0 0 1 0 1
 0 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 0 0 1 1
 1 0 1 1 0 1 1 1 1 0 1 0 1 1 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1 0 1 0 1 1 1
 1 0 1 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1
 1 0 1 1 0 0 0 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 1 0 0
 0 1 0 0 1 0 1 1 0 0 0 1 1 0 1 1 0 1 0 0 1 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 1 0 1 0
 0 1 0 1 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 1 1
 0 1 0 0 0 1 1 0 0 0 0 0 

In [29]:
# Print every reference label without NumPy's "..." truncation.
# The context manager limits the unlimited threshold to this print, instead of
# changing the global print options for the rest of the session.
with np.printoptions(threshold=np.inf):
    print(test_results)

[0 1 0 1 1 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 0 0 1 1 1 0 0 1 0 1
 1 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1
 0 0 0 0 1 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 1 0 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0
 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1
 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 0
 1 0 0 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 0 0 0 0 1
 0 1 0 1 1 1 0 0 0 0 0 1 1 1 0 0 1 1 1 0 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 1 0 1 0 1 0 0
 1 1 1 0 1 0 0 1 1 0 1 1 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 1
 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1 0 0 1 0 0 1 1 0 1 0 0 0 1 0 1 0 1 1 1 0
 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1
 0 0 1 0 1 1 0 1 0 0 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 1 0 0 0 1 1 1 1 0 0 1
 0 0 1 1 1 1 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 1
 1 1 1 1 1 1 1 1 0 0 1 0 