Importing the dependencies->

In [86]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords # contains the bogus word; nltl - natural language tool-kit
from nltk.stem.porter import PorterStemmer # to performing the stemming on words
from sklearn.feature_extraction.text import TfidfVectorizer # to convert the text into  feature vector
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression # Model to be used in
from sklearn.metrics import accuracy_score


In [61]:
# Download the stopwords

import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Printing the stopwords in english
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
#Data Collections and data preprocessing

In [62]:
df_train = pd.read_csv('./content/train.csv')

In [9]:
df_train.shape

(20800, 5)

In [10]:
df_train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [63]:
df_train['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [64]:
df_train.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [65]:
# Since we have large dataset; therefore null values are being replace with null string

df_train = df_train.fillna("")

In [66]:
df_train.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [87]:
# Merging the author name and news title

df_train['content'] = df_train['title']+" "+df_train['text']

In [88]:
df_train.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Why the Trut...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


In [68]:
# seperating the data and labels

x= df_train.drop(columns='label', axis=1)
y = df_train['label']

Stemming - stremming is process of reducing a word to its Root Word

> example:
> actor, actoress, acting ->> act






In [89]:
porter_stem = PorterStemmer()

def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ', content) # Convert which are not alphabets to space " "
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [porter_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)

  return stemmed_content
  

In [90]:
df_train['content'] = df_train['content'].apply(stemming)

KeyboardInterrupt: ignored

In [71]:
print(df_train['content'])

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object


In [72]:
# Seperating the data and columns
# x= df_train.drop(columns='label', axis=1)
x = df_train['content'].values
y = df_train['label'].values

In [73]:
print(x)
print(y)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']
[1 0 1 ... 0 1 1]


In [74]:
# Converting the textual data to numerical data: for feature engineering

vectorizer = TfidfVectorizer()
vectorizer.fit(x)

x = vectorizer.transform(x)

In [31]:
print(x)

  (0, 15686)	0.28485063562728646
  (0, 13473)	0.2565896679337957
  (0, 8909)	0.3635963806326075
  (0, 8630)	0.29212514087043684
  (0, 7692)	0.24785219520671603
  (0, 7005)	0.21874169089359144
  (0, 4973)	0.233316966909351
  (0, 3792)	0.2705332480845492
  (0, 3600)	0.3598939188262559
  (0, 2959)	0.2468450128533713
  (0, 2483)	0.3676519686797209
  (0, 267)	0.27010124977708766
  (1, 16799)	0.30071745655510157
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (1, 2813)	0.19094574062359204
  (1, 2223)	0.3827320386859759
  (1, 1894)	0.15521974226349364
  (1, 1497)	0.2939891562094648
  (2, 15611)	0.41544962664721613
  (2, 9620)	0.49351492943649944
  (2, 5968)	0.3474613386728292
  (2, 5389)	0.3866530551182615
  (2, 3103)	0.46097489583229645
  :	:
  (20797, 13122)	0.2482526352197606
  (20797, 12344)	0.27263457663336677
  (20797, 12138)	0.24778257724396507
  (20797, 10306)	0.08038079000566466
  (20797, 9588)	0.174553480255222
  (20797, 9518)	0.295420

In [75]:
#Splitting the dataset to training and test data

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state = 2, stratify = y)



In [58]:
print(x_train.shape, x_test.shape)

(16640, 17128) (4160, 17128)


Training the model

In [35]:
model = LogisticRegression() 

model.fit(x_train , y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Evaluation 

Metrics used is Accuracy Score

In [36]:
# accuracy on training data - 

x_train_prediction = model.predict(x_train)

training_data_accuracy = accuracy_score(x_train_prediction, y_train)

print(training_data_accuracy)

0.9865985576923076


In [37]:
# accuracy on testing data - 

x_test_prediction = model.predict(x_test)

test_data_accuracy = accuracy_score(x_test_prediction, y_test)

print(test_data_accuracy)

0.9790865384615385


Making a predictive system

In [44]:
df_testing = pd.read_csv('/content/test.csv')

print(df_testing.columns)

Index(['id', 'title', 'author', 'text'], dtype='object')


In [83]:
df_testing.head()

Unnamed: 0,id,title,author,text,content
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",david streitfeld specter trump loosen tongu pu...
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,russian warship readi strike terrorist near al...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,common dream nodapl nativ american leader vow ...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",daniel victor tim tebow attempt anoth comeback...
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,truth broadcast network keiser report meme war e


In [45]:
df_testing.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [48]:
df_testing = df_testing.fillna("")

In [50]:
df_testing.isnull().sum()

id        0
title     0
author    0
text      0
dtype: int64

In [51]:
df_testing['content'] = df_testing['author']+" "+df_testing['title']

In [52]:
df_testing['content'] = df_testing['content'].apply(stemming)

In [59]:
print(df_testing['content'])

0       david streitfeld specter trump loosen tongu pu...
1       russian warship readi strike terrorist near al...
2       common dream nodapl nativ american leader vow ...
3       daniel victor tim tebow attempt anoth comeback...
4        truth broadcast network keiser report meme war e
                              ...                        
5195    jodi rosen bangladeshi traffic jam never end n...
5196    sheryl gay stolberg john kasich sign one abort...
5197    mike mcphate california today exactli sushi ne...
5198                us marin deploy russian border norway
5199        teddi wayn awkward sex onscreen new york time
Name: content, Length: 5200, dtype: object


In [60]:
# x = df_testing['content'].values
z= df_testing['content'].values

print(z)

['david streitfeld specter trump loosen tongu purs string silicon valley new york time'
 'russian warship readi strike terrorist near aleppo'
 'common dream nodapl nativ american leader vow stay winter file lawsuit polic'
 ... 'mike mcphate california today exactli sushi new york time'
 'us marin deploy russian border norway'
 'teddi wayn awkward sex onscreen new york time']


In [76]:
z = vectorizer.transform(z)

In [None]:
print(z)

In [78]:
z_test_prediction = model.predict(z)

In [84]:
d = pd.read_csv('/content/submit.csv')

y = d['label'].values

In [85]:
#Prediction the model


test_data_prediction = accuracy_score(z_test_prediction, y)

print(test_data_prediction)

0.6440384615384616


In [79]:
def prediction_model(filepath=None):
  if filepath == None:
    return Null

  df_check = pd.read_csv(filepath)
  df_check = df_check.fillna("")
  df_check['content'] = df_check['author']+" "+df_check['title']
  df_check['content'] = df_check['content'].apply(stemming)
  prediction_data = df_check['content'].values
  prediction_data = vectorizer.transform(prediction_data)
  predicted_values = model.predict(z)

  return predicted_values
