In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [2]:
#datasets
real = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

In [3]:
#add a column to data to indicate if it's from the real or fake news sets
fake['type'] = 0
real['type'] = 1

In [4]:
modify_text = [real['text'][i].replace('WASHINGTON (Reuters) - ', '').replace('BRUSSELS (Reuters) - ', '').replace('MINSK (Reuters) - ', '').replace('MOSCOW (Reuters) - ', '').replace('JAKARTA (Reuters) - ', '').replace('LONDON (Reuters) - ', '').replace('(Reuters) - ','').replace('LIMA (Reuters) - ','').replace('SAN FRANCISCO (Reuters) - ','').replace('MEXICO CITY (Reuters) - ','') for i in range(len(real['text']))]
real['text'] = modify_text

In [5]:
df = pd.concat([fake, real], axis=0) #combine the two datasets into one, x axis
df = df.drop(columns=['subject', 'date']) # drop subject and date column
df = df.sample(frac = 1) #shuffle data
df = df.reset_index(drop=True) #drops the current index of the df and replaces it with an index of increasing integers, because data is shuffled/out of order
df

Unnamed: 0,title,text,type
0,Trump expected to nominate attorney Sullivan a...,President Donald Trump is expected to nominate...,1
1,[VIDEO] WHAT JERRY SEINFELD HAS TO SAY ABOUT O...,"Kids Just Want To Use Words That s racist, th...",0
2,Puerto Rico enacts emergency debt moratorium bill,SAN JUAN Puerto Rico’s governor on Wednesday s...,1
3,TUCKER CARLSON Exposes Radical Middle School T...,"Tonight, Tucker Carlson took on Yvette Felarca...",0
4,“President Trump Fights Fire with Fire” Sarah ...,John Roberts was first to question the tweets ...,0
...,...,...,...
44893,Australia arrests man accused of trying to sel...,MELBOURNE Australian police said on Sunday the...,1
44894,Watch Joe Biden Take Trump’s Own Words And Te...,If there s anyone in this day and age who most...,0
44895,"Australians give up 51,000 illegal guns as gov...","SYDNEY Australians turned in 51,000 illegal fi...",1
44896,A kiss is not just a kiss on Cleveland convent...,CLEVELAND In an age of social media and sound ...,1


In [6]:
X = df['text'] #input set
y = df['type'] #output set

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True ) #split into train and test 

# Convert text to vectors
vectorization = TfidfVectorizer(stop_words='english', max_df=0.7)
Xv_train = vectorization.fit_transform(X_train)
Xv_test = vectorization.transform(X_test)

# train model & calculate accuracy
model = DecisionTreeClassifier()
model.fit(Xv_train, y_train)
predictions = model.predict(Xv_test)
score = accuracy_score(y_test, predictions)
print("Accuracy score: {0:.2f} %".format(100 * score))

Accuracy score: 94.57 %


In [31]:
news = input()

Kenya’s main opposition party warned the election commission against reorganising voting in four western counties where polls did not open on Thursday due to violence and said supporters should boycott any process that goes ahead.


In [17]:
def ouiOuNon(n):
    if n == 0:
        print("Seems like fake news...")
    else:
        print("This seems real to me!")
        

In [14]:
def detect(test):
    vector_test = vectorization.transform([test])
    prediction = model.predict(vector_test)
    return prediction

In [32]:
ouiOuNon(detect(news))

This seems real to me!


In [26]:
news = input()

Since then, the Russia investigation has revealed a sprawling scandal: Members of Trump’s campaign, including those in the president’s inner circle, were in constant contact with representatives of the Russian government throughout the election and transition. The two campaigns discussed tactics and policy, including the release of “dirt” on their mutual opponent, Hillary Clinton, and rolling back American sanctions against Russia. And they executed their strategies timed to maximally benefit Trump’s chances of victory.


In [27]:
ouiOuNon(detect(news))

Seems like fake news...


In [9]:
filename = 'model.pkl'
file_obj = open(filename, 'wb')
pickle.dump(model, file_obj)
v = 'tfidf_vectorizer.pickle'
v_obj = open(v, 'wb')
pickle.dump(vectorization, v_obj)
file_obj.close()
v_obj.close()
