In [1]:
import pandas as pd
import re
import pickle

# Join CSV

In [2]:
true_news = pd.read_csv('True.csv')

In [3]:
news = []
for row in true_news.iterrows():
    dict_row = {
        'text' : row[1]['text'],
        'label': 1
    }
    news.append(dict_row)

In [4]:
fake_news = pd.read_csv('Fake.csv')

In [5]:
for row_fake in fake_news.iterrows():
    dict_row_fake = {
        'text': row_fake[1]['text'],
        'label' : 0
    }
    news.append(dict_row_fake)

In [6]:
df_news = pd.DataFrame(news)

In [7]:
df_news

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1
...,...,...
44893,21st Century Wire says As 21WIRE reported earl...,0
44894,21st Century Wire says It s a familiar theme. ...,0
44895,Patrick Henningsen 21st Century WireRemember ...,0
44896,21st Century Wire says Al Jazeera America will...,0


# Preprocessing

In [8]:
for row in df_news.iterrows():
    sentence = row[1]['text']
    sentence = re.sub(r"[^a-zA-Z0-9\s]", r"", sentence)
    df_news['text'][row[0]] = sentence.lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [9]:
df_news

Unnamed: 0,text,label
0,washington reuters the head of a conservative...,1
1,washington reuters transgender people will be...,1
2,washington reuters the special counsel invest...,1
3,washington reuters trump campaign adviser geo...,1
4,seattlewashington reuters president donald tr...,1
...,...,...
44893,21st century wire says as 21wire reported earl...,0
44894,21st century wire says it s a familiar theme w...,0
44895,patrick henningsen 21st century wireremember ...,0
44896,21st century wire says al jazeera america will...,0


# Training

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC



In [11]:
tfidf = TfidfVectorizer(min_df=0.1, 
                        max_df=0.8, 
                        max_features = 200000, 
                        ngram_range = (1, 1))

vectorizer_name = "model/vectorizer.pickle"
features = tfidf.fit_transform(df_news.text).toarray()

pickle.dump(tfidf.vocabulary_, open(vectorizer_name, 'wb'))
labels = df_news.label

In [21]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    labels, 
                                                    test_size=0.15, 
                                                    random_state=7)

model = LinearSVC()
model = model.fit(X_train, y_train)
model_name = 'model/finalized_model.sav'

# save model
pickle.dump(model, open(model_name, 'wb'))

# Predict Accuracy

In [22]:
loaded_model = pickle.load(open(model_name, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9948032665181886


# Predict Given Text

In [23]:
texts = ['at the end of the year corona will kill million people']

In [24]:
#load vocab
vocab = pickle.load(open("model/vectorizer.pickle", "rb"))
vectorizer = TfidfVectorizer(min_df=0.1, 
                                    max_df=0.8, 
                                    max_features = 200000, 
                                    ngram_range = (1, 1), 
                                    vocabulary=vocab)

vectorizer.fit(texts)

text_features = vectorizer.transform(texts)

#make predictions
predictions = loaded_model.predict(text_features)
print(predictions)

[0]
