# Fake news classifier

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Download and unzip data unless the file already exists.
if not os.path.exists("./fake.json.zip"):
  !wget -O fake.json.zip https://www.dropbox.com/s/fs613hv1u24cjb9/fake_news_reddit_cikm20.json.zip?dl=0
  !unzip fake.json.zip

In [None]:
# Load the dataset into a DataFrame (presumably the JSON file extracted from
# the downloaded archive by the cell above).
df = pd.read_json("fake_news_reddit_cikm20.json")

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
r = np.random.RandomState(42)
# Note: to test out different ideas or to look at parts of the data, work on a
# small random sample by uncommenting the following line.
#sample = df.sample(n=5000, replace=False, random_state=r)

# Comment this out if using a smaller sample.
sample = df.copy()

# Calculate tfidf vectors
corpus = sample["text"]

# min_df=2 ignores terms that occur in fewer than two documents;
# ngram_range=(1, 1) restricts the vocabulary to unigrams.
tfidf_vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 1))
# Note: also tested with ngram sizes (1,2) and (2,2);
# ngram size (1,2) did not improve the results notably.

# fit_transform learns the vocabulary/idf weights and vectorizes the corpus in
# a single pass; it is equivalent to fit(corpus) followed by transform(corpus).
tfidf_vectors = tfidf_vectorizer.fit_transform(corpus)

In [None]:
# Essential code for loading the previously saved vectorizer, vectors and model.
import joblib
# NOTE(review): joblib.load unpickles its input, so only load files that come
# from a trusted source.
vectorizer2 = joblib.load("tfidf_vectorizer.joblib")
tfidf_vectors2 = joblib.load("tfidf_vectors.joblib")
model2 = joblib.load("logreg-model-with-all-data-balanced-2.joblib")
# Split the loaded vectors with the same test_size and random_state as the
# split cell further down. This rebinds X, y and the train/test variables that
# that cell also assigns, so whichever cell ran last wins.
X = tfidf_vectors2
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [None]:
# Code for saving the fitted vectorizer and the feature vectors.
# These are the same file names that the loading cell reads back.
import joblib
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(tfidf_vectors, "tfidf_vectors.joblib")

In [None]:
# Code for generating a wordcloud from the highest-weighted terms of one document.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Look the terms up in the vectorizer that was loaded together with
# tfidf_vectors2, so column indices and vocabulary are guaranteed to match and
# the cell does not depend on variables defined in later cells.
# get_feature_names() was removed in scikit-learn 1.2, hence the fallback.
if hasattr(vectorizer2, "get_feature_names_out"):
    wordcloud_terms = vectorizer2.get_feature_names_out()
else:
    wordcloud_terms = vectorizer2.get_feature_names()

# Dense tf-idf weights of the document at row 19912, one entry per term.
tf_idf_row_as_list = np.squeeze(tfidf_vectors2[19912].toarray())
# Column indices ordered from the highest to the lowest weight.
indices = np.argsort(tf_idf_row_as_list)[::-1]

# Map each of the 50 highest-weighted terms to its tf-idf weight.
word_cloud_dict = {}
for i in indices[0:50]:
    term = str(wordcloud_terms[i])
    print(term)
    word_cloud_dict[term] = tf_idf_row_as_list[i]

wordcloud = WordCloud().generate_from_frequencies(word_cloud_dict)

plt.figure(figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig("wordcloud.png")
#plt.show()

In [None]:
# Turn arbitrary text into a feature vector with the loaded vectorizer.
# transform() expects an iterable of documents, hence the one-element list.
example_document = vectorizer2.transform(['text goes here'])

In [None]:
# Split into training and test data: 10% is held out for testing, and the
# fixed random_state makes the split reproducible.
X = tfidf_vectors
y = sample["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [None]:
# In case we want to have a look at the words.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that lack it.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    # Converted to a plain list because later code calls features.index(...),
    # which the returned NumPy array does not provide.
    features = list(tfidf_vectorizer.get_feature_names_out())
else:
    features = tfidf_vectorizer.get_feature_names()
# One row per feature; column 0 holds the term at that feature index.
features_df = pd.DataFrame(features)

In [None]:
# Use this to track down the documents in which a specific feature is present.
def doc_indexes_where_feature_present(tfidf_vectors, features, feature_name):
    """Return the row indices of the documents that contain *feature_name*.

    Args:
        tfidf_vectors: 2-D matrix that supports column slicing and
            ``.nonzero()`` (e.g. a SciPy sparse matrix).
        features: Sequence of feature names whose positions correspond to the
            columns of ``tfidf_vectors``. Lists, tuples and NumPy arrays are
            all accepted.
        feature_name: The feature to look for.

    Returns:
        Array of row indices whose entry in the feature's column is non-zero.

    Raises:
        ValueError: If ``feature_name`` is not in ``features``.
    """
    # list(...) lets array-like inputs (such as the output of
    # get_feature_names_out()) work too; ndarrays have no .index method.
    col = list(features).index(feature_name)
    rows, _ = tfidf_vectors[:, col].nonzero()

    return rows

def get_non_zero_tf_idf_scores_by_feature_name(tfidf_vectors, features, feature_name):
    """Return the non-zero tf-idf scores stored for *feature_name*.

    Args:
        tfidf_vectors: 2-D sparse matrix that supports column slicing,
            ``.nonzero()`` and fancy row indexing (e.g. a SciPy CSR matrix).
        features: Sequence of feature names whose positions correspond to the
            columns of ``tfidf_vectors``. Lists, tuples and NumPy arrays are
            all accepted.
        feature_name: The feature whose scores are wanted.

    Returns:
        Dense matrix holding the score of every document in which the feature
        has a non-zero entry.

    Raises:
        ValueError: If ``feature_name`` is not in ``features``.
    """
    # list(...) lets array-like inputs (such as the output of
    # get_feature_names_out()) work too; ndarrays have no .index method.
    col = list(features).index(feature_name)
    rows, _ = tfidf_vectors[:, col].nonzero()

    # Get all tfidf scores for given feature name
    tf_idf_scores = tfidf_vectors[rows, col].todense()
    return tf_idf_scores
    

In [None]:
# Train the model
# class_weight="balanced" weights classes inversely to their frequency in
# y_train; max_iter=4000 raises the solver's iteration cap.
model = LogisticRegression(max_iter=4000, class_weight="balanced")
model.fit(X_train, y_train)
# Predict labels for the held-out test split; the cells below evaluate these.
predictions = model.predict(X_test)

In [None]:
# Check the scores
# Each (label, metric) pair is expanded into two print arguments, so the
# printed text is one metric per line computed on the held-out predictions.
print(
    *(
        part
        for label, metric in (
            ("Accuracy:", accuracy_score),
            ("\nPrecision:", precision_score),
            ("\nRecall:", recall_score),
            ("\nF1:", f1_score),
        )
        for part in (label, metric(y_test, predictions))
    )
)

In [None]:
# Plot a confusion matrix.
# confusion_matrix puts true labels on the rows and predicted labels on the
# columns.
mat = confusion_matrix(y_test, predictions)
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots()
# Plot the transpose so that true labels run along the x axis and predicted
# labels along the y axis, matching the axis labels set below.
# fmt="d" prints the annotated cell counts as integers.
sns.heatmap(mat.T, ax=ax, fmt="d", square=True, annot=True, cbar=False, cmap="YlGnBu")
ax.set_xlabel("True label")
ax.set_ylabel("Predicted label")

In [None]:
# Code for saving the trained logistic regression model to disk.
from joblib import dump, load
dump(model, "logreg-model-with-all-data-balanced.joblib")

In [None]:
# Code for loading the model
from joblib import dump, load
# Load the file written by the saving cell. The previous name
# ("tfidf-model-with-all-data-balanced.joblib") is never written by this
# notebook, so loading it fails unless that file exists from elsewhere.
# NOTE(review): load() unpickles its input, so only load trusted files.
mod = load("logreg-model-with-all-data-balanced.joblib")

In [None]:
# Give it a go with random forest
from sklearn.ensemble import RandomForestClassifier

# 200 trees with no depth limit and a fixed seed; fit() returns the fitted
# estimator itself, so construction and training are chained.
rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=0).fit(
    X_train, y_train
)
# NOTE: despite "train" in the name, these are predictions for the held-out
# test split.
y_predtrain_rf = rf.predict(X_test)