In [2]:
'''
Student ID: 
Author: 
'''
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score
import joblib

# Data Preprocessing

In [6]:
def data_preprocessing():
    # read the news.csv file 
    df = pd.read_csv("news.csv", sep=",")
    df.columns = ["id", "title", "text", "label"]
    # Label "REAL": 1 "FAKE": 0
    df.loc[:, "label"] = df["label"].apply(lambda x: 1 if x == "REAL" else 0)
    
    # Split the dataset to 80% training set and 20% testing set.
    X = df['title'] + ". " + df["text"]
    y = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, stratify = y, random_state=100)
    
    return X_train, X_test, y_train, y_test
    


def check_ratio(y_train, y_test):
    # The ratio of real-to-fake news are roughly the same in both training and testing sets
    train_real, train_fake, test_real, test_fake = 0, 0, 0, 0
    for i in y_train:
        if i == 1: train_fake += 1
        else: train_real +=1
    for j in y_test:
        if j == 1: test_fake += 1
        else: test_real += 1
    print(train_real, train_fake, test_real, test_fake)
    print("Real-to-fake new ratio in training sets:", train_real/train_fake)
    print("Real-to-fake new ratio in testing sets:", test_real/test_fake)
    print("Thus the ratio of real-to-fake news roughly the same in both training and testing sets")

X_train, X_test, y_train, y_test = data_preprocessing()
check_ratio(y_train, y_test)
  

2531 2537 633 634
Real-to-fake new ratio in training sets: 0.9976350019708317
Real-to-fake new ratio in testing sets: 0.998422712933754
Thus the ratio of real-to-fake news roughly the same in both training and testing sets


# Training Logistic Regression Models with Adding Bi-Grams to the Model

In [7]:

def LR_with_CountVectorizer(X_train, y_train):
    # Prepare pipeline building up using sklearn's CounterVectorizer
    pipe_count = Pipeline([
        # Add bigram in CounterVectorizer
        ('vec', CountVectorizer(ngram_range=(2,2))),
        ('log', LogisticRegression())
    ])
    pipe_count.fit(X_train, y_train)
    
    return pipe_count

def LR_with_TfidfVectorizer(X_train, y_train):
    # Prepare pipeline building up using sklearn's TfidfVectorizer
    pipe_tfidf = Pipeline([
        # Add bigram in TfidfVectorizer
        ('vec', TfidfVectorizer(ngram_range=(2,2))), #(1,2)
        ('log', LogisticRegression())
    ])
    pipe_tfidf.fit(X_train, y_train)
    
    return pipe_tfidf

def evalution(pipe_count_model, pipe_tfidf_model, X_test, y_test):
    # Compute (i) accuracy, (ii) precision and (iii) recall based on the testing set.
    # Evaluate the model1 
    print(" Evalluate model: Logistic Regression Models with Adding Bi-Grams using CounterVectorizer")
    # pipe_count_model = LR_with_CountVectorizer(X_train, y_train)
    y_pred_count = pipe_count_model.predict(X_test)
    print("The following metrics of model1: ")
    print("Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred_count)))
    print("Precision: {:.4f}".format(precision_score(y_test, y_pred_count)))
    print("Accuracy: {:.4f}".format(recall_score(y_test, y_pred_count)))
    
    
    # Evaluate the model2
    print(" Evalluate model: Logistic Regression Models with Adding Bi-Grams using TfidfVectorizer")
    # pipe_tfidf_model = LR_with_TfidfVectorizer(X_train, y_train)
    y_pred_tfidf = pipe_tfidf_model.predict(X_test)
    print("The following metrics of model2: ")
    print("Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred_tfidf)))
    print("Precision: {:.4f}".format(precision_score(y_test, y_pred_tfidf)))
    print("Accuracy: {:.4f}".format(recall_score(y_test, y_pred_tfidf)))


def save_model(pipe_count_model, pipe_tfidf_model):
    # Save the models in a .pkl file using joblib
    joblib.dump(pipe_count_model, 'count_model.pkl')
    joblib.dump(pipe_tfidf_model, 'tfidf_model.pkl')
    
    


In [8]:
if __name__ == "__main__":
    # Train the model
    model1 = LR_with_CountVectorizer(X_train, y_train)
    model2 = LR_with_TfidfVectorizer(X_train, y_train)
    
    # Evaluate the model
    evalution(model1, model2, X_test, y_test)
    
    #Save the model
    save_model(model1, model2)
    

 Evalluate model: Logistic Regression Models with Adding Bi-Grams using CounterVectorizer
The following metrics of model1: 
Accuracy: 0.9132
Precision: 0.9396
Accuracy: 0.8833
 Evalluate model: Logistic Regression Models with Adding Bi-Grams using TfidfVectorizer
The following metrics of model2: 
Accuracy: 0.9155
Precision: 0.9355
Accuracy: 0.8927
