In [19]:
# Import necessary libraries
import nltk.downloader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import pickle
from termcolor import colored

# Download the NLTK resources used further down in this notebook.
# nltk.download returns a status flag; the last call's value is what the cell displays.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer
nltk.download('stopwords')  # word lists read through stopwords.words('english')

[nltk_data] Downloading package punkt to /home/jparep/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/jparep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jparep/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
# ---- Reproducibility -------------------------------------------------------
# One seed shared by NumPy's global generator and the split helpers below.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# ---- File locations --------------------------------------------------------
# Raw input CSVs (one per class) and the pickled artifacts written after training.
FAKE_CSV_PATH = '/home/jparep/proj/nlp-tweet-analysis/data/raw/fake.csv'
REAL_CSV_PATH = '/home/jparep/proj/nlp-tweet-analysis/data/raw/true.csv'
MODEL_PATH = '/home/jparep/proj/nlp-tweet-analysis/model/model.pkl'
VECTORIZER_PATH = '/home/jparep/proj/nlp-tweet-analysis/model/vectorizer.pkl'

# ---- Text-processing helpers -----------------------------------------------
# Initialize the lemmatizer and the English stop-word set once, so the
# per-document preprocessing only performs lookups.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [21]:
def read_csv_files(real_csv, fake_csv, random_state=None):
    """Load the real and fake CSV files, label each row, and return one shuffled frame.

    Parameters
    ----------
    real_csv : str or path-like
        CSV handed to ``pd.read_csv``; every row from it gets ``label == 'real'``.
    fake_csv : str or path-like
        CSV handed to ``pd.read_csv``; every row from it gets ``label == 'fake'``.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` for the shuffle. With the default
        ``None`` the shuffle draws from NumPy's global random state, exactly as
        before, so its order depends on whatever consumed that state earlier.
        Pass an int to get the same row order on every call.

    Returns
    -------
    pandas.DataFrame
        All rows of both files plus a ``label`` column, in shuffled order,
        with a fresh ``0..n-1`` index.
    """
    df_fake = pd.read_csv(fake_csv)
    df_real = pd.read_csv(real_csv)

    df_fake['label'] = 'fake'
    df_real['label'] = 'real'

    combined = pd.concat([df_fake, df_real], axis=0)
    # frac=1 draws every row exactly once, i.e. a full shuffle of the stacked frames.
    shuffled = combined.sample(frac=1, random_state=random_state)
    # Drop the duplicated per-file indices left over from the concat.
    return shuffled.reset_index(drop=True)

In [28]:
def preprocess_text(text):
    """Normalise one document into a space-joined string of lemmatised tokens.

    Steps: replace every run of non-alphanumeric characters with a single
    space, lower-case, tokenise with ``word_tokenize``, drop tokens found in
    ``stop_words``, lemmatise the rest with ``lemmatizer`` and join them with
    single spaces.

    Parameters
    ----------
    text : str
        Raw document. Missing values (``None``, ``NaN``, ``pd.NA``) are
        accepted and yield ``''``; any other non-string scalar is converted
        with ``str()`` first.

    Returns
    -------
    str
        The cleaned text; empty when nothing survives the filtering.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as NaN/None; re.sub would raise TypeError on
        # them and abort the whole DataFrame.apply.
        if pd.isna(text):
            return ''
        text = str(text)

    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text).lower()
    tokens = word_tokenize(text)
    # Stop-word check happens after lower-casing so it matches the lower-case word list.
    lem = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(lem)

In [36]:
def load_and_preprocess_data():
    """Load both CSV files, clean the text and encode the labels numerically.

    Reads ``REAL_CSV_PATH`` and ``FAKE_CSV_PATH`` through ``read_csv_files``,
    keeps only the ``text`` and ``label`` columns, adds ``processed_text``
    (the result of ``preprocess_text`` on each ``text`` value) and maps the
    string labels to integers: ``real -> 0``, ``fake -> 1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``label`` (0/1) and ``processed_text``.
    """
    raw = read_csv_files(REAL_CSV_PATH, FAKE_CSV_PATH)
    # Take an explicit copy of the column subset: assigning new columns to a
    # slice of another frame raises SettingWithCopyWarning.
    df = raw[['text', 'label']].copy()
    df['processed_text'] = df['text'].apply(preprocess_text)
    df["label"] = df["label"].map({"real": 0, "fake": 1})
    return df

In [37]:
# Build the labelled, preprocessed frame and preview its first five rows
df = load_and_preprocess_data()
df.head()

Unnamed: 0,text,label,processed_text
0,Anyone who uses a driving service such as Uber...,1,anyone us driving service uber lyft take risk ...
1,WASHINGTON (Reuters) - President Donald Trump ...,0,washington reuters president donald trump said...
2,ISTANBUL (Reuters) - A Turkish court on Wednes...,0,istanbul reuters turkish court wednesday order...
3,Dinesh D Souza s Hillary s America will debu...,1,dinesh souza hillary america debut theater jul...
4,"When on the campaign trail, those running for ...",1,campaign trail running office like use music b...


In [38]:
# Train-validate-test split
def train_valid_test_split(X, y, train_size=0.7, valid_size=0.15, test_size=0.15):
    """Split ``X`` and ``y`` into train, validation and test parts.

    The split is done in two passes of ``train_test_split``, both seeded with
    ``RANDOM_SEED``: first the train part is separated from a hold-out of
    ``valid_size + test_size``, then that hold-out is divided into validation
    and test according to their relative sizes.

    Parameters
    ----------
    X, y : array-like
        Features and targets of equal length.
    train_size, valid_size, test_size : float
        Fractions of the data for each part. Each must be positive and the
        three must add up to 1.

    Returns
    -------
    tuple
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.

    Raises
    ------
    ValueError
        If a fraction is not positive or the fractions do not sum to 1.
    """
    sizes = (train_size, valid_size, test_size)
    if any(size <= 0 for size in sizes):
        raise ValueError(
            f"train_size, valid_size and test_size must all be positive, got {sizes}"
        )
    # train_size is implied by the other two; reject inconsistent requests
    # instead of silently ignoring the value the caller asked for.
    if abs(sum(sizes) - 1.0) > 1e-8:
        raise ValueError(
            f"train_size + valid_size + test_size must equal 1, got {sum(sizes)}"
        )

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=(valid_size + test_size), random_state=RANDOM_SEED)

    # Share of the hold-out that belongs to validation; the remainder is test.
    ratio = valid_size / (valid_size + test_size)

    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=(1.0 - ratio), random_state=RANDOM_SEED)
    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [40]:
# Separate features from target, then create the train/validation/test split
X, y = df['processed_text'], df['label']
(X_train, X_valid, X_test,
 y_train, y_valid, y_test) = train_valid_test_split(X, y)

In [43]:
# Vectorize data
def vectorize_data(X_train, X_valid, X_test):
    """Turn the three text splits into TF-IDF matrices and persist the vectorizer.

    The vectorizer is fitted on the training texts only. Validation and test
    texts are transformed with that same fitted vocabulary and IDF weights, so
    all three matrices share one feature space and nothing from the held-out
    splits influences the fit.

    Parameters
    ----------
    X_train, X_valid, X_test : iterable of str
        Preprocessed documents for each split.

    Returns
    -------
    tuple
        ``(xv_train, xv_valid, xv_test, vectorizer)`` - the three transformed
        matrices and the fitted ``TfidfVectorizer``.

    Side effects
    ------------
    Pickles the fitted vectorizer to ``VECTORIZER_PATH``.
    """
    vectorizer = TfidfVectorizer()
    xv_train = vectorizer.fit_transform(X_train)
    # transform (not fit_transform): refitting here would replace the training
    # vocabulary, give each split a different number of columns, and leave the
    # pickled vectorizer fitted on the test set.
    xv_valid = vectorizer.transform(X_valid)
    xv_test = vectorizer.transform(X_test)

    with open(VECTORIZER_PATH, 'wb') as f:
        pickle.dump(vectorizer, f)

    return xv_train, xv_valid, xv_test, vectorizer

# Vectorize the three splits and keep the vectorizer object that is returned
(xv_train, xv_valid, xv_test,
 vectorizer) = vectorize_data(X_train, X_valid, X_test)

ValueError: too many values to unpack (expected 3)