# **IMPORTS**

---

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import math
import nltk
import os
import re

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords



from sklearn.model_selection import train_test_split as tts
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

```
⠀⠀⠀⠀⠀⠀⣀⣤⡤
⠀⠀⠀⠀⢀⣾⣿⠋
⠀⠀⠀⣠⣾⣿⡟
⠀⠀⢸⠛⠉⢹⠃⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⡠⠄⠠⣀
⠀⠀⡘⠀⠀⠀⡀⠀⠀⠀⠀⠀⠀⠀⠀⣠⠖⠉⠀⠀⠀⣾⣿⣦⡀
⠀⠀⡇⠀⠀⠀⢡⠄⠀⠀⣀⣀⣀⣠⠊⠀⠀⠀⠀⡠⠞⠛⠛⠛⠛⡀
⠀⠀⢃⠀⠀⠀⠀⠗⠚⠉⠉⠀⠈⠁⠀⠀⠀⢀⡔⠁⠀
⠀⠀⠸⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣴⣶⣄⠲⡎
⠀⠀⠀⠃⠀⠀⢠⣤⡀⠀⠀⠀⠀⣿⣿⣿⠀⠘⡄
⠀⠀⠀⡆⠀⠀⣿⣿⡇⠀⠀⠀⠀⠈⠛⠉⣴⣆⢹⡄
⠀⠀⠀⣇⢰⡧⣉⡉⠀⠀⢀⡀⠀⣀⣀⣠⣿⡷⢠⡇
⠀⠀⠀⢻⠘⠃⠈⠻⢦⠞⠋⠙⠺⠋⠉⠉⠉⢡⠟
⠀⠀⠀⠀⠳⢄⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢠⠋⠀⠀
```

# **SETTINGS**
---

In [5]:
# Matplotlib inline to visualize Matplotlib graphs
%matplotlib inline

# Configuration to set so that all the Seaborn figures come out with this size
%config Inlinebackend.figure_format= 'retina'

In [6]:
# Set the Seaborn context to "poster" for larger text and figures
sns.set_context("poster")

# Set the default figure size for Seaborn plots
sns.set(rc={"figure.figsize": (12., 6.)})

# Set the Seaborn style to "whitegrid" for a white background with gridlines
sns.set_style("whitegrid")

In [7]:
# Set the max displayable columns to max
pd.set_option('display.max_columns', None)

In [8]:
# Activates XLA (for JIT compiler)
os.environ["TF_XLA_FLAGS"]= "--tf_xla_enable_xla_devices"

# Uses the right memory when using GPU
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]= "true"

# **DOCUMENTATION**
---

[placeholder](https://google.com)

# **DATA**

---

In [9]:
# Data loading
train_path= "data/training_data.csv"
test_path=  "data/testing_data.csv"

# Since the csv separates the columns with " " instead of a comma, we define <sep> as None
# We use engine= "python" to avoid warning about \c or \b imports from text
train_data=  pd.read_csv(train_path, names= ["label", "text"], header=None, sep=None, engine= "python")
test_data=   pd.read_csv(test_path, names= ["label", "text"], header=None, sep=None, engine= "python")

In [10]:
# Dataframe for training creation
df_train= pd.DataFrame(train_data)
df_train.head(3)

Unnamed: 0,label,text
0,﻿0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...


In [11]:
# Dataframe for testing creation
df_test= pd.DataFrame(test_data)
df_test.head(3)

Unnamed: 0,label,text
0,﻿2,copycat muslim terrorist arrested with assault...
1,2,wow! chicago protester caught on camera admits...
2,2,germany's fdp look to fill schaeuble's big shoes


>`0` is real new, `1` is fake new, `2` is the prediction we need to do at the end

# **DATA CHECKING**

---

In [12]:
# 34k rows of headers articles
# 2 columns: text and the label.
print(f"Rows: {df_train.shape[0]}\nColumns: {df_train.shape[1]}")

Rows: 34152
Columns: 2


In [13]:
df_train.isnull().sum()

label    0
text     0
dtype: int64

In [14]:
df_train.sample(1)

Unnamed: 0,label,text
28091,1,kerry trip to cuba for rights dialogue cancele...


# **FUNCTIONS**

---

In [15]:
def clean_text(text):
  text= text.lower()                      # ensures lowercase
  text= re.sub(r"\t", " ", text)          # erases \t from text
  text= re.sub(r"[^a-z0-9\s]", "", text)  # erases symbols
  text= re.sub(r"\s+", " ", text).strip() # erases initial-end blank spaces

  return text

In [19]:
def lemmatize_text(text):
  lemma= WordNetLemmatizer()                        # creates lemmatizer
  stop_words= set(stopwords.words('english'))       # load stopwords on english

  words= text.split()                               # split the text if not
  words= [lemma.lemmatize(w) for w in words         # apply lemmatizer to all words
           if w not in stop_words and len(w) > 2]
  
  return " ".join(words)

In [17]:
def train_and_evaluate(model ,model_name, X_train, X_test, y_train, y_test):
  print(f"Training {model_name}\n{50*"-"}")

  model.fit(X_train, y_train)   # fit the model
  y_pred= model.predict(X_test) # predict with X_test

  # metrics
  print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
  print(classification_report(y_test, y_pred, target_names=["Fake", "Real"]))
  print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

In [None]:
def split_data(X, y):
  # y is messed with '\ufeff0' dtype
  y= y.astype(str).str.replace(r"[^0-9]", "", regex=True).astype(int)

  X_train, X_test, y_train, y_test= tts(X, y, test_size= .2, stratify= y,  random_state= 69)
  print(f"X-train: {X_train.shape}\nX-test: {X_test.shape}\ny-train: {y_train.shape}\ny-test: {y_test.shape}")
  
  return X_train, X_test, y_train, y_test

#### Train/Test Split Failure

**Error:**  
`ValueError: The least populated class in y has only 1 member`

**Cause:**  
The dataset contained a hidden label (`"﻿0"`, BOM character) with only **one sample**.  
`stratify` requires **at least two samples per class**, so the split failed.

**Fix:**  
Clean the labels before splitting:

```python
y = y.astype(str).str.replace(r"[^0-9]", "", regex=True).astype(int)
```


> this sould be on celaning data, but putting on the split_data func. is easier and quick (know as bad practice, ik)

In [None]:
def fit_transform_tfidf(X_train, X_test, tfidf_vectorizer):
  X_train_tfidf= tfidf_vectorizer.fit_transform(X_train)
  X_test_tfidf=  tfidf_vectorizer.transform(X_test)

  print(f"TF-IDF shape: {X_train_tfidf.shape}")
  return X_train_tfidf, X_test_tfidf

# **DATA PRRE-PROCESSING**

---

## Cleaning data

In [20]:
df_train["clean_text"]= df_train["text"].apply(clean_text)
df_train["lemmatized_text"]= df_train["clean_text"].apply(lemmatize_text)

In [24]:
df_train.sample(3)

Unnamed: 0,label,text,clean_text,lemmatized_text
3224,0,conservative calls president obama a ‚muslim‚ ...,conservative calls president obama a muslim ov...,conservative call president obama muslim resol...
12263,0,the numbers are in: here‚s how the catholics v...,the numbers are in heres how the catholics vot...,number here catholic voted historic election
18587,1,u.s. house clears path for tax bill with budge...,us house clears path for tax bill with budget ...,house clear path tax bill budget approval


## Split the data

In [None]:
y= df_train.label
X= df_train["text"]
X_clean= df_train["clean_text"]
X_lemma= df_train["lemmatized_text"]
# call for split_data(X, y) when requires (we're gonna do some tests)

In [49]:
X_train, X_test, y_train, y_test= split_data(X= X, y= y)

X-train: (27321,)
X-test: (6831,)
y-train: (27321,)
y-test: (6831,)


In [34]:
y.value_counts()

label
0     17571
1     16580
﻿0        1
Name: count, dtype: int64

## **VECTORIZATION I**

We know about `BoW`, but wince `TfidfVectorizer` is just better we don't want to waste time using a method that is just worse. We'll be using directly TfidfVectorizer.

In [44]:
tfidf_vectorizer= TfidfVectorizer(
  lowercase=    True,       # ensures lowercase
 #stop_words=   "english",  # stopwords for english lang !!! MAY WORK GRONG WITH LEMMATIZED DATA
  max_features= 5000,       # 5000 more common words
  ngram_range=  (1, 2),     # unigrams and bigrams
  max_df=       .9,         # ignore words that appears more than 90% on all documents
  min_df=       5           # ignore words that appears more than 5 times in different documents
)

In [52]:
X_train_tfidf, X_test_tfidf= fit_transform_tfidf(X_test= X_test, X_train= X_train, tfidf_vectorizer= tfidf_vectorizer)

TF-IDF shape: (27321, 5000)


## Tests

### Using original data

In [53]:
lr_model= LogisticRegression(max_iter= 1000, random_state= 69)
train_and_evaluate(lr_model, "Logistic Regression",  X_train_tfidf, X_test_tfidf, y_train= y_train, y_test= y_test)

Training Logistic Regression
--------------------------------------------------
Accuracy Score: 0.9392475479432001
              precision    recall  f1-score   support

        Fake       0.95      0.93      0.94      3515
        Real       0.93      0.95      0.94      3316

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831

Confusion Matrix:
[[3282  233]
 [ 182 3134]]
