# Wykrywanie Fake Newsów z Wykorzystaniem Machine Learningu

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the labelled dataset of fake and real news articles from train.csv.
zbior = pd.read_csv("train.csv")


In [3]:
# Preview the first rows of the raw dataset.
zbior.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


## Przetworzenie danych

In [4]:
# We only work with the article body: drop the id, title and author columns.
zbior_wiadomosci = zbior.drop(["id", "title","author"], axis=1)

## Usuwamy wiersze z pustymi wartościami

In [5]:
# Count missing values in each remaining column.
zbior_wiadomosci.isnull().sum()

text     39
label     0
dtype: int64

In [6]:
# Drop every row that contains a missing value.
zbior_wiadomosci = zbior_wiadomosci.dropna()

In [7]:
# Re-count missing values to confirm that none remain.
zbior_wiadomosci.isnull().sum()

text     0
label    0
dtype: int64

In [8]:
# Shuffle the rows. The fixed random_state makes the resulting order
# reproducible across kernel restarts.
zbior_wiadomosci = zbior_wiadomosci.sample(frac = 1, random_state=0)

In [9]:
# Preview the shuffled data.
zbior_wiadomosci.head()

Unnamed: 0,text,label
16099,Former Cleveland Browns quarterback Johnny Man...,0
19231,Pieczenik: ‘There is a coup in the White House...,1
16359,Previous Why Are Russia and China Buying Up Al...,1
15848,It just goes to show you that you can't rely o...,1
2583,UNITED NATIONS — Holding photographs of dea...,0


#### Tworzenie funkcji do czyszczenia tekstu (znaki interpunkcyjne oraz końcówki leksykalne)

In [10]:
# Download the NLTK stopwords corpus.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jachos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Print the English stopword list, then the number of entries in it.
print(stopwords.words('english'))
print(len(stopwords.words('english')))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [12]:
# Porter stemmer instance used to strip word endings (see stemming()).
port_stem = PorterStemmer()

In [13]:
def clean(content):
    """Return `content` as lowercase text containing only ASCII letters and spaces.

    The argument is first converted with str(); every character outside
    a-z / A-Z is then replaced by a single space (nothing is collapsed or
    stripped), and the result is lowercased.
    """
    return re.sub('[^a-zA-Z]', ' ', str(content)).lower()


def stemming(content):
    """Drop English stopwords from `content` and Porter-stem the remaining words.

    Args:
        content: Text to process; it is split on whitespace.

    Returns:
        The stems of all non-stopword tokens, joined by single spaces.
    """
    # Fetch the stopword list once per call and keep it in a set. The previous
    # version called stopwords.words('english') for every single word and
    # scanned the returned list linearly, which made this function very slow.
    english_stopwords = set(stopwords.words('english'))
    kept_stems = [
        port_stem.stem(word)
        for word in content.split()
        if word not in english_stopwords
    ]
    return ' '.join(kept_stems)


In [14]:
# Apply clean() to every article, overwriting the "text" column.
zbior_wiadomosci["text"] = zbior_wiadomosci["text"].apply(clean)

In [15]:
# Preview the cleaned texts.
zbior_wiadomosci.head()

Unnamed: 0,text,label
16099,former cleveland browns quarterback johnny man...,0
19231,pieczenik there is a coup in the white house...,1
16359,previous why are russia and china buying up al...,1
15848,it just goes to show you that you can t rely o...,1
2583,united nations holding photographs of dea...,0


In [16]:
# Sanity check: the cleaned texts should be plain Python strings.
# .iloc[0] picks the first row by position, so the check does not depend on
# a row labelled 0 having survived dropna().
print(type(zbior_wiadomosci['text'].iloc[0]))

<class 'str'>


In [17]:
# zbior_wiadomosci["text"] = zbior_wiadomosci["text"].apply(stemming)
# Running the line above takes quite a while,
# so for the presentation we load a CSV file that was saved after stemming.
# zbior_wiadomosci.to_csv('train_clean.csv')

In [18]:
# Load the stemmed texts from the CSV cache. If the cache file is missing
# (e.g. on a fresh checkout), run the slow stemming step once and create it.
try:
    zbior_wiadomosci = pd.read_csv("train_clean.csv", index_col=0)
except FileNotFoundError:
    zbior_wiadomosci["text"] = zbior_wiadomosci["text"].apply(stemming)
    zbior_wiadomosci.to_csv("train_clean.csv")
# Texts that became empty after stemming are read back from CSV as NaN.
# Without fillna(""), clean() would turn them into the literal word "nan",
# which would then become a spurious feature.
zbior_wiadomosci["text"] = zbior_wiadomosci["text"].fillna("").apply(clean)

In [19]:
zbior_wiadomosci.head()

Unnamed: 0,text,label
0,hous dem aid even see comey letter jason chaff...,1
1,ever get feel life circl roundabout rather hea...,0
2,truth might get fire octob tension intellig an...,1
3,video civilian kill singl us airstrik identifi...,1
4,print iranian woman sentenc six year prison ir...,1


#### Rozdzielamy zbiór na teksty (x) i etykiety (y)

In [20]:

# x holds the article texts (model input), y holds the labels to predict.
x = zbior_wiadomosci["text"]
y = zbior_wiadomosci["label"]

#### Dzielimy zbiór danych na zbiór testowy i treningowy

In [21]:
# Hold out 25% of the rows for testing. The fixed random_state keeps the split,
# and therefore every score computed from it, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

#### Konwertujemy tekst na wartości wektorowe

In [22]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
vector = TfidfVectorizer()
xv_train = vector.fit_transform(x_train)
xv_test = vector.transform(x_test)

## Tworzenie Modeli - Skorzystamy z 4 różnych algorytmów

### 1. Logistic Regression - Regresja Logistyczna

In [23]:
# Train a logistic regression model (default settings) on the TF-IDF training matrix.
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [24]:
# Predict labels for the test set with logistic regression.
predykcja_lr=LR.predict(xv_test)

In [25]:
# Score of the logistic regression model on the test set.
LR.score(xv_test, y_test)

0.9470236948564824

In [26]:
# Per-class precision, recall and F1 for logistic regression on the test set.
print(classification_report(y_test, predykcja_lr))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      2617
           1       0.94      0.95      0.95      2574

    accuracy                           0.95      5191
   macro avg       0.95      0.95      0.95      5191
weighted avg       0.95      0.95      0.95      5191



### 2. Decision Tree Classification - Drzewa Decyzyjne

In [27]:
# Train a decision tree on the TF-IDF training matrix.
# random_state=0 matches the gradient boosting and random forest models in
# this notebook and makes the fitted tree reproducible between runs.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [28]:
# Predict labels for the test set with the decision tree.
predykcja_dt = DT.predict(xv_test)

In [29]:
# Score of the decision tree on the test set.
DT.score(xv_test, y_test)

0.8838374109034868

In [30]:
# Per-class precision, recall and F1 for the decision tree on the test set.
print(classification_report(y_test, predykcja_dt))

              precision    recall  f1-score   support

           0       0.88      0.89      0.89      2617
           1       0.89      0.88      0.88      2574

    accuracy                           0.88      5191
   macro avg       0.88      0.88      0.88      5191
weighted avg       0.88      0.88      0.88      5191



### 3. Gradient Boosting Classifier - Drzewa Decyzyjne wzmocnione Gradientem

In [31]:
# Train a gradient boosting classifier (seeded with random_state=0) on the TF-IDF training matrix.
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

In [32]:
# Predict labels for the test set with gradient boosting.
predykcja_gbc = GBC.predict(xv_test)

In [33]:
# Score of the gradient boosting model on the test set.
GBC.score(xv_test, y_test)

0.9285301483336544

In [34]:
# Per-class precision, recall and F1 for gradient boosting on the test set.
print(classification_report(y_test, predykcja_gbc))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      2617
           1       0.92      0.94      0.93      2574

    accuracy                           0.93      5191
   macro avg       0.93      0.93      0.93      5191
weighted avg       0.93      0.93      0.93      5191



### 4. Random Forest Classifier - Lasy Losowe

In [35]:
# Train a random forest (seeded with random_state=0) on the TF-IDF training matrix.
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [36]:
# Predict labels for the test set with the random forest.
predykcja_rfc = RFC.predict(xv_test)

In [37]:
# Score of the random forest on the test set.
RFC.score(xv_test, y_test)

0.9192833750722404

In [38]:
# Per-class precision, recall and F1 for the random forest on the test set.
print(classification_report(y_test, predykcja_rfc))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92      2617
           1       0.94      0.89      0.92      2574

    accuracy                           0.92      5191
   macro avg       0.92      0.92      0.92      5191
weighted avg       0.92      0.92      0.92      5191



# Ręczne testowanie modeli


In [39]:
def output(n):
    """Translate a predicted label into its display text.

    Returns "Fałszywe Wiadomości" (fake news) when `n` equals 1,
    "Prawdziwe Wiadomości" (real news) when `n` equals 0, and None
    for any other value.
    """
    for label, caption in ((1, "Fałszywe Wiadomości"), (0, "Prawdziwe Wiadomości")):
        if n == label:
            return caption
    return None
    
def manual_testing(news):
    """Classify one raw article text with all four trained models and print the verdicts.

    The text is passed through clean() and stemming(), vectorized with the
    already-fitted `vector`, and then given to LR, DT, GBC and RFC. The
    preprocessed text and one prediction line per model are printed.
    The function returns None.
    """
    frame = pd.DataFrame({"text": [news]})
    frame["text"] = frame["text"].apply(clean).apply(stemming)
    print(frame["text"])
    features = vector.transform(frame["text"])

    # Collect the predictions in the same order as the placeholders of the template.
    labels = [model.predict(features)[0] for model in (LR, DT, GBC, RFC)]
    template = "\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction: {}"
    print(template.format(*(output(label) for label in labels)))
    return None

In [44]:
# Show the true label of the row whose index label is 4.
print(zbior_wiadomosci.loc[4, 'label'])

1


In [45]:
# Show the preprocessed text of the row whose index label is 4.
print(zbior_wiadomosci.loc[4, 'text'])

print iranian woman sentenc six year prison iran revolutionari guard search home found notebook contain fiction stori written woman stone death accord eurasia review golrokh ebrahimi irae wife polit prison arash sadeghi serv year prison sentenc human right activist public report intellig unit revolutionari guard came arrest husband raid apart without warrant found draft stori ebrahimi irae written articl state one confisc draft stori stone women death adulteri never publish never present anyon articl state narr follow stori protagonist watch movi stone women islam law adulteri


In [46]:
# Read one article from standard input and classify it with every trained model.
news = input()
manual_testing(news)



print iranian woman sentenc six year prison iran revolutionari guard search home found notebook contain fiction stori written woman stone death accord eurasia review golrokh ebrahimi irae wife polit prison arash sadeghi serv year prison sentenc human right activist public report intellig unit revolutionari guard came arrest husband raid apart without warrant found draft stori ebrahimi irae written articl state one confisc draft stori stone women death adulteri never publish never present anyon articl state narr follow stori protagonist watch movi stone women islam law adulteri
0    print iranian woman sentenc six year prison ir...
Name: text, dtype: object


LR Prediction: Fałszywe Wiadomości 
DT Prediction: Fałszywe Wiadomości 
GBC Prediction: Fałszywe Wiadomości 
RFC Prediction: Fałszywe Wiadomości
