In [18]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
np.random.seed(1234)

In [2]:
data = pd.read_csv('WELFake_Dataset.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [3]:
data=data.iloc[: , 1:]

In [4]:
data.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [5]:
data.describe()

Unnamed: 0,label
count,72134.0
mean,0.514404
std,0.499796
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [6]:
data.shape

(72134, 3)

In [7]:
data.isnull()

Unnamed: 0,title,text,label
0,False,False,False
1,True,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
72129,False,False,False
72130,False,False,False
72131,False,False,False
72132,False,False,False


In [8]:
data = data.fillna(' ')

In [9]:
data.isnull().sum()

title    0
text     0
label    0
dtype: int64

In [10]:
data["content"] = data["title"] + data["text"]

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(data["content"], data['label'], test_size=0.25, random_state=1234)

In [12]:
# Convert text data to CountVector features
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.7)
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)


In [13]:
## RandomForest Classifier
rf_classifier  = RandomForestClassifier(n_estimators=100,random_state=123)
#print(count_train.shape) 
#print(Y_train)

rf_classifier.fit(count_train, Y_train)
y_pred_train = rf_classifier.predict(count_train)
accuracy_train = accuracy_score(Y_train, y_pred_train)
print("Train Accuracy:", accuracy_train)

y_pred = rf_classifier.predict(count_test)
accuracy = accuracy_score(Y_test, y_pred)
print("Test Accuracy:", accuracy)

precision = precision_score(Y_test, y_pred)
print("Test precision:", precision)

recall = recall_score(Y_test, y_pred)
print("Test recall:", recall)

f1 = f1_score(Y_test, y_pred)
print("Test f1:", f1)

confusion=confusion_matrix(Y_test, y_pred)
print("Test confusion matrix:",confusion)

report = classification_report(Y_test, y_pred)
print("Test Report:", report)

(54100, 236389)
10862    0
32643    1
15858    1
28856    1
2691     1
        ..
55985    1
32399    0
60620    1
34086    0
58067    0
Name: label, Length: 54100, dtype: int64
Train Accuracy: 1.0
Test Accuracy: 0.9424975047133193
Test precision: 0.9388941748599218
Test recall: 0.9508565310492505
Test f1: 0.9448374913559232
Test confusion matrix: [[8116  578]
 [ 459 8881]]
Test Report:               precision    recall  f1-score   support

           0       0.95      0.93      0.94      8694
           1       0.94      0.95      0.94      9340

    accuracy                           0.94     18034
   macro avg       0.94      0.94      0.94     18034
weighted avg       0.94      0.94      0.94     18034



In [21]:
##Linear SVC
svm_classifier = LinearSVC()
svm_classifier.fit(count_train, Y_train)

y_pred_train = svm_classifier.predict(count_train)
accuracy_train = accuracy_score(Y_train, y_pred_train)
print("Train Accuracy:", accuracy_train)

y_pred_SVC = rf_classifier.predict(count_test)
accuracy = accuracy_score(Y_test, y_pred_SVC)
print("Test Accuracy:", accuracy)

precision = precision_score(Y_test, y_pred_SVC)
print("Test precision:", precision)

recall = recall_score(Y_test, y_pred_SVC)
print("Test recall:", recall)

f1 = f1_score(Y_test, y_pred_SVC)
print("Test f1:", f1)

confusion=confusion_matrix(Y_test, y_pred_SVC)
print("Test confusion matrix:",confusion)

report = classification_report(Y_test, y_pred_SVC)
print("Test Report:", report)



Train Accuracy: 1.0
Test Accuracy: 0.9424975047133193
Test precision: 0.9388941748599218
Test recall: 0.9508565310492505
Test f1: 0.9448374913559232
Test confusion matrix: [[8116  578]
 [ 459 8881]]
Test Report:               precision    recall  f1-score   support

           0       0.95      0.93      0.94      8694
           1       0.94      0.95      0.94      9340

    accuracy                           0.94     18034
   macro avg       0.94      0.94      0.94     18034
weighted avg       0.94      0.94      0.94     18034



In [19]:
##LogisticRegression
logistic_classifier = LogisticRegression()
logistic_classifier.fit(count_train, Y_train)

y_pred_train = logistic_classifier.predict(count_train)
accuracy_train = accuracy_score(Y_train, y_pred_train)
print("Train Accuracy:", accuracy_train)

y_pred = rf_classifier.predict(count_test)
accuracy = accuracy_score(Y_test, y_pred)
print("Test Accuracy:", accuracy)

precision = precision_score(Y_test, y_pred)
print("Test precision:", precision)

recall = recall_score(Y_test, y_pred)
print("Test recall:", recall)

f1 = f1_score(Y_test, y_pred)
print("Test f1:", f1)

confusion=confusion_matrix(Y_test, y_pred)
print("Test confusion matrix:",confusion)

report = classification_report(Y_test, y_pred)
print("Test Report:", report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy: 0.9995748613678374
Test Accuracy: 0.9424975047133193
Test precision: 0.9388941748599218
Test recall: 0.9508565310492505
Test f1: 0.9448374913559232
Test confusion matrix: [[8116  578]
 [ 459 8881]]
Test Report:               precision    recall  f1-score   support

           0       0.95      0.93      0.94      8694
           1       0.94      0.95      0.94      9340

    accuracy                           0.94     18034
   macro avg       0.94      0.94      0.94     18034
weighted avg       0.94      0.94      0.94     18034

