In [17]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

all_data = pd.read_csv('data/train.csv')
data = all_data.dropna()

x_train, x_test, y_train, y_test = train_test_split(data[['text', 'title']], data['label'], test_size=0.2, random_state=38923)

In [7]:
text_tokenizer = TfidfVectorizer(stop_words='english')
title_tokenizer = TfidfVectorizer(stop_words='english')

x = [text_tokenizer.fit_transform(x_train['text']), title_tokenizer.fit_transform(x_train['title'])]
x = hstack(x)

svd_model = TruncatedSVD(1000)
x = svd_model.fit_transform(x)

scl = StandardScaler()
x = scl.fit_transform(x)

In [18]:
model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, C=0.05, n_jobs=-1)
model.fit(x, y_train)



LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.5, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='elasticnet',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
x_test = [text_tokenizer.transform(x_test['text']), title_tokenizer.transform(x_test['title'])]
x_test = hstack(x_test)

x_test = svd_model.transform(x_test)

x_test = scl.transform(x_test)

y_pred = model.predict(x_test)

In [20]:
print(np.sum(y_pred == y_test) / y_pred.shape[0])

0.9773038009297238


In [21]:
roc_auc_score(y_test, y_pred)

0.9773984514186825

In [22]:
print(np.sum(model.coef_ != 0))

661


In [23]:
true = pd.read_csv('data/True.csv')
true['label'] = 0
false = pd.read_csv('data/Fake.csv')
false['label'] = 1
data = pd.concat([true, false], axis=0)
data['text_length'] = data['text'].str.split().str.len()
data = data[data['text_length'] >= 30]
data = data.dropna()

In [24]:
x_test = [text_tokenizer.transform(data['text']), title_tokenizer.transform(data['title'])]
x_test = hstack(x_test)

x_test = svd_model.transform(x_test)

x_test = scl.transform(x_test)

y_pred = model.predict(x_test)

y_test = data['label']

In [25]:
print(np.sum(y_pred == y_test) / y_pred.shape[0])

0.700440122589119


In [26]:
roc_auc_score(y_test, y_pred)

0.6986425143993079

In [27]:
x_train, x_test, y_train, y_test = train_test_split(data[['text', 'title']], data['label'], test_size=0.2, random_state=38923)

In [28]:
text_tokenizer = TfidfVectorizer(stop_words='english')
title_tokenizer = TfidfVectorizer(stop_words='english')

x = [text_tokenizer.fit_transform(x_train['text']), title_tokenizer.fit_transform(x_train['title'])]
x = hstack(x)

svd_model = TruncatedSVD(1000)
x = svd_model.fit_transform(x)

scl = StandardScaler()
x = scl.fit_transform(x)

In [29]:
model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, C=0.05, n_jobs=-1)
model.fit(x, y_train)

LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.5, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='elasticnet',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
x_test = [text_tokenizer.transform(x_test['text']), title_tokenizer.transform(x_test['title'])]
x_test = hstack(x_test)

x_test = svd_model.transform(x_test)

x_test = scl.transform(x_test)

y_pred = model.predict(x_test)

In [31]:
print(np.sum(y_pred == y_test) / y_pred.shape[0])

0.9912442396313365


In [32]:
roc_auc_score(y_test, y_pred)

0.9912592028343272