In [2]:
import re
import nltk

import requests
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, plot_roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

from time import sleep

### Modeling

In [17]:
df = pd.read_csv('./datasets/combined_data.csv')
df = df.drop(columns = 'Unnamed: 0')
df['label'] = df['label'].map({'true' : 0,'false' : 1,'misleading' : 1})
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [21]:
estimator = []
estimator.append(('gs_bag', BaggingClassifier()))
estimator.append(('mnnb', MultinomialNB(alpha = 0.1)))
estimator.append(('lrcv', LogisticRegressionCV(penalty = 'l2', solver = 'liblinear')))
cvec = CountVectorizer(ngram_range = (1,1), stop_words= None)

Z_train = cvec.fit_transform(X_train)
Z_test = cvec.transform(X_test)

In [28]:
vote = VotingClassifier(estimators = estimator, voting = 'hard')

In [29]:
vote.fit(Z_train, y_train)

VotingClassifier(estimators=[('gs_bag', BaggingClassifier()),
                             ('mnnb', MultinomialNB(alpha=0.1)),
                             ('lrcv',
                              LogisticRegressionCV(solver='liblinear'))])

In [30]:
vote.score(Z_train, y_train), vote.score(Z_test, y_test)

(0.9948153768335862, 0.91350531107739)