In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import joblib

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords


import thor

In [2]:
# Read the two r/news scrapes; the saved 'Unnamed: 0' column becomes the index.
news1, news2 = (
    pd.read_csv(csv_path, index_col='Unnamed: 0')
    for csv_path in ('../data/news10k.csv', '../data/news100k_iter1.csv')
)

# Same for the two r/TheOnion scrapes.
onion1, onion2 = (
    pd.read_csv(csv_path, index_col='Unnamed: 0')
    for csv_path in ('../data/onion10k.csv', '../data/onion100k_iter1.csv')
)

In [3]:
# Stack the small and large scrape of each source. Only 'created_utc' and
# 'title' are taken from the 10k frames before stacking.
news = pd.concat(
    (news1.loc[:, ['created_utc', 'title']], news2),
    axis=0,
)

onion = pd.concat(
    (onion1.loc[:, ['created_utc', 'title']], onion2),
    axis=0,
)

In [4]:
# Target label: 0 for news rows, 1 for onion rows.
# ('class' is a Python keyword, hence the dict-unpacking form of assign.)
news = news.assign(**{'class': 0})
onion = onion.assign(**{'class': 1})

In [5]:
# (rows, columns) of each labelled source frame, shown as a pair.
tuple(frame.shape for frame in (news, onion))

((110000, 3), (17307, 3))

In [6]:
# Combine both labelled sources into one frame (news rows first, then onion).
total = pd.concat((news, onion), axis=0)

In [7]:
# (rows, columns) of the combined frame right after concatenation.
total.shape

(127307, 3)

In [9]:
# Lower-case every title so the duplicate check that follows is
# case-insensitive.
total = total.assign(title=total['title'].str.lower())

In [10]:
# Keep only the first occurrence of each title. Reassigning instead of using
# inplace=True leaves the same frame bound to `total`.
total = total.drop_duplicates(subset='title')

In [11]:
# (rows, columns) after lower-casing and dropping duplicate titles.
total.shape

(117216, 3)

In [12]:
# Keep only rows whose title is a pure-ASCII string.
# The isinstance guard filters out a missing title (a float NaN) instead of
# raising AttributeError on `.isascii()`. astype(bool) guarantees a boolean
# mask even when the mapped Series would otherwise be object-typed (e.g. empty).
total = total[
    total['title']
    .map(lambda x: isinstance(x, str) and x.isascii())
    .astype(bool)
]

In [13]:
# (rows, columns) of the class-1 (onion) subset left after cleaning.
total.loc[total['class'].eq(1)].shape

(11318, 3)

In [14]:
# Shuffle all rows and replace the old index with a fresh 0..n-1 range.
# The shuffle is seeded so a Restart & Run All reproduces the same row order.
# Without a seed, the saved CSVs and the train/test membership change on every
# run, even though train_test_split below uses random_state=42.
total = total.sample(frac=1, random_state=42).reset_index(drop=True)

In [15]:
# Features are every column except the label; the label is the 'class' column.
y = total['class']
X = total.drop('class', axis=1)

# Hold out 10% of rows for testing, stratified on the label so both splits
# keep the same class ratio; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

In [16]:
# Put the label column back next to its features for each split, so each
# split is a single frame.
total_train = pd.concat((X_train, y_train), axis='columns')
total_test = pd.concat((X_test, y_test), axis='columns')

In [17]:
# Write the full cleaned frame and both splits to disk, without the index.
total.to_csv(path_or_buf='../data/total.csv', index=False)
total_train.to_csv(path_or_buf='../data/total_train.csv', index=False)
total_test.to_csv(path_or_buf='../data/total_test.csv', index=False)

In [18]:
# Preview the first rows of the training split (features plus 'class' label).
total_train.head()

Unnamed: 0,created_utc,title,class
52512,1638807805,sheriff's office removes photo of santa fillin...,0
83221,1636918297,this library lets you borrow people instead of...,0
48443,1647961430,russian elements have infiltrated every elemen...,0
63446,1511181800,only 4 drivers left alive go into final nascar...,1
72432,1646768574,what happens in ukraine since 2015,0
