In [2]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize as nltk_word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm

In [4]:
df = pd.read_csv('../../prc_1.csv')
df = df.rename(columns={'text_type':'target'})
df.target = df.target.astype('category')
df.target = df.target.cat.codes

In [5]:
df.head()

Unnamed: 0,target,text
0,1,naturally irresistible your corporate identity...
1,1,the stock trading gunslinger fanny is merrill ...
2,1,unbelievable new homes made easy im wanting to...
3,1,4 color printing special request additional in...
4,1,do not have money get software cds from here s...


TF-IDF + Naive Bias

In [None]:
stop_words = set(stopwords.words('english'))

In [9]:
def to_lower(text):
    return text.lower()
def remove_punctuations(text):
    return re.sub(r'[^\w\s]', '', text)
def to_tokenize(text):
    return nltk_word_tokenize(text)
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

In [10]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def lemmatize(tokens):
    return [morph.parse(word)[0].normal_form for word in tokens]

In [11]:
def text_processing(text):
    text = to_lower(text)
    text = remove_punctuations(text)
    tokens = to_tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize(tokens)
    return ' '.join(tokens)

In [12]:
text = list(df.text)
processed_text = []

for t in tqdm(text):
    processed_text.append(text_processing(t))    

100%|██████████| 20348/20348 [00:06<00:00, 3143.05it/s]


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_text)

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [16]:
X_train,X_test,y_train,y_test = train_test_split(tfidf_df,df.target,test_size=0.3)
nb = MultinomialNB()
nb.fit(X_train,y_train)

In [18]:
lr = LogisticRegression(penalty='l2')
lr.fit(X_train,y_train)

In [19]:
roc_auc_score(y_test,nb.predict(X_test)),accuracy_score(y_test,nb.predict(X_test))

(0.7953024832476084, 0.8705978705978706)

In [20]:
roc_auc_score(y_test,lr.predict(X_test)),accuracy_score(y_test,lr.predict(X_test))

(0.8701227214243595, 0.9146601146601147)