# Notebook Exploring Logistic Regression + Vectorizer alternatives

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the labelled training comments and preview the first five rows.
# The preview shows an `id`, the raw `comment_text`, and six 0/1 label columns.
data = pd.read_csv(filepath_or_buffer='data/train.csv')
data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Features are the raw comment strings; the target is the `toxic` label column only.
X, y = data['comment_text'], data['toxic']

### First Approach

* 1) Count how many times each word occurs in every comment (bag-of-words)
* 2) Train Logistic Regression on these counts only

In [4]:
# Hold out 35% of the comments for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=2019,
)

In [5]:
# Learn the vocabulary from the training comments only, then reuse it for the test
# comments so no test-set vocabulary leaks into the features.
# NOTE: this replaces the raw-text X_train / X_test with vectorized matrices, so the
# split cell must be re-run before this cell can be executed again.
cv = CountVectorizer()
X_train, X_test = cv.fit_transform(X_train), cv.transform(X_test)

In [6]:
# SAGA is a stochastic solver: without a fixed random_state the fitted coefficients
# (and therefore the scores below) can change from run to run. Reuse the seed of the
# train/test split so the whole experiment is reproducible.
# NOTE(review): SAGA may not converge within the default iteration budget on
# unscaled count features — check for a ConvergenceWarning and raise max_iter if needed.
lr = LogisticRegression(solver='saga', random_state=2019)
lr.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on vs. the held-out comments.
print(f'Training Accuracy: {lr.score(X_train, y_train)}')
print(f'Testing Accuracy: {lr.score(X_test, y_test)}')

Training Accuracy: 0.9187821174111318
Testing Accuracy: 0.9178692927484333




### Second Approach

* 1) Remove English stopwords
* 2) Count how many times each remaining word occurs in every comment
* 3) Train Logistic Regression (LR) on these counts

In [7]:
# Re-create the raw-text split: the vectorizer cell of the previous approach replaced
# X_train / X_test with vectorized matrices. Same test size and seed, so the rows in
# each partition are identical to the first approach.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=2019,
)

In [8]:
# Same bag-of-words counts as before, but with scikit-learn's built-in English
# stop-word list removed. The vocabulary is learned from the training comments only.
# NOTE: this replaces the raw-text X_train / X_test with vectorized matrices, so the
# split cell must be re-run before this cell can be executed again.
cv = CountVectorizer(stop_words='english')
X_train, X_test = cv.fit_transform(X_train), cv.transform(X_test)

In [9]:
# SAGA is a stochastic solver: without a fixed random_state the fitted coefficients
# (and therefore the scores below) can change from run to run. Reuse the seed of the
# train/test split so the comparison between approaches is reproducible.
# NOTE(review): SAGA may not converge within the default iteration budget on
# unscaled count features — check for a ConvergenceWarning and raise max_iter if needed.
lr = LogisticRegression(solver='saga', random_state=2019)
lr.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on vs. the held-out comments.
print(f'Training Accuracy: {lr.score(X_train, y_train)}')
print(f'Testing Accuracy: {lr.score(X_test, y_test)}')

Training Accuracy: 0.9201511747862053
Testing Accuracy: 0.918531781557744




### Third Approach

* 1) Remove English stopwords
* 2) Count how many times each remaining word occurs in every comment
* 3) Re-weight the per-comment word counts with TF-IDF
* 4) Train Logistic Regression (LR) on the TF-IDF features

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Re-create the raw-text split (same test size and seed as the earlier approaches),
# because the previous vectorizer cell replaced X_train / X_test with matrices.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=2019,
)

# TfidfVectorizer drops English stop words, counts the remaining words and applies
# TF-IDF weighting in one step. It is fitted on the training comments only.
tf = TfidfVectorizer(stop_words='english')
X_train, X_test = tf.fit_transform(X_train), tf.transform(X_test)

In [12]:
# SAGA is a stochastic solver: without a fixed random_state the fitted coefficients
# (and therefore the scores below) can change from run to run. Reuse the seed of the
# train/test split so the comparison between approaches is reproducible.
lr = LogisticRegression(solver='saga', random_state=2019)
lr.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on vs. the held-out comments.
print(f'Training Accuracy: {lr.score(X_train, y_train)}')
print(f'Testing Accuracy: {lr.score(X_test, y_test)}')

Training Accuracy: 0.9592753637161231
Testing Accuracy: 0.9544136078782453
