In [84]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [117]:
df = pd.read_csv('scraped_data.csv')

In [118]:
df.isna().sum()

title        0
subreddit    0
class        0
dtype: int64

In [119]:
X = df['title']
y = df['class']

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [121]:
cvec = CountVectorizer(stop_words='english', ngram_range=(2, 6))
X_train = cvec.fit_transform(X_train)

In [122]:
X_train.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [123]:
X_test = cvec.transform(X_test)

In [124]:
model_lr = LogisticRegression()
print(cross_val_score(model_lr, X_train, y_train).mean())
model_lr.fit(X_train, y_train)
model_lr.predict(X_test)

0.9464189553042542


array([0, 0, 1, ..., 1, 0, 1])

In [125]:
model_lr.score(X_test, y_test)

0.9862681744749596

In [126]:
coefs = model_lr.coef_

In [127]:
names_1 = cvec.get_feature_names()

In [128]:
coefs[0]

array([-0.05717381, -0.05717381, -0.05717381, ..., -0.06492242,
       -0.06492242, -0.06492242])

In [129]:
names = pd.DataFrame({'coefs':coefs[0], 'names':names_1})

In [130]:
names

Unnamed: 0,coefs,names
0,-0.057174,000 bottles
1,-0.057174,000 bottles water
2,-0.057174,000 bottles water 20
3,-0.057174,000 bottles water 20 000
4,-0.057174,000 bottles water 20 000 hats
5,-0.057174,000 hats
6,-0.057174,000 hats tomorrow
7,-0.057174,000 hats tomorrow valencia
8,-0.057174,000 hats tomorrow valencia derby
9,-0.057174,000 hats tomorrow valencia derby kicks


In [131]:
names.sort_values('coefs').head()

Unnamed: 0,coefs,names
30188,-2.084249,match thread
38043,-1.796202,premier league
9192,-1.299683,champions league
5334,-1.233187,barcelona huesca
32367,-1.2314,nacional benfica


In [132]:
names.sort_values('coefs').tail()

Unnamed: 0,coefs,names
2361,0.868356,aaron donald
37940,0.904215,practice squad
42606,1.262634,round pick
46562,1.383417,source said
25387,1.781302,khalil mack
