In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

In [2]:
# Load the posts (text + subreddit label) from the CSV hosted on GitHub
DATA_URL = 'https://raw.githubusercontent.com/hys89/nlp-data-learningfri/main/horror&romance.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first rows to check which columns were loaded
df.head()

Unnamed: 0,selftext,subreddit
0,1. 500 Word Limit. All stories must be 500 wor...,shortscarystories
1,I was sitting on the soft snow with a blanket ...,shortscarystories
2,But the hatred that fills the room whenever he...,shortscarystories
3,I woke up hungry this morning. I've been sick ...,shortscarystories
4,I had seen school shootings on television in t...,shortscarystories


# Mapping Subreddit
(shortscarystories -> 1, romance -> 0)

In [4]:
# Encode the target: shortscarystories -> 1, romance -> 0.
# Only map while the column still holds the raw subreddit names: re-running
# .map() on already-encoded values would match nothing and turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['subreddit']):
    df['subreddit'] = df['subreddit'].map({'shortscarystories':1, 'romance':0})
df.head()

Unnamed: 0,selftext,subreddit
0,1. 500 Word Limit. All stories must be 500 wor...,1
1,I was sitting on the soft snow with a blanket ...,1
2,But the hatred that fills the room whenever he...,1
3,I woke up hungry this morning. I've been sick ...,1
4,I had seen school shootings on television in t...,1


# Train Test Split

In [5]:
# Features: the post text; target: the encoded subreddit label
X, y = df['selftext'], df['subreddit']

In [6]:

# Hold out 25% of the rows for testing. Stratify on y so both splits keep the
# same class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=7,
)

In [7]:
# Check stratification: share of each class (in %) within the training labels
class_share_pct = y_train.value_counts(normalize=True).mul(100)
print(class_share_pct)

1    53.319058
0    46.680942
Name: subreddit, dtype: float64


# Training Process

For purposes of demonstration, we will do the following:
1. Implement basic text cleaning + tokenisation
2. Lemmatising word tokens
3. Build word count vectoriser as feature inputs into subsequent model
4. Implement classification using logistic regression
5. Evaluate model

#### 1. Implement basic text cleaning + tokenisation
#### 2. Lemmatising word tokens

In [8]:
def preprocessor(text):
    """Clean, tokenise and lemmatise a single document.

    Parameters
    ----------
    text : str
        Raw document text. Inside this function the name shadows the
        ``text`` module imported from ``sklearn.feature_extraction``.

    Returns
    -------
    list of str
        Lower-cased alphanumeric tokens, each passed through
        ``WordNetLemmatizer.lemmatize``. Empty when the input contains no
        alphanumeric characters.
    """
    # Replace every non-alphanumeric character with a space, then lower-case
    # and split on whitespace to get the tokens.
    words = re.sub(r"[^A-Za-z0-9]", " ", text).lower().split()
    if not words:
        return []  # nothing to lemmatise
    lemmatizer = WordNetLemmatizer()  # build once rather than once per token
    return [lemmatizer.lemmatize(word) for word in words]

In [9]:
# English stop-word collection shipped with scikit-learn
# (ENGLISH_STOP_WORDS from the sklearn.feature_extraction.text module)
stop_words = text.ENGLISH_STOP_WORDS

#### 3. Build word count vectoriser as feature inputs into subsequent model

In [10]:
# Create instance of vectoriser: unigram + bigram counts with English stop words removed.
# CountVectorizer documents stop_words as 'english', a list, or None, and recent
# scikit-learn releases reject other containers, so pass it a (sorted) list.
# NOTE: no tokenizer is passed, so the vectoriser's default tokenisation is used
# and the preprocessor() helper defined earlier is not applied here.
cvec = CountVectorizer(ngram_range=(1,2), stop_words=sorted(stop_words))

In [11]:
# Learn the vocabulary from the training text only, then reuse it on the test
# text so nothing from the test split influences the features.
# NOTE: X_train / X_test are rebound from raw text to the vectoriser output here,
# so re-run the train/test split cell before re-running this one.
X_train = cvec.fit_transform(X_train)
X_test = cvec.transform(X_test)

#### 4. Implement classification using logistic regression

In [12]:
# Logistic regression classifier created with its default settings
lr = LogisticRegression()

In [13]:
# Fit the classifier on the vectorised training text and its labels
lr.fit(X_train, y_train)

LogisticRegression()

#### 5. Evaluate model

In [14]:
# Test accuracy: score the fitted model on the held-out split
lr.score(X_test, y_test)

0.9252136752136753

In [15]:
# Per-class precision / recall / F1 on the held-out test split
# (label 0 -> Romance, label 1 -> ShortScaryStories)
y_pred = lr.predict(X_test)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Romance', 'ShortScaryStories'],
)
print(report)

                   precision    recall  f1-score   support

          Romance       0.90      0.95      0.92       219
ShortScaryStories       0.95      0.90      0.93       249

         accuracy                           0.93       468
        macro avg       0.92      0.93      0.93       468
     weighted avg       0.93      0.93      0.93       468

