# Subreddit Classification using NLP

## 3. Preprocessing Data

### Imports

In [32]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,BaggingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score

# Naive Bayes classifiers (Gaussian, Multinomial and Bernoulli variants)
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

from nltk.classify import NaiveBayesClassifier
from nltk import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer


import seaborn as sns
import matplotlib.pyplot as plt


In [33]:
# Read the cleaned dataset from CSV into `df`.
df = pd.read_csv('./datasets/clean_data_3.csv')

# Preprocessing

## Tokenizing

In [34]:
# Tokens are runs of word characters (letters, digits, underscore); punctuation
# and whitespace are dropped. gaps=False means the pattern matches the tokens
# themselves rather than the separators between them.
tokenizer = RegexpTokenizer(r'\w+', gaps=False)

In [35]:
# Preview the first rows of the loaded data before tokenizing.
df.head()

Unnamed: 0,title,Target,clean_title
0,Deadly Virus Sweeping China Is Just Olympic Fever,1,deadly virus sweeping china olympic fever
1,Southern Governors Argue Covid-19 Good Christi...,1,southern governors argue covid 19 good christi...
2,"‘They’re Doing Something To The Street,’ Repor...",1,something street reports nation staring window
3,I miss the old onion.,1,miss old onion
4,Animals Spread Disease Constantly - Horrifying...,1,animals spread disease constantly horrifying p...


In [36]:
# The tokenizer raises on non-string values (presumably NaN from titles that
# were empty after cleaning -- confirm against the raw CSV). Fill missing values
# with an empty string before casting: a bare str() conversion would turn NaN
# into the literal text 'nan' and inject a bogus token into those rows.
df['clean_title'] = df['clean_title'].fillna('').astype(str)

In [37]:
# Check column dtypes and non-null counts after the string conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1807 entries, 0 to 1806
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1807 non-null   object
 1   Target       1807 non-null   int64 
 2   clean_title  1807 non-null   object
dtypes: int64(1), object(2)
memory usage: 42.5+ KB


In [38]:
# Split every cleaned title into a list of word tokens.
df['tokens'] = df['clean_title'].apply(tokenizer.tokenize)

In [39]:
# Confirm the new `tokens` column holds a list of words for each title.
df.head()

Unnamed: 0,title,Target,clean_title,tokens
0,Deadly Virus Sweeping China Is Just Olympic Fever,1,deadly virus sweeping china olympic fever,"[deadly, virus, sweeping, china, olympic, fever]"
1,Southern Governors Argue Covid-19 Good Christi...,1,southern governors argue covid 19 good christi...,"[southern, governors, argue, covid, 19, good, ..."
2,"‘They’re Doing Something To The Street,’ Repor...",1,something street reports nation staring window,"[something, street, reports, nation, staring, ..."
3,I miss the old onion.,1,miss old onion,"[miss, old, onion]"
4,Animals Spread Disease Constantly - Horrifying...,1,animals spread disease constantly horrifying p...,"[animals, spread, disease, constantly, horrify..."


## Lemmatizing and Stemming

In [40]:
# Function to lemmatize and stem

def lem_stem(x):
    """Lemmatize, then Porter-stem, each token and join the results.

    Parameters
    ----------
    x : iterable of str
        Tokens to normalise.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    porter = PorterStemmer()

    processed = []
    for token in x:
        # Lowercase and lemmatize first, then stem the resulting lemma.
        lemma = wordnet.lemmatize(token.lower())
        processed.append(porter.stem(lemma))

    return ' '.join(processed)

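Porter stemming strips suffixes to leave a word stem, which is not always a real word (e.g. "deadly" becomes "deadli").
<br>
Lemmatization reduces a word to its dictionary base form (lemma), which is a real word in the language. We apply both (lemmatizing first, then stemming) since we are not interested in context here.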

In [41]:
# Lemmatize and stem each token list into a single space-joined string.
# The helper defined above is `lem_stem`; the previous call used the undefined
# name `stem_lem`, which raises NameError on a fresh kernel.
df['lemstem_text'] = df['tokens'].map(lem_stem)

In [42]:
# Inspect the `lemstem_text` column next to the original columns.
df.head()

Unnamed: 0,title,Target,clean_title,tokens,lemstem_text
0,Deadly Virus Sweeping China Is Just Olympic Fever,1,deadly virus sweeping china olympic fever,"[deadly, virus, sweeping, china, olympic, fever]",deadli viru sweep china olymp fever
1,Southern Governors Argue Covid-19 Good Christi...,1,southern governors argue covid 19 good christi...,"[southern, governors, argue, covid, 19, good, ...",southern governor argu covid 19 good christian...
2,"‘They’re Doing Something To The Street,’ Repor...",1,something street reports nation staring window,"[something, street, reports, nation, staring, ...",someth street report nation stare window
3,I miss the old onion.,1,miss old onion,"[miss, old, onion]",miss old onion
4,Animals Spread Disease Constantly - Horrifying...,1,animals spread disease constantly horrifying p...,"[animals, spread, disease, constantly, horrify...",anim spread diseas constantli horrifi planet ep 7


## Save

In [44]:
# Save the frame to CSV; index=False leaves the row index out of the file.
df.to_csv('./datasets/clean_data_lemstemed.csv', index=False)