In [9]:
# Imports
import pandas as pd
from bs4 import BeautifulSoup
import regex as re

from nltk.tokenize import RegexpTokenizer
import nltk as nltk
from nltk.corpus import stopwords

In [14]:
# Read in data
rock = pd.read_csv('./data/rock.csv')
rap = pd.read_csv('./data/rap.csv')

In [15]:
# Get shape of rock dataframe
rock.shape

(959, 4)

In [16]:
# Get shape of rap dataframe
rap.shape

(999, 4)

In [17]:
# Show first few rows of rock
rock.head()

Unnamed: 0,title,comments,age,thread
0,"October 2019 Recommendations, Suggestions, and...",20,20551.811835,rock
1,October 2019 Covers Thread,3,20550.578502,rock
2,"Rock Hall of Fame: Notorious B.I.G., Whitney H...",9,580.078502,rock
3,Pat Benatar - Invincible,0,106.795169,rock
4,Kate Bush & David Gilmour - Running Up That Hi...,0,544.745169,rock


In [18]:
# Show first few rows of rap
rap.head()

Unnamed: 0,title,comments,age,thread
0,How does eminem do it in one breath?,45,844.253953,rap
1,Trrewillz - I Ain't Got Nothin - Inland Empire...,1,161.57062,rap
2,I need a rapper for this very raw beat I made....,1,163.52062,rap
3,Rap Bangers?,6,168.57062,rap
4,One of my favorite verses is J Cole's verse at...,0,153.387287,rap


In [19]:
# Custom function that will clean a dataframe column
def clean_text_column(df_column):
    
    words_list = []
    
    # Remove punctuation and convert to lowercase
    df_column = pd.Series([re.sub("[^a-zA-Z]", " ", BeautifulSoup(text).get_text().lower()) for text in df_column])
    
    # Instantiate tokenizer
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    
    # Tokenize each line of the series and append each line (less stopwords) to words_list
    for line in df_column:
        test = tokenizer.tokenize(line)
        words_list.append(' '.join([word for word in test if word not in stopwords.words('english')]))
    
    # Return the cleaned pandas series
    return pd.Series(words_list)

In [20]:
# Clean the two columns in question
rock_unique_words = clean_text_column(rock['title'])
rap_unique_words = clean_text_column(rap['title'])

In [21]:
# Display unique words in rock
rock_unique_words

0      october recommendations suggestions playlists ...
1                                  october covers thread
2      rock hall fame notorious b g whitney houston s...
3                                 pat benatar invincible
4      kate bush david gilmour running hill live secr...
                             ...                        
954      narcissus paradise johnny john schemin dreamers
955                       artist retrospect alex chilton
956                       rush limelight moving pictures
957                             zz top sharp dressed man
958              dirty honey gone acoustic live sdr show
Length: 959, dtype: object

In [22]:
# Display unique words in rap
rap_unique_words

0                                      eminem one breath
1      trrewillz got nothin inland empire rap new rap...
2             need rapper raw beat made open suggestions
3                                            rap bangers
4      one favorite verses j cole verse end logic afr...
                             ...                        
994                  gretzky big bity bank freestyle rap
995                                  pac eminem die soon
996    blackbear x bryson tiller x juice wrld type be...
997                                        pac overrated
998                                    finished playlist
Length: 999, dtype: object

Assuming that words occuring most frequently in each subreddit do not also overlap between the two subreddits, this data should be adequate for training a model. Roughly 1000 posts for each subreddit will be used as training data and upn first glance at the unique words these data seem to be unique enough for, at the very least, a human to differentiate between the two subreddits.

In [23]:
# Check for null values
print(rock_unique_words.isnull().sum())
print()
print(rap_unique_words.isnull().sum())

0

0


In [24]:
# Check data types
print(rock_unique_words.dtypes)
print()
print(rap_unique_words.dtypes)

object

object


In [26]:
# Overwrite original thread title words with unique words only
rock['title'] = rock_unique_words
rap['title'] = rap_unique_words

In [27]:
# Null values unexpectedly appeared during the modeling phase; double check their presence here.
rock['title'].isnull().sum()

0

In [28]:
# Null values unexpectedly appeared during the modeling phase; double check their presence here.
rap['title'].isnull().sum()

0

The code below was only ran once; the files 'rock_clean.csv' and 'rap_clean.csv' contain the original cleaned data used for this project.

In [29]:
# Create .csv files for two subreddits
rock.to_csv('./data/rock_clean.csv', index=False)
rap.to_csv('./data/rap_clean.csv', index=False)