In [3]:
# Imports
import pandas as pd
from bs4 import BeautifulSoup
import regex as re

from nltk.tokenize import RegexpTokenizer
import nltk as nltk
from nltk.corpus import stopwords

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Antho\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
# Read in data
rock = pd.read_csv('./data/rock_round2.csv')
rap = pd.read_csv('./data/rap_round2.csv')

In [5]:
# Get shape of rock dataframe
rock.shape

(993, 4)

In [6]:
# Get shape of rap dataframe
rap.shape

(996, 4)

In [7]:
# Show first few rows of rock
rock.head()

Unnamed: 0,title,comments,age,thread
0,Funkadelic - Maggot Brain,16,841.220417,rock
1,Motorhead - We Are Motorhead (Live Germany 2004),0,289.60375,rock
2,Which bands do you prefer?,5,125.55375,rock
3,Anyone with a special affinity towards lengthi...,12,654.987084,rock
4,Jebediah - “Jerks Of Attention” (1997),0,181.837084,rock


In [8]:
# Show first few rows of rap
rap.head()

Unnamed: 0,title,comments,age,thread
0,Kodak v. Corona,13,632.17165,rap
1,Had to put it out there,25,1742.32165,rap
2,Is this the fastest someone can rap while bein...,0,85.088317,rap
3,Amazon Echo: Tyga Edition,0,236.02165,rap
4,I would like to see who would win 20 hits for ...,1,92.32165,rap


In [9]:
# Custom function that will clean a dataframe column
def clean_text_column(df_column):
    
    # Define a list that will contain words in a column of subreddit titles
    words_list = [] 
    
    # Remove punctuation and convert to lowercase
    df_column = pd.Series([re.sub("[^a-zA-Z]", " ", BeautifulSoup(text).get_text().lower()) for text in df_column])
    
    # Instantiate tokenizer
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    
    # Tokenize each line of the series and append each line (less stopwords) to words_list
    for line in df_column:
        test = tokenizer.tokenize(line)
        words_list.append(' '.join([word for word in test if word not in stopwords.words('english')]))
    
    # Return the cleaned column as a Pandas series
    return pd.Series(words_list)

In [12]:
# Clean the two columns in question
rock_unique_words = clean_text_column(rock['title'])
rap_unique_words = clean_text_column(rap['title'])

  ' that document to Beautiful Soup.' % decoded_markup


In [13]:
# Display unique words in rock
rock_unique_words

0                                funkadelic maggot brain
1                       motorhead motorhead live germany
2                                           bands prefer
3        anyone special affinity towards lengthier songs
4                               jebediah jerks attention
                             ...                        
988    brother band released first ep think pretty gr...
989    little self promotion three piece nj garage ro...
990                          hey guys song made acoustic
991    first ep needs listeners huge effort need help...
992    watch burning windows tramp official music vid...
Length: 993, dtype: object

In [14]:
# Display unique words in rap
rap_unique_words

0                                      kodak v corona
1                                                 put
2      fastest someone rap able understand every word
3                            amazon echo tyga edition
4         would like see would win hits hits yall got
                            ...                      
991                    looking vinyl gift please help
992                                 found guy youtube
993                       dame dash reclaiming throne
994                     made new song maybe want hear
995                     new playlist original concept
Length: 996, dtype: object

Assuming that words occuring most frequently in each subreddit do not also overlap between the two subreddits, this data should be adequate for training a model. Roughly 1000 posts for each subreddit will be used as training data and upon first glance at the unique words these data seem to be unique enough for, at the very least, a human to differentiate between the two subreddits.

In [17]:
# Check for null values
print(f'Null Values - Rock Words: {rock_unique_words.isnull().sum()}')
print()
print(f'Null Values - Rap Words: {rap_unique_words.isnull().sum()}')

Null Values - Rock Words: 0

Null Values - Rap Words: 0


In [21]:
# Check data types
print(f'Data Type - Rock: {rock_unique_words.dtypes}')
print()
print(f'Data Type - Rap: {rap_unique_words.dtypes}')

Data Type - Rock: object

Data Type - Rap: object


In [22]:
# Overwrite the original titles with titles containing only unique words
rock['title'] = rock_unique_words
rap['title'] = rap_unique_words

In [23]:
# Null values unexpectedly appeared during the modeling phase; double check their presence here.
rock['title'].isnull().sum()

0

In [24]:
# Null values unexpectedly appeared during the modeling phase; double check their presence here.
rap['title'].isnull().sum()

0

The code below was only ran once; the files 'rock_clean.csv' and 'rap_clean.csv' contain the original cleaned data used for this project.

In [26]:
# Create .csv files for two subreddits
rock.to_csv('./data/rock_clean_round2.csv', index=False)
rap.to_csv('./data/rap_clean_round2.csv', index=False)