In [1]:
import requests
import time
import nltk
import pandas as pd
import regex as re
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
df = pd.read_csv('../csv_scrapes/concatenated_subreddit.csv')

In [3]:
df.shape

(3201, 8)

In [4]:
df.head()

Unnamed: 0,subreddit_topic,subreddit_time_created,subreddit_title,subreddit_id,subreddit_score,subreddit_num_comments,subreddit_body,subreddit_url
0,80sRock,2020-07-30 17:55:12,Ozzy Osbourne Crazy Train Acoustic Instrum...,i0nfuw,2,0,Hey All Just wanted to share my acoustic ins...,https://www.reddit.com/r/80sRock/comments/i0nf...
1,80sRock,2020-07-29 23:20:20,SONG ID Please help,i07wij,1,2,My workplace has one of those pumped in music ...,https://www.reddit.com/r/80sRock/comments/i07w...
2,80sRock,2020-07-15 05:23:34,Rock you like a hurricane,hrf8sg,4,0,People of the world I just thought I d shar...,https://www.reddit.com/r/80sRock/comments/hrf8...
3,80sRock,2020-07-08 03:15:09,Does anyone have any good sex pistols songs,hn6i70,1,0,I want to listen to a song by them but don t k...,https://www.reddit.com/r/80sRock/comments/hn6i...
4,80sRock,2019-11-08 23:09:33,trying to find a tour t shirt from now and zen...,dtlvjl,1,0,Title says it but my dad went to the Now and Z...,https://www.reddit.com/r/80sRock/comments/dtlv...


In [5]:
needed_col = ['subreddit_topic', 'subreddit_body']
df = df[needed_col]

In [6]:
df.shape

(3201, 2)

In [7]:
df['target'] = np.where(df['subreddit_topic'].str.contains('dadjokes', regex = False), 1, 0)

In [8]:
df['target'].value_counts()

0    2360
1     841
Name: target, dtype: int64

In [9]:
df.shape

(3201, 3)

In [10]:
df.head()

Unnamed: 0,subreddit_topic,subreddit_body,target
0,80sRock,Hey All Just wanted to share my acoustic ins...,0
1,80sRock,My workplace has one of those pumped in music ...,0
2,80sRock,People of the world I just thought I d shar...,0
3,80sRock,I want to listen to a song by them but don t k...,0
4,80sRock,Title says it but my dad went to the Now and Z...,0


In [11]:
#set my tokenizer
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

#Lemmatize
lemmatizer = WordNetLemmatizer()

In [12]:
df.columns

Index(['subreddit_topic', 'subreddit_body', 'target'], dtype='object')

In [13]:
def column_cleaner(column, df = df):
    #For some reason, I was running into errors trying to run this code until I added the code
    #below (df[column+'_clean'] = ""), establishing from the beginning that the new column to be created
    #exists in the dataframe and contains nothing but empty strings.
    
    df[column + '_clean'] = ""
    
    #for loop through each row in the column:
    for i in range(len(df[column])):
        
        #Tokenize, or separate, each word in column's string into its own string (prep for lemmatization):
        col_tok = []
        col_tok.extend(tokenizer.tokenize(df[column][i].lower()))
        col_token = []
        [col_token.append(s) for s in col_tok if s not in col_token]
        
        #Lemmatize the words (cut the word to its base/root, for improved model results):
        col_lem = []
        for x in range(len(col_token)):
            col_lem.append(lemmatizer.lemmatize(col_token[x]))
        
        #Remove characters and numbers (for improved model results, hopefully):
        letters_only_col = []
        for c in range(len(col_lem)):
            letters_only_col.append(re.sub("[^a-zA-Z]", "", col_lem[c]))
        
        #Remove stopwords (for improved model results):
        col_words = [w for w in letters_only_col if not w in stopwords.words('english')]
        
        
        #Ensure that there are no 'None' objects in title_words:
        col_words = list(filter(None, col_words))

        #Join the lemmatized words - stopwords back to one long string (prep for
        #vectorization, done outside/after this function):
        col_words = " ".join(col_words)

        #Fill new column with 'cleaned' string from column:
        df[column+'_clean'][i] = col_words

In [14]:
column_cleaner(column='subreddit_body', df=df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [15]:
df.head()

Unnamed: 0,subreddit_topic,subreddit_body,target,subreddit_body_clean
0,80sRock,Hey All Just wanted to share my acoustic ins...,0,hey wanted share acoustic instrumental version...
1,80sRock,My workplace has one of those pumped in music ...,0,workplace ha one pumped music playlist pm bloc...
2,80sRock,People of the world I just thought I d shar...,0,people world thought share attempt song includ...
3,80sRock,I want to listen to a song by them but don t k...,0,want listen song know start
4,80sRock,Title says it but my dad went to the Now and Z...,0,title say dad went zen tour wa younger got shi...


In [16]:
#vectorizer function definitions

In [17]:
#create CountVectorize Function

def count_vec_column(column, func_df=df):
    #Instantiate CountVectorizer:
    vect = CountVectorizer()
    
    #Create temporary variable X_text that takes on the fit/transformed results of the column:
    X_text = vect.fit_transform(func_df[column])
    
    #Turn X_text into an array (prep to easily make a DataFrame):
    X_text = X_text.toarray()
    
    #Create a temporary DataFrame with each word/word-pair/word-group as the columns:
    temp_df = pd.DataFrame(X_text,
                           columns = vect.get_feature_names())
    
    #Add the original column name to the beginning of the new columns' names to differentiate from which column
    # the vectorized words came from (this may impact the strength of the model):
    for i in range(len(temp_df.columns)):
        #print(i)
        temp_df.rename(columns={temp_df.columns[i]: column + '_' + temp_df.columns[i]}, inplace=True)
    
    #Combine the two DataFrames:
    func_df = pd.concat([func_df, temp_df], axis=1)
    return func_df

In [18]:
#CountVectorize
#commented out after beta checkpoint created because cpu intensive
df_cv = count_vec_column(func_df=df, column='subreddit_body_clean')

In [19]:
df_cv.to_csv('../csv_vectorized/subreddit_count_vec.csv', index=False, sep=",")