In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import cross_val_score, train_test_split 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# read in the dataframe

df = pd.read_csv('/Users/juhee/Desktop/GA/Submissions/Projects/project_3-master/data/reddit_clean.csv')

In [3]:
# check the shape and the first 5 rows

print(df.shape)
df.head()

(6000, 6)


Unnamed: 0,subreddit,selftext,title,body,text_all,y
0,relationship_advice,I'am a bi 58 year old male. I have not came ...,BI 58 Male Caught With Guy,,BI 58 Male Caught With Guy I'am a bi 58 year o...,1
1,legaladvice,"Not looking for advice, just a question. If so...",When a dealer gets arrested do his roommates g...,,When a dealer gets arrested do his roommates g...,0
2,relationship_advice,\nUsing a throwaway cause my friends know my r...,My(14M) friend (14F) that I’ve been talking to...,,My(14M) friend (14F) that I’ve been talking to...,1
3,relationship_advice,"On mobile, kinda long sorry on advance. \n\nAp...",Did my boyfriend and I make the right assumpti...,,Did my boyfriend and I make the right assumpti...,1
4,relationship_advice,Me (22M) and my partner(21f) have been in rela...,Tough patch in our relationship of four years,,Tough patch in our relationship of four years ...,1


### Feature Engineering and Preprocessing

In [4]:
# instantiate lemmatizer and tokenizer

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

In [5]:
## tokenize and lemmatize

# create a column 'texts_cleaned'

df['texts_cleaned'] = ''

# iterate through each row in the column text_all
    
for i in range(len(df['text_all'])):
        
    # tokenize each word in text_all into its own string
    text_token = []
    text_token.extend(tokenizer.tokenize(df['text_all'][i].lower()))
    text_tokens = []
    [text_tokens.append(word) for word in text_token if word not in text_tokens]
        
    # lemmatize the words
    text_lemmatize = []
    for j in range(len(text_tokens)):
        text_lemmatize.append(lemmatizer.lemmatize(text_tokens[j]))
        
    # remove characters and numbers
    clean_text = []
    for k in range(len(text_lemmatize)):
        clean_text.append(re.sub('[^a-zA-Z]', '', text_lemmatize[k]))    
        
    # group them together
    texts_collection = [text for text in clean_text]
    
    # put the words back to one long string for vectorization
    texts_collection = ' '.join(texts_collection)

    # fill new column with 'cleaned' string from column:
    df['texts_cleaned'][i] = texts_collection

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
# check the column

df['texts_cleaned'][0]

'bi  male caught with guy i am a year old  have not came out gay  but found it lot easier to pick up men then woman since covid been having guy come over that know and sex most just want blowjob live my daughter who is in college she work usually will arrange meeting when work wa the middle of week scheduled go into had lined at his lunch time for quick he hurry so pulled pant down started give him living room did here front door open heard her best friend laughing they me this dick mouth went right bedroom finished sucking tell left now worried ex mother we bad divorce because told before were married talked about everything could get fact other posted thing on facebook after post page would never are them internet what should do '

In [7]:
# check the dataframe

df.head()

Unnamed: 0,subreddit,selftext,title,body,text_all,y,texts_cleaned
0,relationship_advice,I'am a bi 58 year old male. I have not came ...,BI 58 Male Caught With Guy,,BI 58 Male Caught With Guy I'am a bi 58 year o...,1,bi male caught with guy i am a year old have...
1,legaladvice,"Not looking for advice, just a question. If so...",When a dealer gets arrested do his roommates g...,,When a dealer gets arrested do his roommates g...,0,when a dealer get arrested do his roommate get...
2,relationship_advice,\nUsing a throwaway cause my friends know my r...,My(14M) friend (14F) that I’ve been talking to...,,My(14M) friend (14F) that I’ve been talking to...,1,my m friend f that i ve been talking to just t...
3,relationship_advice,"On mobile, kinda long sorry on advance. \n\nAp...",Did my boyfriend and I make the right assumpti...,,Did my boyfriend and I make the right assumpti...,1,did my boyfriend and i make the right assumpti...
4,relationship_advice,Me (22M) and my partner(21f) have been in rela...,Tough patch in our relationship of four years,,Tough patch in our relationship of four years ...,1,tough patch in our relationship of four year m...


In [8]:
# drop unnecessary columns - selftext, title, body

df.drop(columns = ['selftext', 'title', 'body'], inplace = True)

In [9]:
## count words in each post

# set up an empty list for post length

post_w_cnt = []

# iterate through the title column

for i in range(len(df['texts_cleaned'])):
    
    post_tokens = tokenizer.tokenize(df['texts_cleaned'][i])
    cnt = len(post_tokens)
    post_w_cnt.append(cnt)

In [10]:
# create the column

df['post_w_cnt'] = post_w_cnt

In [11]:
# check if the columns has been created

df.head()

Unnamed: 0,subreddit,text_all,y,texts_cleaned,post_w_cnt
0,relationship_advice,BI 58 Male Caught With Guy I'am a bi 58 year o...,1,bi male caught with guy i am a year old have...,139
1,legaladvice,When a dealer gets arrested do his roommates g...,0,when a dealer get arrested do his roommate get...,36
2,relationship_advice,My(14M) friend (14F) that I’ve been talking to...,1,my m friend f that i ve been talking to just t...,184
3,relationship_advice,Did my boyfriend and I make the right assumpti...,1,did my boyfriend and i make the right assumpti...,356
4,relationship_advice,Tough patch in our relationship of four years ...,1,tough patch in our relationship of four year m...,206


In [12]:
# check for null values

df.isnull().sum()

subreddit        0
text_all         0
y                0
texts_cleaned    0
post_w_cnt       0
dtype: int64

In [13]:
# check if there's 0 value in post_w_cnt

df.loc[df['post_w_cnt'] == 0]

Unnamed: 0,subreddit,text_all,y,texts_cleaned,post_w_cnt
3309,relationship_advice,.,1,,0
4079,relationship_advice,キモイ,1,,0
4264,relationship_advice,❤️,1,,0
5063,relationship_advice,22,1,,0
5213,legaladvice,10-4,0,,0


In [14]:
# drop any rows that have 0 in post_w_cnt

df = df[df['post_w_cnt'] != 0]

In [15]:
# check the dataframe

print(df.shape)
df.head()

(5995, 5)


Unnamed: 0,subreddit,text_all,y,texts_cleaned,post_w_cnt
0,relationship_advice,BI 58 Male Caught With Guy I'am a bi 58 year o...,1,bi male caught with guy i am a year old have...,139
1,legaladvice,When a dealer gets arrested do his roommates g...,0,when a dealer get arrested do his roommate get...,36
2,relationship_advice,My(14M) friend (14F) that I’ve been talking to...,1,my m friend f that i ve been talking to just t...,184
3,relationship_advice,Did my boyfriend and I make the right assumpti...,1,did my boyfriend and i make the right assumpti...,356
4,relationship_advice,Tough patch in our relationship of four years ...,1,tough patch in our relationship of four year m...,206


In [16]:
# save the final dataset as csv

df.to_csv('/Users/juhee/Desktop/GA/Submissions/Projects/project_3-master/data/reddit_final.csv', index = False)