### Installs

In [1]:
# !pip install -U nltk

In [2]:
# !pip install regex

### Imports

In [3]:
import pandas as pd
import numpy as np
import regex as re

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

### Read `.csv`

In [4]:
# Read in .csv files
def _read_numbered_csvs(prefix, count, data_dir='./data'):
    """Read ``<data_dir>/<prefix>1.csv`` .. ``<data_dir>/<prefix><count>.csv``.

    Returns a list of DataFrames, one per file, in file-number order.
    """
    return [
        pd.read_csv('{}/{}{}.csv'.format(data_dir, prefix, i))
        for i in range(1, count + 1)
    ]


# The individual names are kept because later cells refer to them directly.
aww1, aww2, aww3, aww4, aww5 = _read_numbered_csvs('aww', 5)

ds1, ds2, ds3, ds4 = _read_numbered_csvs('ds', 4)

cats1, cats2, cats3, cats4 = _read_numbered_csvs('cats', 4)

dogs1, dogs2, dogs3, dogs4 = _read_numbered_csvs('dogs', 4)

In [5]:
# Concatenate .csv files into one DataFrame

# # cats vs. dogs
# df = pd.concat([cats1, cats2, cats3, dogs1, dogs2, dogs3])

# cats vs. dogs vs. aww: stack every scraped batch into one frame
df = pd.concat([
    aww1, aww2, aww3, aww4, aww5,
    cats1, cats2, cats3, cats4,
    dogs1, dogs2, dogs3, dogs4,
])

# # cats vs. dogs vs. aww vs. datascience
# df = pd.concat([ds1, ds2, ds3, ds4, aww1, aww2, aww3, aww4, cats1, cats2, cats3, dogs1, dogs2, dogs3])

### Initial cleaning

In [6]:
# Check for nulls
df.isnull().sum()

Unnamed: 0         0
title              0
subreddit          0
name               0
num_comments       0
is_video           0
pinned             0
id                 0
selftext        8151
dtype: int64

In [7]:
# Drop the columns that are not needed downstream
df.drop(columns=['Unnamed: 0', 'selftext', 'num_comments', 'pinned'], inplace=True)

In [8]:
# Reset index
df.reset_index(inplace=True, drop=True)

In [9]:
df.head()

Unnamed: 0,title,subreddit,name,is_video,id
0,R/AWW OFFICIAL DISCORD!!! JOIN NOW!,aww,t3_97vuvt,False,97vuvt
1,r/catsandchristmastrees: /r/Aww Subreddit of t...,aww,t3_a7yh12,False,a7yh12
2,sunset,aww,t3_a80fbf,False,a80fbf
3,Bandit turned 25 years old on December 7th!,aww,t3_a814sv,False,a814sv
4,Looking good for 23!!,aww,t3_a7z8or,False,a7z8or


#### Identify and remove duplicate rows

In [10]:
len(df['name'])

11961

In [11]:
len(set(df['name']))

9328

In [12]:
duplicates = df.duplicated(subset='name',
                          keep='first')

In [13]:
df = pd.concat([df, duplicates], axis=1)

In [14]:
df.rename(columns={0:'duplicate'},inplace=True)

In [15]:
# Redefine df as df without duplicates.
# .copy() makes the result an independent frame: later cells mutate it
# (in-place index reset, adding a column), which on a filtered slice can
# raise SettingWithCopyWarning.
df = df[~df['duplicate']].copy()

In [16]:
# Reset index
df.reset_index(inplace=True, drop=True)

In [17]:
# Check all rows
len(df['name'])

9328

In [18]:
# Check that all rows are unique
len(set(df['name']))

9328

In [19]:
# Check how many are videos
video = df[df['is_video'] == True]
len(video)

765

In [20]:
# How many videos are datascience
len(video[video['subreddit'] == 'datascience'])

0

In [21]:
# How many videos are dogs
len(video[video['subreddit'] == 'dogs'])

0

In [22]:
# How many videos are cats
len(video[video['subreddit'] == 'cats'])

201

In [23]:
# How many videos are aww
len(video[video['subreddit'] == 'aww'])

564

### Analysis

**As mentioned in the modeling analysis ahead**, there were many `aww` posts that were predicted incorrectly as `cats`. I think that adding in another independent variable like `is_video` could improve the accuracy of my model.

### Instantiate tokenizer and lemmatizer


In [24]:
tokenizer = RegexpTokenizer(r'\w+', gaps=False)
lemmatizer = WordNetLemmatizer()

### Cleaning subreddit titles

In [25]:
# Pull the titles out as a plain Python list of strings.
# Selecting the column by name avoids building a Series for every row and
# does not rely on 'title' happening to be the first column.
titles = df['title'].tolist()

In [26]:
titles[:3]

['R/AWW OFFICIAL DISCORD!!! JOIN NOW!',
 'r/catsandchristmastrees: /r/Aww Subreddit of the Week!',
 'sunset']

#### Remove numbers

In [27]:
def strip_num(titles):
    """Remove every run of digits from each title, modifying ``titles`` in place.

    Parameters
    ----------
    titles : list of str
        Raw titles; each element is replaced by its digit-free version.

    Returns
    -------
    None
        The list is updated in place.
    """
    # r'\d+' is the direct spelling of the original double negative '[^\D]+'.
    # The raw string also avoids Python's invalid-escape-sequence warning that
    # a plain '\D' literal triggers.
    for i, title in enumerate(titles):
        titles[i] = re.sub(r'\d+', '', title)

In [28]:
strip_num(titles)

In [29]:
titles[:3]

['R/AWW OFFICIAL DISCORD!!! JOIN NOW!',
 'r/catsandchristmastrees: /r/Aww Subreddit of the Week!',
 'sunset']

#### Tokenize titles

In [30]:
def tokenize(titles):
    """Lower-case each title and split it into tokens, in place.

    Every string in ``titles`` is replaced by the list of tokens produced by
    the notebook-level ``tokenizer``. Nothing is returned.
    """
    for position, text in enumerate(titles):
        lowered = text.lower()
        titles[position] = tokenizer.tokenize(lowered)

In [31]:
tokenize(titles)

In [32]:
titles[:3]

[['r', 'aww', 'official', 'discord', 'join', 'now'],
 ['r', 'catsandchristmastrees', 'r', 'aww', 'subreddit', 'of', 'the', 'week'],
 ['sunset']]

#### Remove stop words

In [33]:
# Remove stop words
def stop_words(titles):
    """Drop English stop words from every tokenized title, in place.

    Parameters
    ----------
    titles : list of list of str
        Tokenized titles; each inner list is replaced by a new list that
        omits tokens found in NLTK's English stop-word list.

    Returns
    -------
    None
        The outer list is updated in place.
    """
    # Load the stop-word list once and keep it in a set. The original
    # re-evaluated stopwords.words('english') for every single token and then
    # searched the resulting list linearly.
    english_stops = set(stopwords.words('english'))
    for i, raw_words in enumerate(titles):
        titles[i] = [w for w in raw_words if w not in english_stops]
        

In [34]:
stop_words(titles)

In [35]:
titles[:3]

[['r', 'aww', 'official', 'discord', 'join'],
 ['r', 'catsandchristmastrees', 'r', 'aww', 'subreddit', 'week'],
 ['sunset']]

#### Lemmatize ALL tokens

In [36]:
# Lemmatize tokens
# Module-level result holder: the calling cell reads it after lemm() runs.
list2 = []
def lemm(titles):
    """Lemmatize every token of every title and store the result in ``list2``.

    Each token is passed to ``lemmatizer.lemmatize`` without a part-of-speech
    argument. ``titles`` itself is not modified.

    Parameters
    ----------
    titles : list of list of str
        Tokenized titles.

    Returns
    -------
    list of list of str
        The same object as the module-level ``list2``, holding one list of
        lemmatized tokens per title.
    """
    # Build the full result first, then overwrite list2 in one step. The
    # original appended to list2, so calling lemm() a second time (e.g.
    # re-running the cell) silently duplicated every title.
    list2[:] = [[lemmatizer.lemmatize(token) for token in title] for title in titles]
    return list2

In [37]:
lemm(titles)
titles = list2

In [38]:
titles[:3]

[['r', 'aww', 'official', 'discord', 'join'],
 ['r', 'catsandchristmastrees', 'r', 'aww', 'subreddit', 'week'],
 ['sunset']]

### Analysis

Now that the remaining tokens have been lemmatized, inflected forms have been reduced to their dictionary form (their *lemma*). This means that words that are spelled differently but share the same lemma will no longer be processed as different words.

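In this example, lemmatizing collapses plural and singular forms onto the *same* word. Note that the lemmatizer is called without a part-of-speech tag here, and in the output above only noun plurals change: verb forms such as `turned` and `looking` are left as they are.

 - years --> year
 - year --> year
 - dogs --> dog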

#### Join strings

In [39]:
# Collapse each token list back into one space-separated string
for position, tokens in enumerate(titles):
    titles[position] = ' '.join(tokens)

In [40]:
titles[:5]

['r aww official discord join',
 'r catsandchristmastrees r aww subreddit week',
 'sunset',
 'bandit turned year old december th',
 'looking good']

#### Put into DataFrame

In [41]:
# Build an empty single-column frame with one row per cleaned title
clean_title = pd.DataFrame(columns=['clean_title'], index=range(len(titles)))

In [42]:
# Input clean titles into the DataFrame.
# A single column assignment replaces the row-by-row .iloc loop, which did one
# indexed write per title.
clean_title['clean_title'] = titles

In [43]:
# Attach the cleaned titles to the original DataFrame as a new column
# (values are aligned on the index)
df['clean_title'] = clean_title['clean_title']

In [44]:
df.head()

Unnamed: 0,title,subreddit,name,is_video,id,duplicate,clean_title
0,R/AWW OFFICIAL DISCORD!!! JOIN NOW!,aww,t3_97vuvt,False,97vuvt,False,r aww official discord join
1,r/catsandchristmastrees: /r/Aww Subreddit of t...,aww,t3_a7yh12,False,a7yh12,False,r catsandchristmastrees r aww subreddit week
2,sunset,aww,t3_a80fbf,False,a80fbf,False,sunset
3,Bandit turned 25 years old on December 7th!,aww,t3_a814sv,False,a814sv,False,bandit turned year old december th
4,Looking good for 23!!,aww,t3_a7z8or,False,a7z8or,False,looking good


#### Save to .csv

In [46]:
# df.to_csv('./data/animal_data')