# Predicting Reddit R-rated Content by Title
The goal of this analysis is to predict, from a post's title alone, whether the post is intended for readers over the age of 18 or is suitable for everyone.

In [4]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import string 
from nltk.stem import WordNetLemmatizer



In [3]:
# Load the two CSV exports produced by an earlier SQL query
# (judging by the file names, one file per class: over-18 and under-18 posts).
# The `title` and `over_18` columns of these frames are used by the cells below.
df1 = pd.read_csv("Over18Content.csv")
df2 = pd.read_csv("Under18Content.csv")

In [7]:
# Stack the two frames row-wise into one dataset.
# NOTE: pd.concat keeps each frame's own index by default (ignore_index is not set),
# so index labels in df3 can repeat across the two source frames.
frames = [df1, df2]

df3 = pd.concat(frames)

In [10]:
# Sanity check: (number of rows, number of columns) of the combined dataset
df3.shape

(2000, 2)

In [83]:
# Build one list of cleaned tokens per post title.
# The stop-word set, the punctuation set and the lemmatizer do not depend on the
# row being processed, so they are created once instead of on every iteration.
english_stops = set(stopwords.words('english'))
punctuation = set(string.punctuation)
wordnet_lemmatizer = WordNetLemmatizer()

Titles = []

for row in df3["title"]:
    #tokenize words
    words = word_tokenize(row)
    #remove punctuation tokens and lowercase everything else
    #(iterate over `words`: the previous source name `remove_numbers` is not defined
    # anywhere, so the cell raised NameError on a fresh kernel; numeric tokens are kept)
    clean_words = [word.lower() for word in words if word not in punctuation]
    #remove stop words
    clean_words = [word for word in clean_words if word not in english_stops]
    #lemmatize words
    lemma_list = [wordnet_lemmatizer.lemmatize(word) for word in clean_words]
    Titles.append(lemma_list)
    


In [177]:
#Convert each column to list and zip together to prepare it for bag of words

In [84]:
# One over_18 label per post, in the row order of df3
Age_Allowed = list(df3["over_18"])

In [85]:
# Sanity check: there must be exactly one label per tokenized title
# before the two lists are zipped together.
print(len(Titles))
print(len(Age_Allowed))

2000
2000


In [182]:
# Spot-check one row: the cleaned tokens of a title and the over_18 label stored at the same position
print(Titles[1500])
print(Age_Allowed[1500])

['ending', 'place', 'never', 'thought', 'would']
False


In [183]:
# Pair every token list with its label.
# zip() returns a one-shot iterator in Python 3; materialise it as a list so the
# pairs are still available when a cell that consumes them is run more than once.
combined = list(zip(Titles, Age_Allowed))

In [184]:
#Define bag_of_words function
def bag_of_words(words):
    """Build a presence-only feature dict from an iterable of tokens.

    Every distinct token in ``words`` becomes a key mapped to ``True``.
    Repeated tokens collapse into a single entry, so no counts are kept.
    """
    return {token: True for token in words}

In [89]:
# Turn every (tokens, label) pair into a (feature dict, label) tuple, the form that
# is later passed to the NLTK classifier. The label comes from the over_18 column.
# Each feature dict is built exactly once (the previous loop also built and
# discarded an extra copy per row).
Final_Data = [(bag_of_words(tokens), label) for tokens, label in combined]

print(Final_Data[0:5])



In [152]:
# Shuffle the data in place so the positional train/test split below does not
# simply follow the order in which the two source frames were concatenated.
# A fixed seed makes the shuffle, and therefore the split and the reported
# metrics, repeatable after Restart Kernel -> Run All.
import random

SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(Final_Data)
print(len(Final_Data))

2000


## Build Naive Bayes Model

In [153]:
import nltk
import collections
from nltk.metrics.scores import (accuracy, precision, recall, f_measure)
from nltk import metrics

# Split the dataset into training and test subsets.
# The cut point is derived from the dataset size (70% train / 30% test) instead
# of a hard-coded row number; for a 2000-row dataset it is still 1400.
TRAIN_PERCENT = 70
split_index = len(Final_Data) * TRAIN_PERCENT // 100
train_set, test_set = Final_Data[:split_index], Final_Data[split_index:]

# label -> set of test-row positions, keyed once by the true label (refsets3)
# and once by the predicted label (testsets3). This is the input shape used by
# the set-based metrics imported above (precision / recall / f_measure);
# this cell itself only reports accuracy.
refsets3 = collections.defaultdict(set)
testsets3 = collections.defaultdict(set)

classifier = nltk.NaiveBayesClassifier.train(train_set)

for i, (feats, label) in enumerate(test_set):
    refsets3[label].add(i)
    observed = classifier.classify(feats)
    testsets3[observed].add(i)

print("Naive Bayes Performance with Unigrams ")
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))

# Features whose presence shifts the class likelihood ratio the most
classifier.show_most_informative_features(n=10)

Naive Bayes Performance with Unigrams 
Accuracy: 0.8416666666666667
Most Informative Features
                      lf = True             True : False  =     18.8 : 1.0
                    chat = True             True : False  =     16.9 : 1.0
                     fun = True             True : False  =     14.9 : 1.0
                      22 = True             True : False  =      8.9 : 1.0
               subreddit = True             True : False  =      8.3 : 1.0
                    idea = True            False : True   =      7.7 : 1.0
                campaign = True            False : True   =      7.7 : 1.0
                 request = True             True : False  =      7.3 : 1.0
                      24 = True             True : False  =      7.3 : 1.0
                       v = True            False : True   =      7.1 : 1.0


## Null Accuracy to Compare to Model

In [186]:
# Null accuracy is the accuracy of a baseline that always predicts the most
# frequent class in the test set, so it must be computed from the MAJORITY
# class count. Using the True count alone is only correct when True happens
# to be the majority class.

# Count how many records in the test set are labelled True vs. False.
# (`== True` is kept so that exactly the same labels are counted as before.)
Counter = sum(1 for _, label in test_set if label == True)
False_Counter = len(test_set) - Counter

print("True:", Counter)
print("False:",False_Counter)

majority_count = max(Counter, False_Counter)
print("Null Accuracy:", (majority_count / (Counter + False_Counter)) * 100)

True: 353
False: 347
Null Accuracy: 50.42857142857143


## Result

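The null accuracy for the test set is 50.43%. The Naive Bayes model accurately classified 84.17% of the records, which is 33.74 percentage points better than a naive baseline that always predicts the majority class. (Note: the null-accuracy counts above total 700 records, whereas the train/test split holds out 600, so the two outputs appear to come from different runs; restart the kernel and run all cells to obtain consistent figures.)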

Analyzing most informative features:
The term "lf" is 18.8 times more likely to appear in an over_18 post than in an under_18 post. According to Urban Dictionary, "lf" means "looking for".

Words such as chat, fun, subreddit, and request also have a higher chance of appearing in over_18 posts.

The numbers 22 and 24 most likely represent the age of the poster, as the example below illustrates.

The words "campaign", "idea", and "v" are the ones most likely to signal an under_18 post.

"v" stands for video upload.


In [205]:
# Show a few titles containing "24" to illustrate how the number appears in titles.
# NOTE: str.contains does substring matching, so titles in which "24" is part of a
# longer token are selected as well.
twenty_four_example = df3[df3['title'].str.contains("24")]
print(twenty_four_example.head(5))

                                                 title  over_18
27   24[M4F] Online/Anywhere - Bi dude searching fo...     True
114  Vendor review - Imperialstormtrooper's GG249 2...     True
184  24 [M4MF] Long Island/NYC Looking to watch a c...     True
225      24 [F4m] Domme seeking bi male slave under 35     True
426  Any sissies want a dom black top to control th...     True


In [214]:
# "fun" is a word that kids use a lot, but adults also use it a lot, in a different way.
# This word shows why it is important to mark content correctly:
# you don't want a young child opening, by mistake, a post from an adult looking for "fun".
# NOTE: str.contains does substring matching, so longer words containing "fun" are selected as well.

fun_example = df3[df3['title'].str.contains("fun")]
print(fun_example.head(4))
print(fun_example.tail(4))

                                                 title  over_18
25   30 [M4F] Portland or Online - Seeking flirty, ...     True
73         21 [M4F] German boy looking for some fun :)     True
252                      [B/S] Pool time fun: Update 9     True
263                     [B/S] Pool time fun: Update 10     True
                                                 title  over_18
726  Which F.Priest is better overall? (dps, end ga...    False
899  What are some of the most fun magical items/ar...    False
924  5e: looking for suggestions for lvl 1 items th...    False
946  What are some fun/Wacky things you have done a...    False
