In [1]:
import numpy as np
import pandas as pd
import emoji
import datetime
from datetime import datetime
from datetime import timedelta
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import requests
from bs4 import BeautifulSoup

In [2]:
#import the data
# reads the CSV (path is relative to the notebook's working directory) into
# the `reddit` DataFrame that the following cells clean step by step
reddit = pd.read_csv('../data/reddit_data.csv')

In [3]:
# discard the leftover 'Unnamed: 0' column and use each post's unique id
# as the row index, building the result as a new frame in one chain
reddit = (
    reddit
    .drop(columns=['Unnamed: 0'])
    .set_index('id')
)

In [4]:
#replace every emoji in a title with its text form via emoji.demojize
reddit['title'] = reddit['title'].map(emoji.demojize)

In [5]:
#keep only the text before the first space in the comments column
#(drops the trailing "comments" word); the value is still a string here
reddit['comments'] = [label.split(' ')[0] for label in reddit['comments']]

#expanding a 'k' / 'm' suffix and converting to an int
def k_thousand(x):
    """Convert an abbreviated count such as '1.2k' into an int.

    Parameters
    ----------
    x : str or number
        Count text. A trailing 'k' (thousands) or 'm' (millions) suffix is
        accepted in either case. Surrounding whitespace and ',' thousands
        separators are ignored.

    Returns
    -------
    int
        The expanded count, e.g. '1.2k' -> 1200 and '37' -> 37.

    Raises
    ------
    ValueError
        If the text is empty or does not hold a number.
    """
    multipliers = {'k': 1000, 'm': 1000000}
    text = str(x).strip().lower().replace(',', '')
    if not text:
        # the original indexed x[-1] here and died with an opaque IndexError
        raise ValueError('cannot convert an empty string to a count')

    suffix = text[-1]
    if suffix in multipliers:
        # round() instead of a bare int(): a float product can land just
        # below the intended whole number and int() would truncate it
        return int(round(float(text[:-1]) * multipliers[suffix]))
    return int(text)
    
# expand abbreviated comment counts into plain integers
reddit['comments'] = reddit['comments'].map(k_thousand)
        

In [6]:
def time_fixer(x):
    """Convert a relative age such as '5 hours ago' into whole hours.

    Parameters
    ----------
    x : str
        Text of the form '<count> <unit> ago' where the unit is minute,
        hour or day, singular or plural.

    Returns
    -------
    int or None
        The age rounded to whole hours. None is returned when the text does
        not match the expected form or uses an unsupported unit, as the
        original implementation did for unmatched input.

    Raises
    ------
    ValueError
        If the unit is recognised but the count is not an integer.
    """
    # minutes per unit, so every conversion goes through the same
    # "minutes / 60" division the minute branch always used
    minutes_per_unit = {'minute': 1, 'hour': 60, 'day': 1440}

    parts = str(x).split()
    if len(parts) != 3 or parts[2] != 'ago':
        return None

    amount, unit = parts[0], parts[1]
    # accept both singular and plural; the original missed 'days' and
    # 'minute' and silently returned None for them
    if unit.endswith('s'):
        unit = unit[:-1]
    if unit not in minutes_per_unit:
        return None

    return round(int(amount) * minutes_per_unit[unit] / 60)

#rename "time" to "hours_ago" first, then convert its text to hour counts
reddit = reddit.rename(columns={'time': 'hours_ago'})
reddit['hours_ago'] = reddit['hours_ago'].map(time_fixer)

In [7]:
#remove the rows that have "Vote" as their number of upvotes
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning
reddit = reddit[reddit['upvotes'] != 'Vote'].copy()

In [8]:
#expand abbreviated upvote counts such as '1.2k' into plain integers
reddit['upvotes'] = reddit['upvotes'].map(k_thousand)

In [9]:
#convert scraped_time to a datetime column
# pd.to_datetime parses the whole column at once with the same explicit
# format, replacing the row-by-row datetime.strptime call
reddit['scraped_time'] = pd.to_datetime(
    reddit['scraped_time'], format='%Y-%m-%d %H:%M:%S.%f')

In [10]:
#converting hours_ago to a timedelta column
# pd.to_timedelta converts the whole column at once; a missing hour count
# becomes NaT instead of raising inside a per-row timedelta() call
reddit['hours_ago'] = pd.to_timedelta(reddit['hours_ago'], unit='h')

#subtracting hours_ago from scraped_time to get the initial posted time
reddit['posted_time'] = reddit['scraped_time'] - reddit['hours_ago']

In [11]:
#creating our target variable, 1 if above median # of comments, 0 if not
# the median is computed once here; the original recomputed it inside the
# lambda for every single row
# NOTE(review): this runs before the de-duplication cells, so duplicated
# posts are included in the median - confirm that is intended
median_comments = reddit['comments'].median()
reddit['target'] = (reddit['comments'] > median_comments).astype(int)

In [12]:
#title length, measured as the number of whitespace-separated words
reddit['title_len'] = [len(headline.split()) for headline in reddit['title']]

In [13]:
#flag for whether a title contains an emoji: 1 when the title holds at
#least two ':' characters, 0 otherwise
# NOTE(review): any title with two or more colons is flagged, whether or
# not they came from an emoji - confirm that is acceptable
reddit['title_emoji'] = [
    int(headline.count(':') >= 2) for headline in reddit['title']
]

In [14]:
#extracting the hour from the posted_time column
hours = [stamp.hour for stamp in reddit['posted_time']]

# midnight (hour 0) is stored as 24; every other hour is kept as-is
reddit['posted_hour'] = [24 if hour == 0 else hour for hour in hours]

In [15]:
#creating a column indicating what day of the week the post was posted
#1 = monday
# weekday() counts from 0, so add one to get the 1-7 numbering used below
reddit['posted_weekday'] = [
    stamp.weekday() + 1 for stamp in reddit['posted_time']
]

def weekdays(x):
    """Map a weekday number (1 = Monday ... 7 = Sunday) to its English name.

    Returns None for any value outside 1-7.
    """
    day_names = {
        1: 'Monday',
        2: 'Tuesday',
        3: 'Wednesday',
        4: 'Thursday',
        5: 'Friday',
        6: 'Saturday',
        7: 'Sunday',
    }
    return day_names.get(x)

# swap the 1-7 weekday numbers for their names
reddit['posted_weekday'] = reddit['posted_weekday'].map(weekdays)

In [16]:
#creating a column with stemmed titles

#lower case all titles
# the original line used `==`, which only compared the two Series and threw
# the result away, so the titles were never actually lower-cased
reddit['title'] = reddit['title'].str.lower()

#instantiate the stemmer
ps = PorterStemmer()

#stem every whitespace-separated word of each title and re-join the words
#with single spaces, giving one stemmed string per post
stemmed_title_list = [
    " ".join(ps.stem(word) for word in post_title.split())
    for post_title in reddit['title']
]

#create a column from stemmed titles
reddit['stemmed_title'] = stemmed_title_list

In [17]:
#dropping duplicated indices: for each repeated post id only the last
#occurrence in the frame is kept
# NOTE(review): "last" is the most recent scrape only if rows are in scrape
# order - confirm
keep_mask = ~reddit.index.duplicated(keep='last')
reddit = reddit.loc[keep_mask]

In [18]:
# dropping duplicated titles, keeping the first occurrence of each
# .copy() makes the result an independent frame so the later
# reddit['post_id'] assignment does not raise SettingWithCopyWarning
reddit = reddit[~reddit.title.duplicated(keep='first')].copy()


In [19]:
#TFIDF top 10

#Instantiate TFIDF vectorizer: English stop words removed, accents folded
#to ascii, terms found in more than 60% of titles ignored, vocabulary capped
#at 10 terms, rows l2-normalised
tvec = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    max_features=10,
    max_df=.60,
    norm='l2',
)

#fit the vectorizer and transform the stemmed titles in a single pass
tfidf_matrix = tvec.fit_transform(reddit['stemmed_title'])

#get_feature_names() was removed in scikit-learn 1.2; use its replacement
#when available and fall back to the old name on older installs
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

#one column per term; toarray() yields a plain ndarray rather than the
#discouraged np.matrix that todense() returns
df_tvec = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
#rows are in the same order as reddit, so the ids line up positionally
df_tvec['post_id'] = reddit.index

#save as a csv
df_tvec.to_csv('../data/title_tfidf_10.csv', index=False)

In [20]:
#TFIDF top 25

#Instantiate TFIDF vectorizer: English stop words removed, accents folded
#to ascii, terms found in more than 60% of titles ignored, vocabulary capped
#at 25 terms, rows l2-normalised
tvec = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    max_features=25,
    max_df=.60,
    norm='l2',
)

#fit the vectorizer and transform the stemmed titles in a single pass
tfidf_matrix = tvec.fit_transform(reddit['stemmed_title'])

#get_feature_names() was removed in scikit-learn 1.2; use its replacement
#when available and fall back to the old name on older installs
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

#one column per term; toarray() yields a plain ndarray rather than the
#discouraged np.matrix that todense() returns
df_tvec = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
#rows are in the same order as reddit, so the ids line up positionally
df_tvec['post_id'] = reddit.index

#save as a csv
df_tvec.to_csv('../data/title_tfidf_25.csv', index=False)

In [21]:
#TFIDF top 50

#Instantiate TFIDF vectorizer: English stop words removed, accents folded
#to ascii, terms found in more than 60% of titles ignored, vocabulary capped
#at 50 terms, rows l2-normalised
tvec = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    max_features=50,
    max_df=.60,
    norm='l2',
)

#fit the vectorizer and transform the stemmed titles in a single pass
tfidf_matrix = tvec.fit_transform(reddit['stemmed_title'])

#get_feature_names() was removed in scikit-learn 1.2; use its replacement
#when available and fall back to the old name on older installs
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

#one column per term; toarray() yields a plain ndarray rather than the
#discouraged np.matrix that todense() returns
df_tvec = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
#rows are in the same order as reddit, so the ids line up positionally
df_tvec['post_id'] = reddit.index

#save as a csv
df_tvec.to_csv('../data/title_tfidf_50.csv', index=False)

In [22]:
#TFIDF top 100

#Instantiate TFIDF vectorizer: English stop words removed, accents folded
#to ascii, terms found in more than 60% of titles ignored, vocabulary capped
#at 100 terms, rows l2-normalised
tvec = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    max_features=100,
    max_df=.60,
    norm='l2',
)

#fit the vectorizer and transform the stemmed titles in a single pass
tfidf_matrix = tvec.fit_transform(reddit['stemmed_title'])

#get_feature_names() was removed in scikit-learn 1.2; use its replacement
#when available and fall back to the old name on older installs
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

#one column per term; toarray() yields a plain ndarray rather than the
#discouraged np.matrix that todense() returns
df_tvec = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
#rows are in the same order as reddit, so the ids line up positionally
df_tvec['post_id'] = reddit.index

#save as a csv
df_tvec.to_csv('../data/title_tfidf_100.csv', index=False)

In [23]:
#copy the index (the post's unique id) into a regular 'post_id' column so
#it can be used as a merge key
reddit = reddit.assign(post_id=reddit.index)

In [24]:
#removing the EXTREMELY popular posts
# keep a post only when it has fewer than 10,000 comments and fewer than
# 100,000 upvotes
not_extreme = (reddit['comments'] < 10000) & (reddit['upvotes'] < 100000)
reddit = reddit[not_extreme]

In [25]:
# write the cleaned frame to disk; index=False leaves the index out of the
# file (the post id was already copied into the 'post_id' column)
reddit.to_csv('../data/reddit_data_cleaned.csv', index = False)

In [27]:
# display the final dimensions of the frame as (rows, columns)
reddit.shape

(14226, 17)