#  Project 3: Web APIs & NLP

## Collecting and Preprocessing

### Using Pushshift's API, collecting posts from two subreddits 'cats' and 'dogs'

In [1]:
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time 
import warnings
import nltk

from pandas.io.parquet import to_parquet
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Subreddits to collect; pull_subreddit iterates over this module-level list.
topics = ["cats", "dogs"]

In [3]:
def pull_subreddit(base_url, num_of_pulls, subreddits=None, pause_seconds=0.0, timeout=30):
    """Page backwards through a search endpoint and collect rows per subreddit.

    For every subreddit, up to ``num_of_pulls`` requests of 500 rows each are
    issued. After each page, the smallest ``created_utc`` seen is sent as the
    ``before`` parameter of the next request, so each page is older than the
    previous one.

    Parameters
    ----------
    base_url : str
        Endpoint to query (called with ``subreddit``, ``size`` and ``before``).
    num_of_pulls : int
        Maximum number of pages requested per subreddit.
    subreddits : iterable of str, optional
        Subreddits to query. Defaults to the module-level ``topics`` list.
    pause_seconds : float, optional
        Seconds to sleep after each successful page. Defaults to 0 (no pause).
    timeout : float, optional
        Per-request timeout in seconds, passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        All pages concatenated (original per-page indexes are kept). An empty
        DataFrame is returned when no page contained any rows.

    Raises
    ------
    requests.exceptions.HTTPError
        If a response status code is anything other than 200.
    """
    if subreddits is None:
        subreddits = topics

    frames = []
    for subreddit in subreddits:
        params = {'subreddit': subreddit, 'size': 500}
        earliest_utc = None

        for _ in range(num_of_pulls):
            # Only send `before` once a previous page has given us a cutoff
            if earliest_utc is not None:
                params['before'] = earliest_utc

            res = requests.get(base_url, params=params, timeout=timeout)
            # Explicit check instead of `assert`, which is removed under `python -O`
            if res.status_code != 200:
                raise requests.exceptions.HTTPError(
                    f'{base_url} returned status {res.status_code} for subreddit {subreddit!r}',
                    response=res,
                )

            posts = res.json().get('data', [])
            if not posts:
                # No older rows left for this subreddit
                break

            page = pd.DataFrame(posts)
            earliest_utc = page['created_utc'].min()
            frames.append(page)

            if pause_seconds > 0:
                time.sleep(pause_seconds)

    # pd.concat raises on an empty list, so handle "nothing collected" explicitly
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [4]:
# Pull submissions (7 pages per subreddit) and keep one row per distinct title
submissions_url = 'https://api.pushshift.io/reddit/search/submission'
df_submissions = pull_subreddit(submissions_url, 7).drop_duplicates(subset='title')

### Submissions Preprocessing

In [5]:
# Keep only the columns used downstream
df_submissions = df_submissions[['title', 'selftext', 'subreddit']]

# Drop rows whose self-text is the '[removed]' placeholder; .copy() gives an
# independent frame so the column assignments below never act on a view
df_submissions = df_submissions[df_submissions['selftext'] != '[removed]'].copy()

# Change nulls to a blank because we don't want to lose the title on that row.
# Assigning the result back is used instead of `inplace=True` on a selected
# column, which does not reliably update the parent frame.
df_submissions['selftext'] = df_submissions['selftext'].fillna(" ")

# Put the title and selftext columns together in one column
df_submissions['text'] = df_submissions['title'] + " " + df_submissions['selftext']

In [6]:
# Give the submissions frame a fresh 0..n-1 index, discarding the old one
df_submissions = df_submissions.reset_index(drop=True)

In [7]:
# The pulled data can contain subreddits other than cats and dogs
print(f'subreddits we have in our data before cleaning the data: {df_submissions.subreddit.unique()}')

# Keep only rows whose subreddit is exactly 'cats' or 'dogs'
is_target_subreddit = df_submissions['subreddit'].isin(['cats', 'dogs'])
df_submissions = df_submissions[is_target_subreddit]

print(f'subreddits we have in our data after cleaning the data: {df_submissions.subreddit.unique()}')

subreddits we have in our data before cleaning the data: ['cats' 'u_Bottle-of-cats' 'dogs' 'u_wobble-dogs']
subreddits we have in our data after cleaning the data: ['cats' 'dogs']


In [8]:
# Check if we have rows where both title and selftext are blank.
# Values are stripped first so the " " placeholder used for missing selftext
# counts as blank, not only the exact empty string.
blank_title = df_submissions['title'].str.strip().eq('')
blank_selftext = df_submissions['selftext'].str.strip().eq('')
df_submissions[blank_title & blank_selftext]

Unnamed: 0,title,selftext,subreddit,text


In [9]:
# Submissions reduced to the combined text and its subreddit label
cat_dog_submissions = pd.DataFrame(df_submissions.loc[:, ['text', 'subreddit']])

### Using Pushshift's API, collecting comments from the two subreddits 'cats' and 'dogs'

In [10]:
# Pull comments (7 pages per subreddit) and keep one row per distinct body
comments_url = 'https://api.pushshift.io/reddit/search/comment'
df_comments = pull_subreddit(comments_url, 7).drop_duplicates(subset='body')

### Comments Preprocessing

In [11]:
# Keep only the comment text and its subreddit label
df_comments = df_comments[['body', 'subreddit']]

# Remove the "removed" texts
df_comments = df_comments[df_comments['body'] != '[removed]']

# Keep only the two target subreddits, mirroring the filter applied to the
# submissions, so the label column cannot carry stray classes
df_comments = df_comments[df_comments['subreddit'].isin(['cats', 'dogs'])]

# Reset indexes in the comments dataframe
df_comments = df_comments.reset_index(drop=True)

# Null count per column; printed explicitly because a notebook cell only
# displays its last expression
print(df_comments.isna().sum())

# Rows whose body is empty or whitespace-only (an empty frame means none)
df_comments[df_comments['body'].str.strip().eq('')]

Unnamed: 0,body,subreddit


### Join submissions and comments

In [12]:
# Align the comments' text column name with the submissions frame
df_comments = df_comments.rename(columns={'body': 'text'})

In [13]:
# Stack submissions on top of comments and renumber the rows 0..n-1
df = pd.concat([cat_dog_submissions, df_comments], axis=0, ignore_index=True)

In [14]:
# Persist the combined text/label data without the index column
df.to_csv(path_or_buf='./data/cat_dog.csv', index=False)

### Vectorizing the data

In [15]:
# Make a DataFrame of the vectorized data for visualization

# Count unigrams and bigrams
cvec = CountVectorizer(ngram_range=(1, 2))

X = df['text']
y = df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Learn the vocabulary from the training split only, then reuse it for the test split
X_train = cvec.fit_transform(X_train)
X_test = cvec.transform(X_test)

# toarray() returns a plain ndarray (todense() returns the legacy np.matrix).
# get_feature_names_out() is the accessor the later cells already use; the
# older get_feature_names() is no longer available in current scikit-learn.
vec_X_train = pd.DataFrame(X_train.toarray(),
                           columns=cvec.get_feature_names_out())
vec_X_train.head()



Unnamed: 0,00,00 13,00 14,00 200,00 300,00 31,00 days,00 during,00 in,00 month,...,القطط,القطط جملة,انظر,انظر شكل,بعد,بعد أسابيع,جملة,جملة بعد,شكل,شكل القطط
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# A list of cat and dog food brands (lower-case; the duplicate 'pedigree' entry was removed)
food_brands = [
    "hill's science diet", 'royal canin', 'purina', 'purina pro plan', 'blue buffalo', 'iams', 'orijen', 'acana',
    'taste of the wild', 'wellness', 'merrick', 'fromm', 'nutro', "nature's variety", 'canidae', 'natural balance',
    'diamond naturals', 'diamond', 'pedigree', 'eukanuba', 'wellness core', 'nutro ultra', 'mars petcare', 'mars',
    'just food for dogs', 'nestle', 'avoderm', 'advantage ii', 'advantage', 'against the grain', 'alzoo', 'api',
    'fancy feast', 'meow mix', 'reveal', 'tiny tiger', 'american journey', 'solid gold', 'earthborn holistic',
    'instinct', 'sportmix', 'kitten chow', 'tiki cat', 'tiki', 'applaws', 'authority', 'simply nourish',
]

# Set copy so each membership test below is a hash lookup instead of a list scan
food_brand_set = set(food_brands)

# NOTE(review): the vectorizer was built with ngram_range=(1, 2), so brand
# names of three or more words cannot appear among the features - confirm
# whether that is intended.
feature_names = cvec.get_feature_names_out()
brand_columns = [feature for feature in feature_names if feature.lower() in food_brand_set]

### Removing non-English words from vectorized data

In [18]:
# Create a list of English words using the nltk library
nltk.download('words')  # download the English word list
english_words = list(nltk.corpus.words.words())
# Brand names are added so they survive the English-only filter below
english_words.extend(food_brands)

# Set copy of the vocabulary: every feature is tested for membership, and
# scanning the full list once per feature is needlessly slow. Results are
# identical to testing against the list.
english_word_set = set(english_words)

# Get all the words in the data
feature_names = cvec.get_feature_names_out()

# Removing words that are not in English
# NOTE(review): two-word features only pass if the exact phrase is in the
# vocabulary (e.g. a brand name) - presumably most bigrams are dropped here; confirm.
english_columns = [feature for feature in feature_names if feature.lower() in english_word_set]

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\kavia\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [19]:
# Keep only the vectorized columns whose feature name passed the English filter
english_feature_index = vec_X_train.columns.intersection(english_columns)
X_train_en = vec_X_train[english_feature_index]

In [24]:
# Persist the English-only vectorized training data without the index column
X_train_en.to_csv(path_or_buf='./data/cleaned_vectorized_catdog.csv', index=False)

### Stemming

In [29]:
#  A function for text stemming

def stem_words(text):
    """Lower-case, tokenize, drop English stop words and Porter-stem a string.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The stemmed tokens that are not English stop words, joined by single
        spaces. An input with no word characters yields an empty string.
    """
    # r'\w+' keeps runs of word characters, so punctuation is discarded
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text.lower())

    # Initialize the Porter stemmer
    stemmer = PorterStemmer()

    # Fetch the stop-word list once per call and keep it as a set, instead of
    # calling stopwords.words('english') again for every token
    english_stopwords = set(stopwords.words('english'))

    # Stem the remaining words and join them back into a string
    stemmed_words = [stemmer.stem(word) for word in words if word not in english_stopwords]
    return " ".join(stemmed_words)