# Preprocessing data

In [66]:
# Load the scraped submissions into a pandas DataFrame.
# `pd` is imported here because this cell sits above the notebook's import
# cell, and that cell binds pandas under the name `pandas`, not `pd`.
import pandas as pd

# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
df = pd.read_pickle("../submissions_star_wars.data.pickle")
# df = df.iloc[:2000]  # optional: work on a smaller sample while iterating

In [73]:
import time
import pandas as pandas
import string 
import re 
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK resources used by the preprocessing functions below.
nltk.download('stopwords')                   # stopwords.words('english')
nltk.download('wordnet')                     # WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # model behind nltk.pos_tag
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm against the installed version.
nltk.download('punkt')                       # word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fred\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Fred\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fred\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [76]:
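# Preprocessing steps, in the order preprocessing() applies them:
# 1. lowercase transform
# 2. remove urls (this step also strips @mentions and non-alphanumeric characters)
# 3. remove numbers
# 4. remove punctuation
# 5. tokenize words (separate words into tokens)
# 6. remove stop words (optional; currently not applied)
# 7. lemmatize (reduce each token to its base form)
# 8. part-of-speech tagging (optional)
# 9. vectorization (optional; done in a later cell)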

# make all text lowercase
def text_lowercase(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

# remove urls (plus @mentions and every non-alphanumeric character)
# Written as a raw string: the previous plain literal relied on invalid
# escape sequences (\w, \/, \S), which newer Python versions warn about.
_NOISE_PATTERN = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

def remove_urls(text):
    """Strip URLs, @mentions and non-alphanumeric characters from ``text``.

    Despite its name, this does more than drop URLs. Each match of any of the
    following is replaced by a single space:

    * ``@`` followed by ASCII letters/digits (a mention),
    * any single character that is not an ASCII letter, digit, space or tab
      (so punctuation, apostrophes and emoji are removed here as well),
    * a ``scheme://...`` URL, up to the next whitespace.

    Whitespace is then collapsed: the result has single spaces between words
    and no leading or trailing whitespace.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, whitespace-normalised text.
    """
    return ' '.join(_NOISE_PATTERN.sub(' ', text).split())

# remove numbers
def remove_numbers(text):
    """Delete every run of digits from ``text`` and return the remainder."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # Mapping a character to None in a translation table deletes it.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)

# tokenize
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# remove stopwords
# Built once as a set so that each membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return the tokens in ``text`` that are not in ``stop_words``.

    ``text`` is an iterable of tokens; the comparison is exact, so tokens
    should already be lowercased.  # presumably the stop-word list is lowercase -- verify
    """
    return [token for token in text if token not in stop_words]

# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    """Lemmatize every token in ``text`` and return the results as a list.

    NOTE(review): no part-of-speech is passed to the lemmatizer, so it uses
    its default; the notebook's sample output shows "Does" becoming "doe",
    which suggests verbs are not reduced correctly -- confirm whether that
    is acceptable.
    """
    return list(map(lemmatizer.lemmatize, text))

#POS tag
def pos_tag(text):
    """Tokenize the string ``text`` and return NLTK's part-of-speech tags for it."""
    tokens = nltk.word_tokenize(text)
    return nltk.pos_tag(tokens)

# chain all functions
def preprocessing(text, drop_stopwords=False, tag_pos=True):
    """Run the full cleaning pipeline on a single raw text.

    Steps, in order: lowercase, strip URLs/noise, strip digits, strip
    punctuation, tokenize, optionally drop stop words, lemmatize, re-join
    into one string, optionally POS-tag.

    Parameters
    ----------
    text : str
        Raw text (e.g. a post title).
    drop_stopwords : bool, default False
        When True, stop words are removed after tokenizing. They are kept by
        default because removing them can noticeably affect model performance.
    tag_pos : bool, default True
        When True (the original behaviour) the cleaned text is POS-tagged.
        Pass False to get the cleaned text as a plain string, which is the
        form text vectorizers such as ``TfidfVectorizer`` expect.

    Returns
    -------
    list or str
        The output of ``pos_tag`` when ``tag_pos`` is True (token/tag pairs,
        as shown in the notebook output); otherwise the cleaned,
        space-joined string.
    """
    text = text_lowercase(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    tokens = tokenize(text)
    if drop_stopwords:
        tokens = remove_stopwords(tokens)
    tokens = lemmatize(tokens)
    cleaned = ' '.join(tokens)
    if tag_pos:
        # pos_tag() tokenizes its input itself, so it receives the joined string.
        return pos_tag(cleaned)
    return cleaned

In [77]:
N_TITLES = 1000  # how many titles to preprocess (taken from the start of df)
N_PREVIEW = 5    # how many raw/processed pairs to print as a sanity check

start = time.time()

processed_list = []
for position, entry in enumerate(df["title"].iloc[:N_TITLES]):
    # Run the pipeline once per title and reuse the result for both the
    # preview and the collected list (it was previously computed twice).
    processed = preprocessing(entry)
    if position < N_PREVIEW:
        # Only preview the first few entries to avoid flooding the cell output.
        print("Raw sentence         : ", entry)
        print("Preprocessed sentence: ", processed)
    processed_list.append(processed)
stop = time.time()
print("elapsed time (s): ", round(stop-start, 3))

Raw sentence         :  Selling an old star war’s funko smuggler’s bounty box! 👇
Preprocessed sentence:  [('selling', 'VBG'), ('an', 'DT'), ('old', 'JJ'), ('star', 'NN'), ('war', 'NN'), ('s', 'NN'), ('funko', 'NN'), ('smuggler', 'NN'), ('s', 'JJ'), ('bounty', 'NN'), ('box', 'NN')]
Raw sentence         :  This is the way ! (From Empire at war - Battlefront Commander mod)
Preprocessed sentence:  [('this', 'DT'), ('is', 'VBZ'), ('the', 'DT'), ('way', 'NN'), ('from', 'IN'), ('empire', 'NN'), ('at', 'IN'), ('war', 'NN'), ('battlefront', 'NN'), ('commander', 'NN'), ('mod', 'NN')]
Raw sentence         :  Does reddit like this Lego x-wing that I made?
Preprocessed sentence:  [('doe', 'NN'), ('reddit', 'NN'), ('like', 'IN'), ('this', 'DT'), ('lego', 'NN'), ('x', 'VBZ'), ('wing', 'VBG'), ('that', 'IN'), ('i', 'NN'), ('made', 'VBD')]
Raw sentence         :  Been waiting all my life for someone to say you need to wear your mask at work......... Feelin sexy !! 🤣 *** Be advised - No storm troopers w

In [62]:
# optional: vectorize
# TfidfVectorizer needs one string per document. preprocessing() may return
# POS-tagged (token, tag) pairs, so rebuild plain strings from those first.
documents = [
    doc if isinstance(doc, str) else ' '.join(token for token, _tag in doc)
    for doc in processed_list
]

tf = TfidfVectorizer()
fitted_vectorizer = tf.fit(documents)  # fit: learn vocabulary and idf weights
transformed = fitted_vectorizer.transform(documents)  # transform: documents -> tf-idf matrix

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
if hasattr(fitted_vectorizer, "get_feature_names_out"):
    feature_names = fitted_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = fitted_vectorizer.get_feature_names()
print(feature_names)  # unique words in all posts within our column

['advised', 'all', 'an', 'at', 'battlefront', 'be', 'been', 'bounty', 'box', 'commander', 'doe', 'during', 'empire', 'feelin', 'for', 'from', 'funko', 'hurt', 'idea', 'is', 'lego', 'life', 'like', 'made', 'mask', 'mod', 'my', 'need', 'new', 'no', 'of', 'old', 'picture', 'reddit', 'say', 'selling', 'sexy', 'smuggler', 'someone', 'star', 'storm', 'taking', 'that', 'the', 'this', 'to', 'trooper', 'waiting', 'war', 'way', 'wear', 'were', 'wing', 'work', 'you', 'your']


In [64]:
print(transformed) 
# Each printed row reads: (document/entry index, word id)  tf-idf weight
# TF-IDF = term frequency - inverse document frequency. It scales a word's frequency within one document by how rare that word is across all documents in the collection. Without the inverse-document-frequency part, a word that simply appears many times in a document would be treated as important even if it appears in nearly every document.

(0, 48)	0.2743035641495426
  (0, 39)	0.339992197464673
  (0, 37)	0.339992197464673
  (0, 35)	0.339992197464673
  (0, 31)	0.339992197464673
  (0, 16)	0.339992197464673
  (0, 8)	0.339992197464673
  (0, 7)	0.339992197464673
  (0, 2)	0.339992197464673
  (1, 49)	0.3261421451398157
  (1, 48)	0.26312942914086856
  (1, 44)	0.21842119564089313
  (1, 43)	0.26312942914086856
  (1, 25)	0.3261421451398157
  (1, 19)	0.3261421451398157
  (1, 15)	0.3261421451398157
  (1, 12)	0.3261421451398157
  (1, 9)	0.3261421451398157
  (1, 4)	0.3261421451398157
  (1, 3)	0.26312942914086856
  (2, 52)	0.3664082043739687
  (2, 44)	0.24538784479290737
  (2, 42)	0.3664082043739687
  (2, 33)	0.3664082043739687
  (2, 23)	0.3664082043739687
  :	:
  (3, 44)	0.11702533994685393
  (3, 43)	0.14097904191431268
  (3, 41)	0.1747398886540911
  (3, 40)	0.1747398886540911
  (3, 38)	0.1747398886540911
  (3, 36)	0.1747398886540911
  (3, 34)	0.1747398886540911
  (3, 32)	0.1747398886540911
  (3, 30)	0.1747398886540911
  (3, 29)	0.17473