In [None]:
# imports
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used further down: the "punkt" tokenizer data
# (presumably what word_tokenize relies on) and the "stopwords" corpus read by
# stopwords.words('english').
# NOTE(review): newer NLTK releases may look for "punkt_tab" rather than
# "punkt" when tokenizing — confirm against the installed version.
nltk.download('punkt')
nltk.download("stopwords")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the raw training file. It has no header row and its fields are
# delimited by the literal ":::"; a multi-character separator is only
# supported by the (slower) Python parsing engine, hence engine='python'.
imdb_train = pd.read_table(
    "train_data.txt",
    sep=":::",
    header=None,
    engine='python',
)

In [None]:
# Give the positional columns readable names, then index the frame by "id".
imdb_train = (
    imdb_train
    .rename(columns={0: "id", 1: "title", 2: "genre", 3: "description"})
    .set_index('id')
)

Pre-Processing

In [None]:
# split the raw title into the title text and its release year
def get_year(row):
  """Extract the release year from the trailing "(YYYY)" group of a raw title.

  The year is located by parsing the last parenthesised group instead of
  slicing at fixed offsets, so the result no longer depends on the exact
  amount of trailing whitespace or on a disambiguation suffix such as
  "(2009/I)".

  Args:
    row: Mapping-like record (e.g. a DataFrame row) whose "title" entry is a
      string such as " Oscar et la dame rose (2009) ".

  Returns:
    The four-character year as a string ("????" is passed through when the
    title carries that placeholder). "????" is also returned when the title
    does not end in a recognisable year group.
  """
  trimmed = row["title"].rstrip()
  open_idx = trimmed.rfind("(")
  if open_idx != -1 and trimmed.endswith(")"):
    # Keep only the part before an optional "/I"-style disambiguator.
    candidate = trimmed[open_idx + 1:-1].split("/")[0]
    if len(candidate) == 4 and (candidate.isdigit() or candidate == "????"):
      return candidate
  # No usable year group: report the same placeholder the data uses.
  return "????"


def remove_year(row):
  """Return the raw title with its trailing "(YYYY)" year group removed.

  The cut point is the opening parenthesis of the trailing year group, not a
  fixed seven-character offset, so suffixes such as "(2009/I)" and titles
  without a trailing space are handled. Titles that do not end in a
  recognisable year group are returned unchanged instead of being truncated.

  Args:
    row: Mapping-like record (e.g. a DataFrame row) whose "title" entry is a
      string such as " Oscar et la dame rose (2009) ".

  Returns:
    Everything before the year group, with surrounding whitespace left as in
    the input (e.g. " Oscar et la dame rose "), or the original title when no
    year group is found.
  """
  title = row["title"]
  trimmed = title.rstrip()
  open_idx = trimmed.rfind("(")
  if open_idx != -1 and trimmed.endswith(")"):
    # Ignore an optional "/I"-style disambiguator when validating the year.
    candidate = trimmed[open_idx + 1:-1].split("/")[0]
    if len(candidate) == 4 and (candidate.isdigit() or candidate == "????"):
      return title[:open_idx]
  # Nothing to strip: leave the title as it is.
  return title

In [None]:
# Derive the "year" column from the raw title. This has to run before the
# title column is overwritten with the year removed, because get_year reads
# the year out of the title text.
imdb_train["year"] = imdb_train.apply(get_year, axis=1)

In [None]:
# Overwrite the raw title column in place with the year-free title; the
# "year" column has to be derived before this cell runs.
imdb_train["title"] = imdb_train.apply(remove_year, axis=1)

NLP Pre-Processing

In [None]:
def remove_punctuation(row):
  """Return the row's "description" with a fixed set of punctuation removed.

  The characters . , ! ? ( ) $ % & * are deleted; every other character
  (including apostrophes, hyphens and whitespace) is left untouched.
  """
  # A translation table that maps each listed character to "deleted".
  deletion_table = str.maketrans("", "", ".,!?()$%&*")
  return row["description"].translate(deletion_table)

def to_lower(row):
  """Return the row's "processed_description" text converted to lower case."""
  text = row["processed_description"]
  return text.lower()

def tokenize(row):
  """Split the row's "processed_description" text into tokens with NLTK's word_tokenize."""
  text = row["processed_description"]
  tokens = word_tokenize(text)
  return tokens

In [None]:
# Build "processed_description" in three passes: strip punctuation, lower-case,
# then word-tokenize. Order matters: the first pass reads "description", and
# each later pass reads the output the previous pass stored in the column.
for step in (remove_punctuation, to_lower, tokenize):
  imdb_train["processed_description"] = imdb_train.apply(step, axis=1)

In [None]:
# English stop words held in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

def remove_stop_words(row):
  """Return the row's "processed_description" tokens that are not English stop words."""
  kept = []
  for token in row["processed_description"]:
    if token not in stop_words:
      kept.append(token)
  return kept

In [None]:
# Replace each row's token list with the same list minus English stop words.
imdb_train["processed_description"] = imdb_train.apply(remove_stop_words, axis=1)

Stemming

Stemming reduces each word to its stem so that, for example, "conversation" and "conversations" are both treated as the same word.

In [None]:
from nltk.stem.porter import PorterStemmer
# One shared stemmer instance, reused for every token.
porter_stemmer = PorterStemmer()

def stemming(text):
  """Return a list holding the Porter stem of every token in ``text``."""
  return list(map(porter_stemmer.stem, text))

# Store the stemmed tokens in their own column; "processed_description" keeps the unstemmed tokens.
imdb_train['description_stemmed'] = imdb_train['processed_description'].apply(stemming)

In [None]:
# Show the frame with the derived year, processed_description and
# description_stemmed columns (the rendered output elides the middle rows).
imdb_train

Unnamed: 0_level_0,title,genre,description,year,processed_description,description_stemmed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Oscar et la dame rose,drama,Listening in to a conversation between his do...,2009,"[listening, conversation, doctor, parents, 10-...","[listen, convers, doctor, parent, 10-year-old,..."
2,Cupid,thriller,A brother and sister with a past incestuous r...,1997,"[brother, sister, past, incestuous, relationsh...","[brother, sister, past, incestu, relationship,..."
3,"Young, Wild and Wonderful",adult,As the bus empties the students for their fie...,1980,"[bus, empties, students, field, trip, museum, ...","[bu, empti, student, field, trip, museum, natu..."
4,The Secret Sin,drama,To help their unemployed father make ends mee...,1915,"[help, unemployed, father, make, ends, meet, e...","[help, unemploy, father, make, end, meet, edit..."
5,The Unrecovered,drama,The film's title refers not only to the un-re...,2007,"[film, 's, title, refers, un-recovered, bodies...","[film, 's, titl, refer, un-recov, bodi, ground..."
...,...,...,...,...,...,...
54210,"""Bonino""",comedy,This short-lived NBC live sitcom centered on ...,1953,"[short-lived, nbc, live, sitcom, centered, bon...","[short-liv, nbc, live, sitcom, center, bonino,..."
54211,Dead Girls Don't Cry,horror,The NEXT Generation of EXPLOITATION. The sist...,????,"[next, generation, exploitation, sisters, kapa...","[next, gener, exploit, sister, kapa, -, bay, s..."
54212,Ronald Goedemondt: Ze bestaan echt,documentary,"Ze bestaan echt, is a stand-up comedy about g...",2008,"[ze, bestaan, echt, stand-up, comedy, growing,...","[ze, bestaan, echt, stand-up, comedi, grow, fa..."
54213,Make Your Own Bed,comedy,Walter and Vivian live in the country and hav...,1944,"[walter, vivian, live, country, difficult, tim...","[walter, vivian, live, countri, difficult, tim..."
