# IMDB 5000 Movie Dataset Storyline Text Processing

================================================================================================================================

**AUTHOR**: Mengshan Jin

**CREATION DATE**: 08/01/2017

================================================================================================================================


**PROGRAM DESCRIPTION**: Text processing on storyline

**INPUT DATASETS**: 01_Data/Outputs/imdb_with_storyline.csv

**OUTPUT DATASETS**: 01_Data/Outputs/cleaned_tokens.txt


================================================================================================================================
**PROGRAM CHANGE HISTORY**

Date|Author|Change|
----|------|------|

# Section 0: Import packages

In [12]:
# Data structure
import numpy as np
import pandas as pd
# pd.options.mode.chained_assignment = None  # default='warn'

# utilities
from collections import Counter
import re
import string
from itertools import groupby, chain
import dill
import pickle

# nltk
from nltk.tokenize import word_tokenize, MWETokenizer
from nltk.tag import StanfordNERTagger
from nltk.corpus import stopwords

# Section 1: Read data

In [2]:
imdb_with_storyline = pd.read_csv("../01_Data/Outputs/imdb_with_storyline.csv", encoding="utf-8")

In [5]:
imdb_with_storyline.shape

(5043, 29)

In [3]:
imdb_with_storyline.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,storyline
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,"When his brother is killed in a robbery, parap..."
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,"After Elizabeth, Will, and Captain Barbossa re..."
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,A cryptic message from the past sends James Bo...
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,Despite his tarnished reputation after the eve...
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,


In [6]:
imdb_with_storyline = imdb_with_storyline.loc[imdb_with_storyline['title_year'] > 1980]

In [7]:
imdb_with_storyline.shape

(4650, 29)

In [8]:
imdb_with_storyline['storyline'].isnull().sum()

2

# Section 2: Raw frequency distribution

### Utility functions

In [173]:
def freq_count(list_tokens):
    """Count how often each token occurs and return the counts as a table.

    Parameters
    ----------
    list_tokens : iterable of hashable
        The tokens to count (typically a flat list of strings).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Token`` and ``Count``, one row per distinct token,
        ordered from most to least frequent, with a default 0..n-1 index.
        Empty input gives an empty frame that still has both columns.
    """
    # Build the frame directly with its final column names.  The earlier
    # set_index / ``del index.name`` / reset_index round trip fails on
    # current pandas (Index.name cannot be deleted) and raised a KeyError
    # when there were no tokens at all.
    return pd.DataFrame(Counter(list_tokens).most_common(), columns=['Token', 'Count'])

In [184]:
def get_weird_tokens(freq_count_res):
    """Return the frequent tokens that are not purely alphanumeric.

    Parameters
    ----------
    freq_count_res : pandas.DataFrame
        A frame with a string column ``Token`` and a numeric column ``Count``.

    Returns
    -------
    pandas.DataFrame
        The rows whose token fails ``str.isalnum()`` and whose count is
        greater than 5, in their original order and with their original index.
    """
    not_alphanumeric = ~freq_count_res['Token'].str.isalnum()
    seen_often = freq_count_res['Count'] > 5
    return freq_count_res.loc[not_alphanumeric & seen_often]

### Step 0: tokenization

In [225]:
# a list of text
term_list = imdb_with_storyline['storyline'].tolist()

In [226]:
# Build one token list per non-missing storyline (missing storylines are
# skipped, so this list can be shorter than term_list).
# For every token produced by word_tokenize:
#   * it is dropped when `item in string.punctuation` is true.  That is a
#     substring test: it removes single punctuation marks, but multi-character
#     tokens such as "..." or "--" are kept;
#   * otherwise it is lower-cased and leading/trailing apostrophes are
#     stripped, so "'s" becomes "s" and a token made only of apostrophes
#     becomes the empty string "".
# NOTE(review): the empty token in the frequency table below presumably comes
# from closing quote tokens emitted by word_tokenize -- confirm.
token_list = [[item.lower().strip("'") for item in word_tokenize(storyline) if item not in string.punctuation] 
              for storyline in term_list if pd.notnull(storyline)]

In [227]:
len(token_list)
# 2 of the 4650 storylines are missing, so the expected length is 4650 - 2 = 4648

4648

In [228]:
tokens_orig = [item for sublist in token_list for item in sublist]

In [229]:
raw_count = freq_count(tokens_orig)

In [230]:
get_weird_tokens(raw_count).head(20)

Unnamed: 0,Token,Count
41,``,966
42,,962
50,...,819
57,n't,734
168,--,255
252,dr.,176
536,u.s.,87
571,mr.,82
1471,l.a.,33
1723,year-old,28


### Step 1: remove stopwords

In [231]:
stop_list = stopwords.words('english') + ["'s", "n't", "``", "", "...", "--"]

In [232]:
# a list of tokens with stopwords removed
tokens = [item for sublist in token_list for item in sublist if item not in stop_list]

In [233]:
raw_count = freq_count(tokens)

In [82]:
# with open("../01_Data/Outputs/non-alpha-numeric tokens.txt", "w") as thefile:
#     for item in raw_count.loc[~raw_count['Token'].str.isalnum()]['Token'].tolist():
#         thefile.write("%s\n" % item.encode("utf-8"))

In [234]:
get_weird_tokens(raw_count).head(20)

Unnamed: 0,Token,Count
154,dr.,176
428,u.s.,87
462,mr.,82
1352,l.a.,33
1602,year-old,28
1606,mrs.,28
1613,high-school,28
1664,jr.,27
1685,ex-wife,27
1932,ex-con,23


### Step 2: handle x-year-old

In [98]:
def _build_numwords():
    """Return the default ``word -> (scale, increment)`` table used by text2int."""
    units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    # The two leading empty strings only keep "twenty" at index 2 (2 * 10).
    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
    scales = ["hundred", "thousand", "million", "billion", "trillion"]

    table = {"and": (1, 0)}  # "and" is a filler word: it changes nothing
    for idx, word in enumerate(units):
        table[word] = (1, idx)
    for idx, word in enumerate(tens):
        if word:  # skip the placeholder slots; split() never yields ""
            table[word] = (1, idx * 10)
    for idx, word in enumerate(scales):
        # idx 0 -> 10**2 (hundred); idx k > 0 -> 10**(3k) (thousand, million, ...)
        table[word] = (10 ** (idx * 3 or 2), 0)
    return table


# Built once at definition time instead of being cached in a mutable default
# argument.
_DEFAULT_NUMWORDS = _build_numwords()


def text2int(textnum, numwords=None):
    """Convert a whitespace-separated English number phrase to an integer.

    Parameters
    ----------
    textnum : str
        Lower-case number words separated by whitespace, e.g.
        ``"twenty five"`` or ``"one hundred and five"``.
    numwords : dict, optional
        Mapping ``word -> (scale, increment)``.  When omitted, the built-in
        English table is used.  For backward compatibility an empty dict
        passed by the caller is filled in place with that table.

    Returns
    -------
    int
        The value of the phrase; an empty or whitespace-only phrase gives 0.

    Raises
    ------
    ValueError
        If a word is not present in the lookup table.
    """
    if numwords is None:
        numwords = _DEFAULT_NUMWORDS
    elif not numwords:
        numwords.update(_DEFAULT_NUMWORDS)

    current = result = 0
    for word in textnum.split():
        if word not in numwords:
            raise ValueError("Illegal word: " + word)
        scale, increment = numwords[word]
        current = current * scale + increment
        if scale > 100:
            # thousand / million / ...: the group is complete, bank it
            result += current
            current = 0

    return result + current

In [165]:
def summarize_x_year_old(x_year_old):
    """Map an "<age>-year-old" token to a coarse life-stage label.

    The text before "-year-old" is read as an age, first as digits and then,
    if that fails, as number words via ``text2int`` (hyphens are treated as
    spaces, so "twenty-five" is read as "twenty five").

    Returns one of "childhood" (< 12), "adolescence" (< 21),
    "early-adulthood" (< 36), "midlife" (< 51) or "late-adulthood".
    If the age cannot be parsed, the token is returned unchanged.
    """
    age_part = re.split(r'-year-old', x_year_old)[0]
    age_text = re.sub('-', ' ', age_part)

    try:
        age = int(age_text)
    except ValueError:
        try:
            age = text2int(age_text)
        except Exception:
            # not a recognisable age: leave the token as it is
            return x_year_old

    # (exclusive upper bound, label), checked in ascending order
    brackets = (
        (12, "childhood"),
        (21, "adolescence"),
        (36, "early-adulthood"),
        (51, "midlife"),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return "late-adulthood"

In [236]:
# a list of tokens with x-year-old fixed
tokens_clean_years = [summarize_x_year_old(x) if x.find("-year-old") != -1 else x for x in tokens]

In [237]:
raw_count = freq_count(tokens_clean_years)

In [238]:
get_weird_tokens(raw_count).head(20)

Unnamed: 0,Token,Count
154,dr.,176
429,u.s.,87
463,mr.,82
1353,l.a.,33
1603,year-old,28
1607,mrs.,28
1614,high-school,28
1665,jr.,27
1686,ex-wife,27
1933,ex-con,23


### Step 3: substitute synonyms
**Caution: The following dictionary *synonyms* is not a complete list!**

In [255]:
# Token -> replacement.  A replacement containing spaces is later split into
# separate words; a hyphenated replacement stays a single token.
synonyms = {
    'dr.':'doctor',
    'u.s.':'united-states',
    "'ll":"will",
    "'ve":"have",
    "'d":"would",
    'l.a.':'los-angeles',
    'year-old':'year old',
    'jr.':'junior',
    'small-town':'small town',
    'real-life':'real life',
    'co-worker':'colleague',
    'a.k.a':'also known as',
    'st.':'street',
    'would-be':'would be',
    'long-time':'long time',
    'self-defense':'self defense',
    'co-workers':'colleague',
    'modern-day':'modern',
    'd.c.':'washington-dc',
    'top-secret':'top secret',
    'next-door':'neighbor',
    'face-to-face':'face to face',
    'mild-mannered':'mild manner',
    'part-time':'part time',
    'self-discovery':'self discovery',
    'one-time':'one time',
    'high-speed':'high speed',
    'high-tech':'high technology',
    'so-called':'so called',
    'new-found':'newly found',
    'long-term':'long term',
    # same single token as 'u.s.' so both spellings are counted together
    'u.s':'united-states',
    'sci-fi':'science fiction',
    'up-and-coming':'upcoming',
    'soon-to-be':'soon to be',
    'all-out':'all out',
    'one-night':'one night',
    'best-selling':'best selling',
    '3-d':'3 dimensions',
    'husband-and-wife':'husband and wife',
    'wedding-plans':'wedding plans',
    'salesman/font':'salesman',
    # use the same life-stage labels that summarize_x_year_old emits
    'mid-twenty-year-old':'early-adulthood',
    'ten-and-a-half-year-old':'childhood'
}

In [256]:
# replace every token that has an entry in `synonyms`; all other tokens pass
# through unchanged (one dict lookup per token instead of a test plus a lookup)
tokens_sub_synonyms = [synonyms.get(x, x) for x in tokens_clean_years]

In [257]:
raw_count = freq_count(tokens_sub_synonyms)

In [258]:
get_weird_tokens(raw_count).head(20)

Unnamed: 0,Token,Count
429,united-states,87
464,mr.,82
1373,los-angeles,33
1606,mrs.,28
1613,high-school,28
1620,year old,28
1685,ex-wife,27
1933,ex-con,23
2457,small town,18
2597,real life,17


### Step 4: remove words with only one letter and a dot at the end

In [259]:
# drop tokens that are exactly one lower-case letter followed by a dot, e.g. "j."
tokens_remove_dot = [x for x in tokens_sub_synonyms if re.match(r"[a-z]\.$", x) is None]

In [260]:
raw_count = freq_count(tokens_remove_dot)

In [261]:
get_weird_tokens(raw_count).head(20)

Unnamed: 0,Token,Count
429,united-states,87
464,mr.,82
1373,los-angeles,33
1606,mrs.,28
1613,high-school,28
1620,year old,28
1685,ex-wife,27
1933,ex-con,23
2457,small town,18
2597,real life,17


### Step 5: split multi-word replacements into single words (stopwords are removed once more afterwards, inside `custom_tokenizer` below)

In [265]:
tokens_split = [x for item in tokens_remove_dot for x in item.split(" ")]

In [266]:
raw_count = freq_count(tokens_split)

In [267]:
get_weird_tokens(raw_count).head(20)

Unnamed: 0,Token,Count
429,united-states,87
466,mr.,82
1378,los-angeles,33
1608,mrs.,28
1615,high-school,28
1686,ex-wife,27
1934,ex-con,23
2813,middle-aged,16
3058,african-american,14
3068,spider-man,14


### Wrap up everything into a custom tokenization function

In [268]:
def custom_tokenizer(storyline_list, stop_list, synonyms):
    """Turn raw storylines into one flat list of cleaned tokens.

    Each non-missing storyline is tokenized with ``word_tokenize`` and every
    token then goes through the same steps, in this order:

    1. drop it if it is found in ``string.punctuation``;
    2. lower-case it and strip leading/trailing apostrophes;
    3. drop it if it is a stopword;
    4. if it contains "-year-old", replace it via ``summarize_x_year_old``;
    5. replace it with its entry in ``synonyms``, if there is one;
    6. drop it if it is a single lower-case letter followed by a dot;
    7. split it on spaces (replacements may be several words) and drop any
       resulting word that is a stopword.

    Parameters
    ----------
    storyline_list : iterable
        Storyline texts; missing values (None / NaN) are skipped.
    stop_list : iterable of str
        Tokens to discard.
    synonyms : dict
        Mapping from token to replacement text.

    Returns
    -------
    list of str
        The cleaned tokens of all storylines, in reading order.
    """
    # Hashed lookup: the stop list is consulted for every token, twice.
    stop_words = frozenset(stop_list)
    # Compiled once; used with match(), so it is anchored at the token start.
    letter_dot = re.compile(r"[a-z]\.$")

    cleaned = []
    for storyline in storyline_list:
        if pd.isnull(storyline):
            continue
        for raw in word_tokenize(storyline):
            if raw in string.punctuation:
                continue
            token = raw.lower().strip("'")
            if token in stop_words:
                continue
            if "-year-old" in token:
                token = summarize_x_year_old(token)
            token = synonyms.get(token, token)
            if letter_dot.match(token) is not None:
                continue
            cleaned.extend(word for word in token.split(" ") if word not in stop_words)
    return cleaned

In [269]:
tokens = custom_tokenizer(imdb_with_storyline['storyline'].tolist(), stop_list, synonyms)

In [273]:
import io  # local import: io.open accepts `encoding` on both Python 2 and 3

# Write one token per line as UTF-8 text.  The file object does the encoding;
# formatting `item.encode("utf-8")` with %s would write the bytes repr
# ("b'token'") on Python 3, and the plain open() used the platform's default
# encoding.
with io.open("../01_Data/Outputs/cleaned_tokens.txt", "w", encoding="utf-8") as thefile:
    thefile.writelines(u"%s\n" % item for item in tokens)