# Text Cleaning and Preprocessing

### Import Modules

In [11]:
import pandas as pd
import numpy as np
from pprint import pprint

import re
import pickle
import requests

from sklearn import utils
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

### Read Data into Pandas Table

Data source: https://www.kaggle.com/utathya/imdb-review-dataset



In [12]:
# Load the raw IMDB review table and discard the leftover CSV index column.
df = (
    pd.read_csv('../Data/imdb_master.csv', encoding="ISO-8859-1")
    .drop(columns='Unnamed: 0')
)

df.head()

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


### Create Train Table

I will filter the dataframe so that I am only working with the train data, and then reset the index.

In [13]:
# Keep only the rows whose 'type' is 'train' and renumber them from 0.
# Doing this in a single expression (instead of in-place reset_index/drop on
# the filtered slice) means no slice of `df` is mutated in place, and
# reset_index(drop=True) discards the old index rather than adding it as an
# 'index' column that then has to be dropped.
df_train = df[df['type'] == 'train'].reset_index(drop=True)
df_train.head()

Unnamed: 0,type,review,label,file
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt
4,train,When I was little my parents took me along to ...,neg,10003_1.txt


### Check for Datatype, Unique Values and Nulls

In [14]:
def dfcheck(data):
    """Profile the columns of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with four fields: ``columns`` (the
        column name), ``datatype`` (its dtype), ``unique`` (number of distinct
        values) and ``nulls`` (number of null entries).
    """
    column_names = data.columns.tolist()
    summary = pd.DataFrame({
        'columns': column_names,
        'datatype': data.dtypes.tolist(),
        # number of distinct values per column
        'unique': [len(data[name].unique()) for name in column_names],
        # null counts per column
        'nulls': data.isnull().sum().tolist(),
    })
    return summary

dfcheck(df_train)

Unnamed: 0,columns,datatype,unique,nulls
0,type,object,1,0
1,review,object,74057,0
2,label,object,3,0
3,file,object,75000,0


### Drop Rows with 'unsup' in the label column

There are 50000 reviews with no sentiment label. I will drop these.

In [15]:
# Keep only the reviews that carry a sentiment label.
# .copy() makes df_train an independent frame rather than a boolean-mask
# slice of the previous one, so the in-place edits made in later cells
# (dropping duplicates, overwriting the 'review' column) act on this frame
# unambiguously.
df_train = df_train[df_train['label'] != 'unsup'].copy()

df_train['label'].value_counts()

pos    12500
neg    12500
Name: label, dtype: int64

### Data Dictionary

In [16]:
# Describe the two columns used downstream.
label_info = {
    'type': df_train['label'].dtype,
    'description': 'sentiment class - pos = positive, neg = negative',
}
review_info = {
    'type': df_train['review'].dtype,
    'description': 'movie review',
    'unique': f'{len(df_train.review.unique())} unique values',
}

# NOTE(review): the braces around df_train.shape build a one-element *set*
# holding the shape tuple, not a nested dict like the other entries -
# confirm this is intended.
data_dict = {
    'label': label_info,
    'review': review_info,
    'dataset_shape': {df_train.shape},
}

pprint(data_dict)

{'dataset_shape': {(25000, 4)},
 'label': {'description': 'sentiment class - pos = positive, neg = negative',
           'type': dtype('O')},
 'review': {'description': 'movie review',
            'type': dtype('O'),
            'unique': '24904 unique values'}}


There are 96 rows that duplicate another review (25,000 rows but only 24,904 unique review texts).  I will keep the first occurrence of each review and drop the rest.

In [17]:
# Keep the first occurrence of every distinct review text.
df_train = df_train.drop_duplicates(subset='review')

In [18]:
df_train.shape

(24904, 4)

# Text Preprocessing

### Cleaning Text

*(will be used for tweets)*

This function does the following:
- Decodes HTML into plain text with Beautiful Soup (will be useful for tweets)
- Removes @mentions
- Removes any URL links
- Removes the # symbol
- Makes all the review text lower case
- Expands negation contractions (e.g. "isn't" becomes "is not") so the negation is not lost when punctuation is stripped
- Removes any numbers and special characters
- Eliminates all emojis

In [19]:
# Tokenizer used to split the cleaned text into words.
tok = WordPunctTokenizer()

# Contracted negations mapped to their expanded two-word form.
negations_dict = {
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "can't": "can not",
    "couldn't": "could not",
    "shouldn't": "should not",
    "mightn't": "might not",
    "mustn't": "must not",
}

# Alternation of every contraction above, delimited by word boundaries.
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dict) + r')\b')

# One or more characters drawn from the listed Unicode emoji ranges.
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

def review_cleaner(text):
    """Normalise one raw review (or tweet) into lowercase, space-separated words.

    Steps, in order:
      1. Parse the text as HTML and keep only the visible text.
      2. Remove @mentions and URLs (``http(s)://...`` and ``www. ...``).
      3. Lower-case the result.
      4. Expand the negation contractions listed in ``negations_dict``
         (e.g. "isn't" -> "is not") before the apostrophe is stripped.
      5. Replace every non-letter character with a space.
      6. Tokenise and drop single-character tokens.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    # separator=' ' keeps words apart when they were divided only by a tag
    # such as <br />, which would otherwise fuse them into a single token.
    souped = BeautifulSoup(text, 'lxml').get_text(separator=' ')

    stripped = re.sub(r'@[A-Za-z0-9_]+', '', souped)
    # \S+ stops at any whitespace, so a URL followed by a newline or tab does
    # not swallow the next word.
    stripped = re.sub(r'https?://\S+', '', stripped)
    # The dot is escaped so that only a literal "www." prefix starts a match.
    stripped = re.sub(r'www\.\S+', '', stripped)

    # Continue from `stripped`, not the raw `text`; lower-casing the raw input
    # here would throw away every step performed above.
    lower_case = stripped.lower()
    expanded = neg_pattern.sub(lambda match: negations_dict[match.group()], lower_case)

    # Keeping letters only also removes digits, '#', punctuation and emoji,
    # so no separate emoji pass is needed after this point.
    letters_only = re.sub('[^a-zA-Z]', ' ', expanded)

    words = [token for token in tok.tokenize(letters_only) if len(token) > 1]
    return ' '.join(words).strip()

Map this function to all cells of the 'review' column

In [21]:
# Replace every raw review with its cleaned form.
df_train = df_train.assign(review=df_train['review'].map(review_cleaner))

In [22]:
df_train.head()

Unnamed: 0,type,review,label,file
0,train,story of man who has unnatural feelings for pi...,neg,0_3.txt
1,train,airport starts as brand new luxury plane is lo...,neg,10000_4.txt
2,train,this film lacked something could not put my fi...,neg,10001_4.txt
3,train,sorry everyone know this is supposed to be an ...,neg,10002_1.txt
4,train,when was little my parents took me along to th...,neg,10003_1.txt


### Create Stop Words List

I created a stop words list by combining the English stop words from sklearn and a stop word list from the Stanford NLP library.  I also added the words 'film', 'films', 'movie', 'movies' because they had a high frequency in both positive and negative movie reviews.

In [23]:
url = 'https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt'
# A timeout stops the cell from hanging forever on a dead connection, and
# raise_for_status() aborts on an HTTP error instead of letting the body of an
# error page be parsed as if it were the stop-word file.
res = requests.get(url, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')

In [24]:
# One stop word per line; blank lines (e.g. from a trailing newline) and
# surrounding whitespace are discarded.
standford_nlp = [line.strip() for line in soup.text.splitlines() if line.strip()]

# Merge the three sources, drop duplicates, and sort so the exported list is
# identical from run to run (iteration order of the ENGLISH_STOP_WORDS set is
# not guaranteed to be stable between interpreter sessions).
stop_word_list = sorted(
    set(standford_nlp)
    | set(ENGLISH_STOP_WORDS)
    | {'film', 'films', 'movie', 'movies'}
)

### Export Cleaned DataFrame

In [25]:
# Serialise the cleaned training frame for later use.
with open('../Data/df_train.pkl', 'wb+') as pickle_file:
    pickle.dump(df_train, pickle_file)

### Export Stopwords List

In [26]:
# Serialise the combined stop-word list for later use.
with open('../Data/stop_word_list.pkl', 'wb+') as pickle_file:
    pickle.dump(stop_word_list, pickle_file)