# Text data processing pipeline (Compiled)
## by Iris and Pooja!

### The following pipeline is considered:
* Use of punctuation across labels: q_marks, exclamation marks
* Use of multiple caps_lock words
* POS tagged words - using adjectives (deleting nouns?)
* Use of emojis across labels
* Use of manual bigrams - in the form of adjective_noun for common vocabulary (specific to a label)

In [1]:
# importing packages
import pandas as pd
from langdetect import detect
import matplotlib.pyplot as plt
import numpy as np
from itertools import chain
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import *
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import math
from nltk.probability import *
from gensim import corpora
from gensim.models import Word2Vec
import multiprocessing
import nltk
from nltk.corpus import stopwords
from gensim.models.phrases import Phrases, Phraser
import re  # For preprocessing
from collections import defaultdict  # For word frequency


import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Import the data
labeled_data = pd.read_csv('./labeled_data.csv')

# Have a look at the top 5 rows
labeled_data.head(n=5)

Unnamed: 0,text,label
0,The new rule is - \r\nif you are waiting for a...,4
1,"Flirted with giving this two stars, but that's...",3
2,I was staying at planet Hollywood across the s...,5
3,Food is good but prices are super expensive. ...,2
4,Worse company to deal with they do horrible wo...,1


In [5]:
# Import unlabeled data
test_data  = pd.read_csv('./test_data.csv')
unlabeled_data = pd.read_csv('./unlabeled_data.csv')

# Combine all the data (for training the language model)
all_data = pd.DataFrame(test_data['text']).append(labeled_data)

# Check the length of the dataframe
print(len(all_data))
all_data[0:5]

100000


Unnamed: 0,label,text
0,,trying to have a nice quiet dinner. the annou...
1,,Been getting food to go from here for over 3yr...
2,,Ugh. I've had to eat here a couple of times be...
3,,The people here are so nice! I ordered on eat ...
4,,Heard alot of good things about this place and...


## 1: Pre-Processing

In [7]:
contractions = { 
    "ain't": "not",
    "aren't": "not",
    "arent": "not",
    "can't": "not",
    "cant": "not",
    "can't've": "not",
    "couldn't": "not",
    "couldnt": "not",
    "couldn't've": "not",
    "didn't": "not",
    "didnt": "not",
    "doesn't": "not",
    "doesnt": "not",
    "don't": "not",
    "dont": "not",
    "hadn't": "not",
    "hadn't've": "not",
    "hasn't": "not",
    "haven't": "not",
    "havent": "not",
    "mayn't": "not",
    "mightn't": "not",
    "mightn't've": "not",
    "must've": "",
    "mustn't": "not",
    "mustn't've": "not",
    "needn't": "not",
    "needn't've": "not",
    "oughtn't": "not",
    "oughtn't've": "not",
    "shan't": "not",
    "sha'n't": "not",
    "shan't've": "not",
    "shouldn't": "not",
    "shouldn't've": "not",
    "won't": "not",
    "wont": "not",
    "won't've": "not",
    "cause": "because",
    "wouldn't": "not",
    "wouldnt": "not",
    "wouldn't've": "not"
}


stars = {
    '[\b\s]10/10':'amazing',
    '\b5\.\b':'amazing',
    '10 star[s]?':'amazing',
    '[56789] star[s]?':'amazing',
    'five star[s]?':'amazing',
    'four star[s]?':'great',
    '\b4\.\b':'great',
    '4 star[s]?':'great',
    '4-star[s]?':'great',
    '4\.[0-9]+ star[s]?':'great',
    'was a 4':'great',
    '4.5/5':'great',
    '4/5':'great',
    'three star[s]?':'average',
    '3 star[s]?':'average',
    '\b3\.\b':'average',
    '3-star[s]?':'average',
    '3\.[0-9]+ star[s]?':'average',
    'was a 3':'average',
    '3.5/5':'average',
    '3/5':'average',
    'two star[s]?':'bad',
    '\b2\.\b':'bad',
    '2 star[s]?':'bad',
    '2-star[s]?':'bad',
    '2\.[0-9]+ star[s]?':'bad',
    'was a 2':'bad',
    '2.5/5':'bad',
    '2/5':'bad',
    '\b1\.\b':'bad',
    '1 star[s]?':'horrible',
    '1-star[s]?':'horrible',
    '0 star[s]?':'horrible',
    '0-star[s]?':'horrible',
    'no star[s]?':'horrible',
    'one star[s]?':'horrible',
    'zero star[s]?':'horrible',
    'negative star[s]?':'horrible'
}


irregular_spellings = {
    'ÃÂ©':'e',# fixing this non-ascii value
    r'(\bam[mazing]+g\b)':'amazing',
    r'(\bg[o]{2,}d\b)':'good',
    r'(yu[um]+[omyers!]+)':'yummy',
    r'(w+[ay]{1,}y)':'way',
    r'(over\s?r?ated)':'overrated',
    r'([!]{2,})':' absolute ',
    r'([?]{1,}[?!]*)':' why ',
    r'([!]{2,})':' absolute ',
    r'([?]{1,}[?!]*)':' why '
}


# case normalization
all_data['text_processed']= all_data['text'].str.lower()
# replace starts related ngrams with adj's
for mention, repl in stars.items():
    all_data.text_processed = all_data.text_processed.str.replace(mention, repl)
# replace negating contractions with 'not'
for contr, repl in contractions.items():
    all_data.text_processed = all_data.text_processed.str.replace(contr, repl)
# replace negating contractions with 'not'
for irregular, repl in irregular_spellings.items():
    all_data.text_processed = all_data.text_processed.str.replace(irregular, repl)
    
all_data.head()

Unnamed: 0,label,text,text_processed
0,,trying to have a nice quiet dinner. the annou...,trying to have a nice quiet dinner. the annou...
1,,Been getting food to go from here for over 3yr...,been getting food to go from here for over 3yr...
2,,Ugh. I've had to eat here a couple of times be...,ugh. i've had to eat here a couple of times be...
3,,The people here are so nice! I ordered on eat ...,the people here are so nice! i ordered on eat ...
4,,Heard alot of good things about this place and...,heard alot of good things about this place and...


In [8]:
# FUNCTIONS TO DO PRE-PROCESSING

# removing non-alphabets
def remove_characters(comment):
    comment = re.sub(r"[^A-Za-z]+", ' ', str(comment)).lower()
    return comment

# tokenize each comment
def tokenize(comment):
    tokenizer = RegexpTokenizer(r"\s+", gaps=True)
    return tokenizer.tokenize(comment)

# remove stopwords
keep = ['not','no','but','because','into','above','below','up',\
        'down','again','further','why','how','more','most','all','any','such',\
        'very','too','so','just','by']
add = ['could','would']
stopwords_list = stopwords.words('english') + add
stopwords_list = [w for w in stopwords_list if w not in keep]
def remove_stopwords(tokens_list):
    return [x for x in tokens_list if x not in stopwords_list]

# stemming
stemmer = SnowballStemmer('english')
def stem_words(tokens_list):
    return [stemmer.stem(x) for x in tokens_list]

In [9]:
# remove some kinds of characters characters
all_data["text_processed"] = all_data["text_processed"].apply(remove_characters)

# tokenize the data
all_data["tokens"] = all_data.text_processed.apply(tokenize)

# remove stop-words from tokens list
all_data.tokens = all_data.tokens.apply(remove_stopwords)

# view the processed data and tokens
all_data.head()

Unnamed: 0,label,text,text_processed,tokens
0,,trying to have a nice quiet dinner. the annou...,trying to have a nice quiet dinner the announc...,"[trying, nice, quiet, dinner, announcer, award..."
1,,Been getting food to go from here for over 3yr...,been getting food to go from here for over yrs...,"[getting, food, go, yrs, wife, usually, tend, ..."
2,,Ugh. I've had to eat here a couple of times be...,ugh i ve had to eat here a couple of times beb...,"[ugh, eat, couple, times, bebecause, work, eve..."
3,,The people here are so nice! I ordered on eat ...,the people here are so nice i ordered on eat a...,"[people, so, nice, ordered, eat, promptly, cal..."
4,,Heard alot of good things about this place and...,heard alot of good things about this place and...,"[heard, alot, good, things, place, decided, gr..."


In [10]:
all_data = all_data.reset_index(drop=True)
all_data.head()

Unnamed: 0,label,text,text_processed,tokens
0,,trying to have a nice quiet dinner. the annou...,trying to have a nice quiet dinner the announc...,"[trying, nice, quiet, dinner, announcer, award..."
1,,Been getting food to go from here for over 3yr...,been getting food to go from here for over yrs...,"[getting, food, go, yrs, wife, usually, tend, ..."
2,,Ugh. I've had to eat here a couple of times be...,ugh i ve had to eat here a couple of times beb...,"[ugh, eat, couple, times, bebecause, work, eve..."
3,,The people here are so nice! I ordered on eat ...,the people here are so nice i ordered on eat a...,"[people, so, nice, ordered, eat, promptly, cal..."
4,,Heard alot of good things about this place and...,heard alot of good things about this place and...,"[heard, alot, good, things, place, decided, gr..."


In [11]:
# examine and impute rows in df with empty lists in them
no_tokens = list(all_data.loc[(all_data['tokens'].str.len() == 0), :].index)
all_data.loc[no_tokens]

Unnamed: 0,label,text,text_processed,tokens
26091,,ç¾å³ããã£ãã¼ï¼ï¼,,[]
28298,,èæ¿å¾ç­æï¼è¯´æè¯ä»·çå­æ°å°æä...,,[]
37800,,6å ä¸æ¹æµ·é²éå®¶çå¹²ççæ²³ ç»åç...,,[]
42313,,5.5å å¹²ççæ²³ éèæ¯è¾ä¹±ä¸å«ç³ é...,,[]
47029,,ä»å¤©çèè²å¤±æï¼å»å¸«æ²ç¨å¿ï¼å¨æ...,,[]
51491,1.0,......,,[]
54323,3.0,ï¼ãæåãã¾ããï¼å¤§å¥½ãã§ãã\...,,[]
62644,4.0,éè »å¥½åçï¼åªæ¯é¸è¾£æ¹¯åºæå¯ä»¥å...,,[]
76397,2.0,ä¹ãã¶ãã«å£ã«åããªãï¼\r\nã¹ã¢...,,[]
79785,2.0,;)),,[]


In [43]:
all_data.loc[no_tokens,'tokens'] = 'hmm'
all_data.loc[no_tokens]

Unnamed: 0,label,text,text_processed,tokens,pos_tagged,adjectives,nouns,no_nouns,no_nouns_stemmed,tokens_stemmed
26091,,ç¾å³ããã£ãã¼ï¼ï¼,,hmm,[],[],[],[],[],[]
28298,,èæ¿å¾ç­æï¼è¯´æè¯ä»·çå­æ°å°æä...,,hmm,[],[],[],[],[],[]
37800,,6å ä¸æ¹æµ·é²éå®¶çå¹²ççæ²³ ç»åç...,,hmm,[],[],[],[],[],[]
42313,,5.5å å¹²ççæ²³ éèæ¯è¾ä¹±ä¸å«ç³ é...,,hmm,[],[],[],[],[],[]
47029,,ä»å¤©çèè²å¤±æï¼å»å¸«æ²ç¨å¿ï¼å¨æ...,,hmm,[],[],[],[],[],[]
51491,1.0,......,,hmm,[],[],[],[],[],[]
54323,3.0,ï¼ãæåãã¾ããï¼å¤§å¥½ãã§ãã\...,,hmm,[],[],[],[],[],[]
62644,4.0,éè »å¥½åçï¼åªæ¯é¸è¾£æ¹¯åºæå¯ä»¥å...,,hmm,[],[],[],[],[],[]
76397,2.0,ä¹ãã¶ãã«å£ã«åããªãï¼\r\nã¹ã¢...,,hmm,[],[],[],[],[],[]
79785,2.0,;)),,hmm,[],[],[],[],[],[]


In [13]:
all_data.shape

(100000, 4)

## 2: Label-Wise EDA

### 2.1 Looking at Commonly Used POSpeech

### POS-Tagging the Split Data

In [14]:
# using nltk to pos-tag
#def pos_tagger(token_list):
 #   return nltk.pos_tag([x for x in token_list if x])

all_data["pos_tagged"] = all_data.tokens.apply(nltk.pos_tag)

In [15]:
# defining function to tag using "pos" and filter tokens from df.tokens list of tokens
# for example, finding the adjectives as below
pos = "JJ+"

def POS(token_pos_list):
    return [x[0] for x in token_pos_list if re.search(pos, x[1])]

In [16]:
# finding all adjectives for the labels
all_data["adjectives"] = all_data.pos_tagged.apply(POS)
all_data.head()

Unnamed: 0,label,text,text_processed,tokens,pos_tagged,adjectives
0,,trying to have a nice quiet dinner. the annou...,trying to have a nice quiet dinner the announc...,"[trying, nice, quiet, dinner, announcer, award...","[(trying, VBG), (nice, JJ), (quiet, JJ), (dinn...","[nice, quiet, loud]"
1,,Been getting food to go from here for over 3yr...,been getting food to go from here for over yrs...,"[getting, food, go, yrs, wife, usually, tend, ...","[(getting, VBG), (food, NN), (go, VB), (yrs, J...","[yrs, mongolian, special, noodle, best, chines..."
2,,Ugh. I've had to eat here a couple of times be...,ugh i ve had to eat here a couple of times beb...,"[ugh, eat, couple, times, bebecause, work, eve...","[(ugh, JJ), (eat, NN), (couple, NN), (times, N...","[ugh, clad, super, much, salad, sure, helpful,..."
3,,The people here are so nice! I ordered on eat ...,the people here are so nice i ordered on eat a...,"[people, so, nice, ordered, eat, promptly, cal...","[(people, NNS), (so, RB), (nice, JJ), (ordered...","[nice, double, sweet]"
4,,Heard alot of good things about this place and...,heard alot of good things about this place and...,"[heard, alot, good, things, place, decided, gr...","[(heard, NN), (alot, NN), (good, JJ), (things,...","[good, cheese, large, hungry, hot, green, def]"


#### Removing Nouns

In [17]:
nltk.help.upenn_tagset('NN*')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NNPS: noun, proper, plural
    Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
    Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
    Apache Apaches Apocrypha ...
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...


In [18]:
#  finding all the nouns
pos = "NN"
all_data["nouns"] = all_data.pos_tagged.apply(POS)
all_data.head()

Unnamed: 0,label,text,text_processed,tokens,pos_tagged,adjectives,nouns
0,,trying to have a nice quiet dinner. the annou...,trying to have a nice quiet dinner the announc...,"[trying, nice, quiet, dinner, announcer, award...","[(trying, VBG), (nice, JJ), (quiet, JJ), (dinn...","[nice, quiet, loud]","[dinner, announcer, awards, way, restaurant]"
1,,Been getting food to go from here for over 3yr...,been getting food to go from here for over yrs...,"[getting, food, go, yrs, wife, usually, tend, ...","[(getting, VBG), (food, NN), (go, VB), (yrs, J...","[yrs, mongolian, special, noodle, best, chines...","[food, wife, items, something, works, beef, lu..."
2,,Ugh. I've had to eat here a couple of times be...,ugh i ve had to eat here a couple of times beb...,"[ugh, eat, couple, times, bebecause, work, eve...","[(ugh, JJ), (eat, NN), (couple, NN), (times, N...","[ugh, clad, super, much, salad, sure, helpful,...","[eat, couple, times, work, events, course, gir..."
3,,The people here are so nice! I ordered on eat ...,the people here are so nice i ordered on eat a...,"[people, so, nice, ordered, eat, promptly, cal...","[(people, NNS), (so, RB), (nice, JJ), (ordered...","[nice, double, sweet]","[people, eat, check, everything]"
4,,Heard alot of good things about this place and...,heard alot of good things about this place and...,"[heard, alot, good, things, place, decided, gr...","[(heard, NN), (alot, NN), (good, JJ), (things,...","[good, cheese, large, hungry, hot, green, def]","[heard, alot, things, place, grab, breakfast, ..."


In [19]:
vecsub = lambda x, y: [item for item in x if item not in y]
all_data['no_nouns'] = list(map(vecsub, all_data['tokens'], all_data['nouns']))
all_data.head()

Unnamed: 0,label,text,text_processed,tokens,pos_tagged,adjectives,nouns,no_nouns
0,,trying to have a nice quiet dinner. the annou...,trying to have a nice quiet dinner the announc...,"[trying, nice, quiet, dinner, announcer, award...","[(trying, VBG), (nice, JJ), (quiet, JJ), (dinn...","[nice, quiet, loud]","[dinner, announcer, awards, way, restaurant]","[trying, nice, quiet, giveaways, too, loud]"
1,,Been getting food to go from here for over 3yr...,been getting food to go from here for over yrs...,"[getting, food, go, yrs, wife, usually, tend, ...","[(getting, VBG), (food, NN), (go, VB), (yrs, J...","[yrs, mongolian, special, noodle, best, chines...","[food, wife, items, something, works, beef, lu...","[getting, go, yrs, usually, tend, get, why, fi..."
2,,Ugh. I've had to eat here a couple of times be...,ugh i ve had to eat here a couple of times beb...,"[ugh, eat, couple, times, bebecause, work, eve...","[(ugh, JJ), (eat, NN), (couple, NN), (times, N...","[ugh, clad, super, much, salad, sure, helpful,...","[eat, couple, times, work, events, course, gir...","[ugh, bebecause, makes, snotily, clad, super, ..."
3,,The people here are so nice! I ordered on eat ...,the people here are so nice i ordered on eat a...,"[people, so, nice, ordered, eat, promptly, cal...","[(people, NNS), (so, RB), (nice, JJ), (ordered...","[nice, double, sweet]","[people, eat, check, everything]","[so, nice, ordered, promptly, called, double, ..."
4,,Heard alot of good things about this place and...,heard alot of good things about this place and...,"[heard, alot, good, things, place, decided, gr...","[(heard, NN), (alot, NN), (good, JJ), (things,...","[good, cheese, large, hungry, hot, green, def]","[heard, alot, things, place, grab, breakfast, ...","[good, decided, say, enjoyed, cheese, firstly,..."


In [20]:
# stem the tokens
all_data['no_nouns_stemmed'] = all_data.no_nouns.apply(stem_words)
all_data['tokens_stemmed'] = all_data.tokens.apply(stem_words)
all_data.head()

Unnamed: 0,label,text,text_processed,tokens,pos_tagged,adjectives,nouns,no_nouns,no_nouns_stemmed,tokens_stemmed
0,,trying to have a nice quiet dinner. the annou...,trying to have a nice quiet dinner the announc...,"[trying, nice, quiet, dinner, announcer, award...","[(trying, VBG), (nice, JJ), (quiet, JJ), (dinn...","[nice, quiet, loud]","[dinner, announcer, awards, way, restaurant]","[trying, nice, quiet, giveaways, too, loud]","[tri, nice, quiet, giveaway, too, loud]","[tri, nice, quiet, dinner, announc, award, giv..."
1,,Been getting food to go from here for over 3yr...,been getting food to go from here for over yrs...,"[getting, food, go, yrs, wife, usually, tend, ...","[(getting, VBG), (food, NN), (go, VB), (yrs, J...","[yrs, mongolian, special, noodle, best, chines...","[food, wife, items, something, works, beef, lu...","[getting, go, yrs, usually, tend, get, why, fi...","[get, go, yrs, usual, tend, get, whi, fix, alw...","[get, food, go, yrs, wife, usual, tend, get, i..."
2,,Ugh. I've had to eat here a couple of times be...,ugh i ve had to eat here a couple of times beb...,"[ugh, eat, couple, times, bebecause, work, eve...","[(ugh, JJ), (eat, NN), (couple, NN), (times, N...","[ugh, clad, super, much, salad, sure, helpful,...","[eat, couple, times, work, events, course, gir...","[ugh, bebecause, makes, snotily, clad, super, ...","[ugh, bebecaus, make, snotili, clad, super, aw...","[ugh, eat, coupl, time, bebecaus, work, event,..."
3,,The people here are so nice! I ordered on eat ...,the people here are so nice i ordered on eat a...,"[people, so, nice, ordered, eat, promptly, cal...","[(people, NNS), (so, RB), (nice, JJ), (ordered...","[nice, double, sweet]","[people, eat, check, everything]","[so, nice, ordered, promptly, called, double, ...","[so, nice, order, prompt, call, doubl, correct...","[peopl, so, nice, order, eat, prompt, call, do..."
4,,Heard alot of good things about this place and...,heard alot of good things about this place and...,"[heard, alot, good, things, place, decided, gr...","[(heard, NN), (alot, NN), (good, JJ), (things,...","[good, cheese, large, hungry, hot, green, def]","[heard, alot, things, place, grab, breakfast, ...","[good, decided, say, enjoyed, cheese, firstly,...","[good, decid, say, enjoy, chees, first, rather...","[heard, alot, good, thing, place, decid, grab,..."


### Common Triigrams

In [21]:
# check common bigrams for "not_good" appearing a lot more
tokens_list = [item for sublist in list(all_data.tokens_stemmed) for item in sublist]
fd = FreqDist(tokens_list)

# most common tokens - document frequency
trigrams = ngrams(tokens_list, n = 3)
fdtrigram = FreqDist(trigrams)
trigrams = fdtrigram.most_common()
trigrams = pd.DataFrame(trigrams)#.set_index(0)
trigrams[:20]

Unnamed: 0,0,1
0,"(not, go, back)",1379
1,"(not, come, back)",870
2,"(good, but, not)",867
3,"(never, go, back)",660
4,"(definit, come, back)",645
5,"(food, good, but)",564
6,"(not, know, how)",557
7,"(come, back, again)",530
8,"(staff, veri, friend)",497
9,"(definit, go, back)",495


In [23]:
# only keep trigrams with freq > 300
top_trigrams = trigrams[trigrams.loc[:,1]>300].loc[:,0]

In [24]:
findTrigrams = {}
for tg in top_trigrams:
    key = tg[0]+" "+tg[1]+" "+tg[2]
    findTrigrams[key] = tg[0]+"_"+tg[1]+"_"+tg[2]

findTrigrams

{'not go back': 'not_go_back',
 'not come back': 'not_come_back',
 'good but not': 'good_but_not',
 'never go back': 'never_go_back',
 'definit come back': 'definit_come_back',
 'food good but': 'food_good_but',
 'not know how': 'not_know_how',
 'come back again': 'come_back_again',
 'staff veri friend': 'staff_veri_friend',
 'definit go back': 'definit_go_back',
 'go back again': 'go_back_again',
 'mac n chees': 'mac_n_chees',
 'go somewher els': 'go_somewher_els',
 'great custom servic': 'great_custom_servic',
 'not so much': 'not_so_much',
 'pretti good but': 'pretti_good_but',
 'not wast time': 'not_wast_time',
 'not recommend place': 'not_recommend_place',
 'not sure whi': 'not_sure_whi',
 'but just not': 'but_just_not',
 'sweet potato fri': 'sweet_potato_fri',
 'way too much': 'way_too_much',
 'not feel like': 'not_feel_like',
 'but not go': 'but_not_go',
 'food pretti good': 'food_pretti_good',
 'never come back': 'never_come_back',
 'not get wrong': 'not_get_wrong',
 'not veri 

### Bigrams

In [25]:
# most common tokens - document frequency
bigrams = ngrams(tokens_list, n = 2)
fdbigram = FreqDist(bigrams)
bigrams = fdbigram.most_common()
bigrams = pd.DataFrame(bigrams)
bigrams[:30]

Unnamed: 0,0,1
0,"(but, not)",11816
1,"(go, back)",6610
2,"(custom, servic)",6137
3,"(come, back)",6055
4,"(not, even)",5491
5,"(first, time)",4474
6,"(not, go)",4418
7,"(not, know)",4297
8,"(good, but)",4282
9,"(veri, good)",4278


In [26]:
all_bigrams = list(bigrams.loc[:,0])
top = int(len(all_bigrams)*0.0003) 
top_bigrams = all_bigrams[:top]
top_bigrams

[('but', 'not'),
 ('go', 'back'),
 ('custom', 'servic'),
 ('come', 'back'),
 ('not', 'even'),
 ('first', 'time'),
 ('not', 'go'),
 ('not', 'know'),
 ('good', 'but'),
 ('veri', 'good'),
 ('not', 'get'),
 ('end', 'up'),
 ('so', 'not'),
 ('pick', 'up'),
 ('pretti', 'good'),
 ('not', 'want'),
 ('look', 'like'),
 ('just', 'not'),
 ('realli', 'good'),
 ('not', 'sure'),
 ('so', 'much'),
 ('ice', 'cream'),
 ('not', 'like'),
 ('veri', 'nice'),
 ('even', 'though'),
 ('no', 'one'),
 ('tast', 'like'),
 ('high', 'recommend'),
 ('feel', 'like'),
 ('next', 'time'),
 ('whi', 'not'),
 ('food', 'good'),
 ('not', 'realli'),
 ('make', 'sure'),
 ('not', 'good'),
 ('too', 'much'),
 ('las', 'vega'),
 ('not', 'think'),
 ('not', 'come'),
 ('veri', 'friend'),
 ('everi', 'time'),
 ('seem', 'like'),
 ('food', 'not'),
 ('happi', 'hour'),
 ('good', 'food'),
 ('but', 'still'),
 ('bebecaus', 'not'),
 ('so', 'mani'),
 ('not', 'recommend'),
 ('so', 'good'),
 ('not', 'worth'),
 ('great', 'place'),
 ('not', 'too'),
 ('pl

In [27]:
findBigrams = {}
for bg in top_bigrams:
    key = bg[0]+" "+bg[1]
    findBigrams[key] = bg[0]+"_"+bg[1]

findBigrams

{'but not': 'but_not',
 'go back': 'go_back',
 'custom servic': 'custom_servic',
 'come back': 'come_back',
 'not even': 'not_even',
 'first time': 'first_time',
 'not go': 'not_go',
 'not know': 'not_know',
 'good but': 'good_but',
 'veri good': 'veri_good',
 'not get': 'not_get',
 'end up': 'end_up',
 'so not': 'so_not',
 'pick up': 'pick_up',
 'pretti good': 'pretti_good',
 'not want': 'not_want',
 'look like': 'look_like',
 'just not': 'just_not',
 'realli good': 'realli_good',
 'not sure': 'not_sure',
 'so much': 'so_much',
 'ice cream': 'ice_cream',
 'not like': 'not_like',
 'veri nice': 'veri_nice',
 'even though': 'even_though',
 'no one': 'no_one',
 'tast like': 'tast_like',
 'high recommend': 'high_recommend',
 'feel like': 'feel_like',
 'next time': 'next_time',
 'whi not': 'whi_not',
 'food good': 'food_good',
 'not realli': 'not_realli',
 'make sure': 'make_sure',
 'not good': 'not_good',
 'too much': 'too_much',
 'las vega': 'las_vega',
 'not think': 'not_think',
 'not co

## Fix HERE!

In [28]:
def to_text(token_list):
    string = ''
    for token in token_list:
        string += token + ' '
    return string

all_data['text_processed'] = all_data.tokens_stemmed.apply(to_text)
all_data.head()

Unnamed: 0,label,text,text_processed,tokens,pos_tagged,adjectives,nouns,no_nouns,no_nouns_stemmed,tokens_stemmed
0,,trying to have a nice quiet dinner. the annou...,tri nice quiet dinner announc award giveaway w...,"[trying, nice, quiet, dinner, announcer, award...","[(trying, VBG), (nice, JJ), (quiet, JJ), (dinn...","[nice, quiet, loud]","[dinner, announcer, awards, way, restaurant]","[trying, nice, quiet, giveaways, too, loud]","[tri, nice, quiet, giveaway, too, loud]","[tri, nice, quiet, dinner, announc, award, giv..."
1,,Been getting food to go from here for over 3yr...,get food go yrs wife usual tend get item whi f...,"[getting, food, go, yrs, wife, usually, tend, ...","[(getting, VBG), (food, NN), (go, VB), (yrs, J...","[yrs, mongolian, special, noodle, best, chines...","[food, wife, items, something, works, beef, lu...","[getting, go, yrs, usually, tend, get, why, fi...","[get, go, yrs, usual, tend, get, whi, fix, alw...","[get, food, go, yrs, wife, usual, tend, get, i..."
2,,Ugh. I've had to eat here a couple of times be...,ugh eat coupl time bebecaus work event cours m...,"[ugh, eat, couple, times, bebecause, work, eve...","[(ugh, JJ), (eat, NN), (couple, NN), (times, N...","[ugh, clad, super, much, salad, sure, helpful,...","[eat, couple, times, work, events, course, gir...","[ugh, bebecause, makes, snotily, clad, super, ...","[ugh, bebecaus, make, snotili, clad, super, aw...","[ugh, eat, coupl, time, bebecaus, work, event,..."
3,,The people here are so nice! I ordered on eat ...,peopl so nice order eat prompt call doubl chec...,"[people, so, nice, ordered, eat, promptly, cal...","[(people, NNS), (so, RB), (nice, JJ), (ordered...","[nice, double, sweet]","[people, eat, check, everything]","[so, nice, ordered, promptly, called, double, ...","[so, nice, order, prompt, call, doubl, correct...","[peopl, so, nice, order, eat, prompt, call, do..."
4,,Heard alot of good things about this place and...,heard alot good thing place decid grab breakfa...,"[heard, alot, good, things, place, decided, gr...","[(heard, NN), (alot, NN), (good, JJ), (things,...","[good, cheese, large, hungry, hot, green, def]","[heard, alot, things, place, grab, breakfast, ...","[good, decided, say, enjoyed, cheese, firstly,...","[good, decid, say, enjoy, chees, first, rather...","[heard, alot, good, thing, place, decid, grab,..."


In [33]:
for trigram, repl in findTrigrams.items():
    all_data['text_processed'] = all_data.text_processed.str.replace(trigram, repl) 
    
for bigram, repl in findBigrams.items():
    all_data['text_processed'] = all_data.text_processed.str.replace(bigram, repl)

In [34]:
all_data.head()

Unnamed: 0,label,text,text_processed,tokens,pos_tagged,adjectives,nouns,no_nouns,no_nouns_stemmed,tokens_stemmed
0,,trying to have a nice quiet dinner. the annou...,tri nice quiet dinner announc award giveaway w...,"[trying, nice, quiet, dinner, announcer, award...","[(trying, VBG), (nice, JJ), (quiet, JJ), (dinn...","[nice, quiet, loud]","[dinner, announcer, awards, way, restaurant]","[trying, nice, quiet, giveaways, too, loud]","[tri, nice, quiet, giveaway, too, loud]","[tri, nice, quiet, dinner, announc, award, giv..."
1,,Been getting food to go from here for over 3yr...,get_food go yrs wife usual tend get item whi f...,"[getting, food, go, yrs, wife, usually, tend, ...","[(getting, VBG), (food, NN), (go, VB), (yrs, J...","[yrs, mongolian, special, noodle, best, chines...","[food, wife, items, something, works, beef, lu...","[getting, go, yrs, usually, tend, get, why, fi...","[get, go, yrs, usual, tend, get, whi, fix, alw...","[get, food, go, yrs, wife, usual, tend, get, i..."
2,,Ugh. I've had to eat here a couple of times be...,ugh eat coupl_time bebecaus work event cours m...,"[ugh, eat, couple, times, bebecause, work, eve...","[(ugh, JJ), (eat, NN), (couple, NN), (times, N...","[ugh, clad, super, much, salad, sure, helpful,...","[eat, couple, times, work, events, course, gir...","[ugh, bebecause, makes, snotily, clad, super, ...","[ugh, bebecaus, make, snotili, clad, super, aw...","[ugh, eat, coupl, time, bebecaus, work, event,..."
3,,The people here are so nice! I ordered on eat ...,peopl so_nice order eat prompt call doubl chec...,"[people, so, nice, ordered, eat, promptly, cal...","[(people, NNS), (so, RB), (nice, JJ), (ordered...","[nice, double, sweet]","[people, eat, check, everything]","[so, nice, ordered, promptly, called, double, ...","[so, nice, order, prompt, call, doubl, correct...","[peopl, so, nice, order, eat, prompt, call, do..."
4,,Heard alot of good things about this place and...,heard alot good_thing place decid grab breakfa...,"[heard, alot, good, things, place, decided, gr...","[(heard, NN), (alot, NN), (good, JJ), (things,...","[good, cheese, large, hungry, hot, green, def]","[heard, alot, things, place, grab, breakfast, ...","[good, decided, say, enjoyed, cheese, firstly,...","[good, decid, say, enjoy, chees, first, rather...","[heard, alot, good, thing, place, decid, grab,..."


In [36]:
# # making variants of the pre-processing so far

# def n_gram(tokens_no_stopwords, ngram_corpus):
#     return [x for x in ngram_corpus if x not in tokens_no_stopwords]

# all_data['only_ngrams'] = all_data.apply(lambda x: n_gram(x.tokens, x.n_grams), axis=1)

In [37]:
# all_data['tokens_ngrams'] = all_data["tokens"] + all_data["only_ngrams"] 
# all_data["ngrams_adjectives"] = all_data["n_grams"] + all_data["adjectives"]

# all_data_1 = all_data[["text", 'tokens_ngrams','ngrams_adjectives',"label"]]
# all_data_1.head()

In [38]:
# tokenize the data
all_data["tokens"] = all_data.text_processed.apply(tokenize)
all_data.head()

Unnamed: 0,label,text,text_processed,tokens,pos_tagged,adjectives,nouns,no_nouns,no_nouns_stemmed,tokens_stemmed
0,,trying to have a nice quiet dinner. the annou...,tri nice quiet dinner announc award giveaway w...,"[tri, nice, quiet, dinner, announc, award, giv...","[(trying, VBG), (nice, JJ), (quiet, JJ), (dinn...","[nice, quiet, loud]","[dinner, announcer, awards, way, restaurant]","[trying, nice, quiet, giveaways, too, loud]","[tri, nice, quiet, giveaway, too, loud]","[tri, nice, quiet, dinner, announc, award, giv..."
1,,Been getting food to go from here for over 3yr...,get_food go yrs wife usual tend get item whi f...,"[get_food, go, yrs, wife, usual, tend, get, it...","[(getting, VBG), (food, NN), (go, VB), (yrs, J...","[yrs, mongolian, special, noodle, best, chines...","[food, wife, items, something, works, beef, lu...","[getting, go, yrs, usually, tend, get, why, fi...","[get, go, yrs, usual, tend, get, whi, fix, alw...","[get, food, go, yrs, wife, usual, tend, get, i..."
2,,Ugh. I've had to eat here a couple of times be...,ugh eat coupl_time bebecaus work event cours m...,"[ugh, eat, coupl_time, bebecaus, work, event, ...","[(ugh, JJ), (eat, NN), (couple, NN), (times, N...","[ugh, clad, super, much, salad, sure, helpful,...","[eat, couple, times, work, events, course, gir...","[ugh, bebecause, makes, snotily, clad, super, ...","[ugh, bebecaus, make, snotili, clad, super, aw...","[ugh, eat, coupl, time, bebecaus, work, event,..."
3,,The people here are so nice! I ordered on eat ...,peopl so_nice order eat prompt call doubl chec...,"[peopl, so_nice, order, eat, prompt, call, dou...","[(people, NNS), (so, RB), (nice, JJ), (ordered...","[nice, double, sweet]","[people, eat, check, everything]","[so, nice, ordered, promptly, called, double, ...","[so, nice, order, prompt, call, doubl, correct...","[peopl, so, nice, order, eat, prompt, call, do..."
4,,Heard alot of good things about this place and...,heard alot good_thing place decid grab breakfa...,"[heard, alot, good_thing, place, decid, grab, ...","[(heard, NN), (alot, NN), (good, JJ), (things,...","[good, cheese, large, hungry, hot, green, def]","[heard, alot, things, place, grab, breakfast, ...","[good, decided, say, enjoyed, cheese, firstly,...","[good, decid, say, enjoy, chees, first, rather...","[heard, alot, good, thing, place, decid, grab,..."


In [40]:
# making new data frame with dropped NA values 
labeled_data = all_data.dropna(axis = 0, how ='any')
labeled_data.shape

(50000, 10)

In [42]:
labeled_data.head()

Unnamed: 0,label,text,text_processed,tokens,pos_tagged,adjectives,nouns,no_nouns,no_nouns_stemmed,tokens_stemmed
50000,4.0,The new rule is - \r\nif you are waiting for a...,new rule wait tabl almost alway not_wait insid...,"[new, rule, wait, tabl, almost, alway, not_wai...","[(new, JJ), (rule, NN), (waiting, VBG), (table...","[new, sign, upfront, awful, cold, p, short, wr...","[rule, table, becauses, concerns, patrons, apo...","[new, waiting, almost, always, not, wait, insi...","[new, wait, almost, alway, not, wait, insid, j...","[new, rule, wait, tabl, almost, alway, not, wa..."
50001,3.0,"Flirted with giving this two stars, but that's...",flirt give_bad_but pretti damn rate might just...,"[flirt, give_bad_but, pretti, damn, rate, migh...","[(flirted, VBN), (giving, VBG), (bad, JJ), (bu...","[bad, pretty, new, east, many, hidden, friend,...","[rating, night, side, gems, fiance, drinks, th...","[flirted, giving, bad, but, pretty, damning, m...","[flirt, give, bad, but, pretti, damn, might, j...","[flirt, give, bad, but, pretti, damn, rate, mi..."
50002,5.0,I was staying at planet Hollywood across the s...,stay planet hollywood across_street saw good r...,"[stay, planet, hollywood, across_street, saw, ...","[(staying, VBG), (planet, NN), (hollywood, NN)...","[good, give, good, bacon, cheese, cold, carame...","[planet, hollywood, street, reviews, try, brea...","[staying, across, saw, good, so, husband, deci...","[stay, across, saw, good, so, husband, decid, ...","[stay, planet, hollywood, across, street, saw,..."
50003,2.0,Food is good but prices are super expensive. ...,food_good_but price super expens buck extra la...,"[food_good_but, price, super, expens, buck, ex...","[(food, NN), (good, JJ), (but, CC), (prices, N...","[good, expensive, large, carne, little, bigger...","[food, prices, bucks, asada, taco, bell, bean,...","[good, but, super, expensive, extra, large, li...","[good, but, super, expens, extra, larg, littl,...","[food, good, but, price, super, expens, buck, ..."
50004,1.0,Worse company to deal with they do horrible wo...,wors compani deal horribl work bring truck bac...,"[wors, compani, deal, horribl, work, bring, tr...","[(worse, JJR), (company, NN), (deal, NN), (hor...","[worse, horrible, door, trim, straight, update...","[company, deal, work, truck, replacement, door...","[worse, horrible, back, not, match, trim, mold...","[wors, horribl, back, not, match, trim, mold, ...","[wors, compani, deal, horribl, work, bring, tr..."


In [None]:
all_data.loc[no_tokens,'text_processed'] = 'hmm'

In [46]:
all_data.to_csv('text_preprocessed', encoding='utf-8',index=False)
labeled_data.to_csv('labeled_text_preprocessed', encoding='utf-8',index=False)