# CFM Data Pre-Processing - EN

In [6]:
# Import Libraries
import pandas as pd
import numpy as np
import pickle

import contractions
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import fasttext
import string
import sys
import re


# from langdetect import detect # detect language

pd.set_option('max_colwidth', 800)

## 1. Load and Process Data

In [2]:
# Base names (without extension) of the three dataset splits under ./data.
filename_list = ['cfm_train_ja', 'cfm_test_ja', 'cfm_val_ja']

# Route each CSV to its split-specific DataFrame based on the split keyword
# contained in the file name. Membership (`in`) is used rather than
# `str.find(...) > 0`, which silently skips a name that *starts* with the
# keyword (find() returns 0 in that case).
for fn in filename_list:
    if 'train' in fn:
        train_df = pd.read_csv(f'./data/{fn}.csv', encoding = 'utf-8')
    elif 'test' in fn:
        test_df = pd.read_csv(f'./data/{fn}.csv', encoding = 'utf-8')
    elif 'val' in fn:
        val_df = pd.read_csv(f'./data/{fn}.csv', encoding = 'utf-8')
    else:
        # Names without a recognised split keyword are ignored.
        pass

In [3]:
def lange_detect(series, pretrained_model="lid.176.bin"):
    """Predict a language code for every text in ``series`` using fastText.

    Parameters
    ----------
    series : iterable of str
        Texts to classify (e.g. a pandas Series of comments).
    pretrained_model : str, optional
        Path to the fastText language-identification model file.
        Defaults to ``"lid.176.bin"``.

    Returns
    -------
    list of str
        For each text, in input order, the top predicted label with the
        ``__label__`` prefix removed (e.g. ``"en"``).
    """
    label_prefix = "__label__"
    model = fasttext.load_model(pretrained_model)
    langs = []
    for cm in series:
        # fastText's predict() handles one line at a time and raises on
        # embedded newlines, so flatten them to spaces first.
        labels = model.predict(cm.replace("\n", " "))[0]
        top_label = labels[0]
        # Strip the label prefix explicitly instead of slicing the tuple's
        # repr, which truncated language codes longer than two characters.
        if top_label.startswith(label_prefix):
            top_label = top_label[len(label_prefix):]
        langs.append(top_label)
    return langs


In [7]:
# Keep only the rows that carry an English comment, then clean the text.

# English comments per split (Japanese columns dropped)
train_en_df = train_df.loc[~train_df['comments_en'].isna()].drop(columns=['comments_ja', 'totalwords_ja'])
test_en_df = test_df.loc[~test_df['comments_en'].isna()].drop(columns=['comments_ja'])
test_en_df['totalwords_en'] = test_en_df['comments_en'].str.len() # add totalwords_en in test df (str.len() => length in characters)

val_en_df = val_df.loc[~val_df['comments_en'].isna()].drop(columns=['comments_ja'])
val_en_df['totalwords_en'] = val_en_df['comments_en'].str.len()


## Combined df for English comments
en_dfs = [train_en_df, test_en_df, val_en_df]
## rename comment col
for e in en_dfs:
    e.rename(columns = {'comments_en': 'comments', 'totalwords_en': 'init_totalwords'}, inplace =  True)
all_en_dfs = pd.concat(en_dfs, sort = False)
# expand contractions word by word (e.g. "can't" -> "can not")
all_en_dfs['comments_no_contraction'] = all_en_dfs['comments'].apply(lambda x: [contractions.fix(word) for word in x.split()])
# join the expanded words back into a single string
all_en_dfs['comments_str'] = [' '.join(map(str, l)) for l in all_en_dfs['comments_no_contraction']]
# remove punctuation; regex=True is required because newer pandas versions
# treat the pattern as a literal string by default
all_en_dfs['comments_no_punc'] = all_en_dfs['comments_str'].str.replace(r'[^\w\s]', '', regex=True)
# remove every token that contains a digit
all_en_dfs['comments_no_punc']=all_en_dfs['comments_no_punc'].apply(lambda x: re.sub(r'\w*\d\w*','', x))
# detect language
all_en_dfs['lang_detect'] = lange_detect(all_en_dfs['comments_no_punc'])
print(f'The size of the observations for raw dataset of English comments is {all_en_dfs.shape[0]}')
# .copy() so that later column assignments operate on an independent frame
# rather than on a filtered view (avoids SettingWithCopyWarning)
all_en_dfs = all_en_dfs[all_en_dfs['lang_detect'].isin(['en'])].copy()

print(f'The size of the observations for the cleaned dataset of English comments is {all_en_dfs.shape[0]}')



The size of the observations for raw dataset of English comments is 73289
The size of the observations for the cleaned dataset of English comments is 72971


In [8]:
# Preview the first rows of the English comments frame after cleaning and language filtering.
all_en_dfs.head()

Unnamed: 0,comments,lang,label,totalwords,comments_no_contraction,comments_str,comments_no_punc,lang_detect
3375,"""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. Isn't it necessary to write the continuous usage time?""}",en,1,195,"[""When, I, was, working, on, screw, tightening, for, about, an, hour,, smoke, came, out, and, it, became, useless., There, seems, to, be, no, protection, circuit., is not, it, necessary, to, write, the, continuous, usage, time?""}]","""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. is not it necessary to write the continuous usage time?""}",When I was working on screw tightening for about an hour smoke came out and it became useless There seems to be no protection circuit is not it necessary to write the continuous usage time,en
3376,'I had eczema on my body when I use it. You can now contact the shop to get a return.'},en,1,88,"['I, had, eczema, on, my, body, when, I, use, it., You, can, now, contact, the, shop, to, get, a, return.'}]",'I had eczema on my body when I use it. You can now contact the shop to get a return.'},I had eczema on my body when I use it You can now contact the shop to get a return,en
3377,'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},en,1,105,"['A, big, failure., Power, on, after, plugging, in., I, had, a, light, burn., I, made, a, return, procedure, immediately.'}]",'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},A big failure Power on after plugging in I had a light burn I made a return procedure immediately,en
3378,"""It has sparks and it smells burnt. I'm afraid I can't use it.""}",en,1,65,"[""It, has, sparks, and, it, smells, burnt., I am, afraid, I, can not, use, it.""}]","""It has sparks and it smells burnt. I am afraid I can not use it.""}",It has sparks and it smells burnt I am afraid I can not use it,en
3379,"'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",en,1,129,"['When, the, power, is, turned, on, and, the, heating, button, is, pressed,, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on.'}]","'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",When the power is turned on and the heating button is pressed the sparks are scattered and the power is no longer turned on,en


In [12]:
# Split each punctuation-free comment into word tokens ...
all_en_dfs['tokenized'] = all_en_dfs['comments_no_punc'].apply(
    lambda text: word_tokenize(text)
)
# ... and lowercase every token so that later matching is case-insensitive.
all_en_dfs['lower'] = all_en_dfs['tokenized'].apply(
    lambda tokens: [str(tok).lower() for tok in tokens]
)
all_en_dfs.head()

Unnamed: 0,comments,lang,label,totalwords,comments_no_contraction,comments_str,comments_no_punc,lang_detect,tokenized,lower
3375,"""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. Isn't it necessary to write the continuous usage time?""}",en,1,195,"[""When, I, was, working, on, screw, tightening, for, about, an, hour,, smoke, came, out, and, it, became, useless., There, seems, to, be, no, protection, circuit., is not, it, necessary, to, write, the, continuous, usage, time?""}]","""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. is not it necessary to write the continuous usage time?""}",When I was working on screw tightening for about an hour smoke came out and it became useless There seems to be no protection circuit is not it necessary to write the continuous usage time,en,"[When, I, was, working, on, screw, tightening, for, about, an, hour, smoke, came, out, and, it, became, useless, There, seems, to, be, no, protection, circuit, is, not, it, necessary, to, write, the, continuous, usage, time]","[when, i, was, working, on, screw, tightening, for, about, an, hour, smoke, came, out, and, it, became, useless, there, seems, to, be, no, protection, circuit, is, not, it, necessary, to, write, the, continuous, usage, time]"
3376,'I had eczema on my body when I use it. You can now contact the shop to get a return.'},en,1,88,"['I, had, eczema, on, my, body, when, I, use, it., You, can, now, contact, the, shop, to, get, a, return.'}]",'I had eczema on my body when I use it. You can now contact the shop to get a return.'},I had eczema on my body when I use it You can now contact the shop to get a return,en,"[I, had, eczema, on, my, body, when, I, use, it, You, can, now, contact, the, shop, to, get, a, return]","[i, had, eczema, on, my, body, when, i, use, it, you, can, now, contact, the, shop, to, get, a, return]"
3377,'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},en,1,105,"['A, big, failure., Power, on, after, plugging, in., I, had, a, light, burn., I, made, a, return, procedure, immediately.'}]",'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},A big failure Power on after plugging in I had a light burn I made a return procedure immediately,en,"[A, big, failure, Power, on, after, plugging, in, I, had, a, light, burn, I, made, a, return, procedure, immediately]","[a, big, failure, power, on, after, plugging, in, i, had, a, light, burn, i, made, a, return, procedure, immediately]"
3378,"""It has sparks and it smells burnt. I'm afraid I can't use it.""}",en,1,65,"[""It, has, sparks, and, it, smells, burnt., I am, afraid, I, can not, use, it.""}]","""It has sparks and it smells burnt. I am afraid I can not use it.""}",It has sparks and it smells burnt I am afraid I can not use it,en,"[It, has, sparks, and, it, smells, burnt, I, am, afraid, I, can, not, use, it]","[it, has, sparks, and, it, smells, burnt, i, am, afraid, i, can, not, use, it]"
3379,"'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",en,1,129,"['When, the, power, is, turned, on, and, the, heating, button, is, pressed,, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on.'}]","'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",When the power is turned on and the heating button is pressed the sparks are scattered and the power is no longer turned on,en,"[When, the, power, is, turned, on, and, the, heating, button, is, pressed, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on]","[when, the, power, is, turned, on, and, the, heating, button, is, pressed, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on]"


In [13]:
# NLTK's English stopword list, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))
# Drop stopwords from the lowercased tokens.
# NOTE(review): the displayed output shows negations being removed too
# ("i can not use it" -> [..., use]); confirm this is intended for the labels.
all_en_dfs['stopwords_removed'] = all_en_dfs['lower'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]
)
all_en_dfs.head()

Unnamed: 0,comments,lang,label,totalwords,comments_no_contraction,comments_str,comments_no_punc,lang_detect,tokenized,lower,stopwords_removed
3375,"""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. Isn't it necessary to write the continuous usage time?""}",en,1,195,"[""When, I, was, working, on, screw, tightening, for, about, an, hour,, smoke, came, out, and, it, became, useless., There, seems, to, be, no, protection, circuit., is not, it, necessary, to, write, the, continuous, usage, time?""}]","""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. is not it necessary to write the continuous usage time?""}",When I was working on screw tightening for about an hour smoke came out and it became useless There seems to be no protection circuit is not it necessary to write the continuous usage time,en,"[When, I, was, working, on, screw, tightening, for, about, an, hour, smoke, came, out, and, it, became, useless, There, seems, to, be, no, protection, circuit, is, not, it, necessary, to, write, the, continuous, usage, time]","[when, i, was, working, on, screw, tightening, for, about, an, hour, smoke, came, out, and, it, became, useless, there, seems, to, be, no, protection, circuit, is, not, it, necessary, to, write, the, continuous, usage, time]","[working, screw, tightening, hour, smoke, came, became, useless, seems, protection, circuit, necessary, write, continuous, usage, time]"
3376,'I had eczema on my body when I use it. You can now contact the shop to get a return.'},en,1,88,"['I, had, eczema, on, my, body, when, I, use, it., You, can, now, contact, the, shop, to, get, a, return.'}]",'I had eczema on my body when I use it. You can now contact the shop to get a return.'},I had eczema on my body when I use it You can now contact the shop to get a return,en,"[I, had, eczema, on, my, body, when, I, use, it, You, can, now, contact, the, shop, to, get, a, return]","[i, had, eczema, on, my, body, when, i, use, it, you, can, now, contact, the, shop, to, get, a, return]","[eczema, body, use, contact, shop, get, return]"
3377,'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},en,1,105,"['A, big, failure., Power, on, after, plugging, in., I, had, a, light, burn., I, made, a, return, procedure, immediately.'}]",'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},A big failure Power on after plugging in I had a light burn I made a return procedure immediately,en,"[A, big, failure, Power, on, after, plugging, in, I, had, a, light, burn, I, made, a, return, procedure, immediately]","[a, big, failure, power, on, after, plugging, in, i, had, a, light, burn, i, made, a, return, procedure, immediately]","[big, failure, power, plugging, light, burn, made, return, procedure, immediately]"
3378,"""It has sparks and it smells burnt. I'm afraid I can't use it.""}",en,1,65,"[""It, has, sparks, and, it, smells, burnt., I am, afraid, I, can not, use, it.""}]","""It has sparks and it smells burnt. I am afraid I can not use it.""}",It has sparks and it smells burnt I am afraid I can not use it,en,"[It, has, sparks, and, it, smells, burnt, I, am, afraid, I, can, not, use, it]","[it, has, sparks, and, it, smells, burnt, i, am, afraid, i, can, not, use, it]","[sparks, smells, burnt, afraid, use]"
3379,"'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",en,1,129,"['When, the, power, is, turned, on, and, the, heating, button, is, pressed,, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on.'}]","'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",When the power is turned on and the heating button is pressed the sparks are scattered and the power is no longer turned on,en,"[When, the, power, is, turned, on, and, the, heating, button, is, pressed, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on]","[when, the, power, is, turned, on, and, the, heating, button, is, pressed, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on]","[power, turned, heating, button, pressed, sparks, scattered, power, longer, turned]"


In [20]:
# Attach a part-of-speech tag to every remaining token; the tags drive the
# lemmatizer further down.
# NOTE(review): tagging runs on lowercased, stopword-stripped tokens, so the
# tagger sees no sentence context — the output shows e.g. ('afraid', 'NN').
all_en_dfs['pos_tags'] = all_en_dfs['stopwords_removed'].apply(
    lambda tokens: nltk.tag.pos_tag(tokens)
)
all_en_dfs.head()

Unnamed: 0,comments,lang,label,totalwords,comments_no_contraction,comments_str,comments_no_punc,lang_detect,tokenized,lower,stopwords_removed,pos_tags
3375,"""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. Isn't it necessary to write the continuous usage time?""}",en,1,195,"[""When, I, was, working, on, screw, tightening, for, about, an, hour,, smoke, came, out, and, it, became, useless., There, seems, to, be, no, protection, circuit., is not, it, necessary, to, write, the, continuous, usage, time?""}]","""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. is not it necessary to write the continuous usage time?""}",When I was working on screw tightening for about an hour smoke came out and it became useless There seems to be no protection circuit is not it necessary to write the continuous usage time,en,"[When, I, was, working, on, screw, tightening, for, about, an, hour, smoke, came, out, and, it, became, useless, There, seems, to, be, no, protection, circuit, is, not, it, necessary, to, write, the, continuous, usage, time]","[when, i, was, working, on, screw, tightening, for, about, an, hour, smoke, came, out, and, it, became, useless, there, seems, to, be, no, protection, circuit, is, not, it, necessary, to, write, the, continuous, usage, time]","[working, screw, tightening, hour, smoke, came, became, useless, seems, protection, circuit, necessary, write, continuous, usage, time]","[(working, VBG), (screw, VBD), (tightening, VBG), (hour, NN), (smoke, NN), (came, VBD), (became, VBD), (useless, JJ), (seems, VBZ), (protection, NN), (circuit, NN), (necessary, JJ), (write, RB), (continuous, JJ), (usage, NN), (time, NN)]"
3376,'I had eczema on my body when I use it. You can now contact the shop to get a return.'},en,1,88,"['I, had, eczema, on, my, body, when, I, use, it., You, can, now, contact, the, shop, to, get, a, return.'}]",'I had eczema on my body when I use it. You can now contact the shop to get a return.'},I had eczema on my body when I use it You can now contact the shop to get a return,en,"[I, had, eczema, on, my, body, when, I, use, it, You, can, now, contact, the, shop, to, get, a, return]","[i, had, eczema, on, my, body, when, i, use, it, you, can, now, contact, the, shop, to, get, a, return]","[eczema, body, use, contact, shop, get, return]","[(eczema, NN), (body, NN), (use, NN), (contact, NN), (shop, NN), (get, VBP), (return, NN)]"
3377,'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},en,1,105,"['A, big, failure., Power, on, after, plugging, in., I, had, a, light, burn., I, made, a, return, procedure, immediately.'}]",'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},A big failure Power on after plugging in I had a light burn I made a return procedure immediately,en,"[A, big, failure, Power, on, after, plugging, in, I, had, a, light, burn, I, made, a, return, procedure, immediately]","[a, big, failure, power, on, after, plugging, in, i, had, a, light, burn, i, made, a, return, procedure, immediately]","[big, failure, power, plugging, light, burn, made, return, procedure, immediately]","[(big, JJ), (failure, NN), (power, NN), (plugging, VBG), (light, JJ), (burn, NN), (made, VBD), (return, JJ), (procedure, NN), (immediately, RB)]"
3378,"""It has sparks and it smells burnt. I'm afraid I can't use it.""}",en,1,65,"[""It, has, sparks, and, it, smells, burnt., I am, afraid, I, can not, use, it.""}]","""It has sparks and it smells burnt. I am afraid I can not use it.""}",It has sparks and it smells burnt I am afraid I can not use it,en,"[It, has, sparks, and, it, smells, burnt, I, am, afraid, I, can, not, use, it]","[it, has, sparks, and, it, smells, burnt, i, am, afraid, i, can, not, use, it]","[sparks, smells, burnt, afraid, use]","[(sparks, NNS), (smells, VBZ), (burnt, NN), (afraid, NN), (use, NN)]"
3379,"'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",en,1,129,"['When, the, power, is, turned, on, and, the, heating, button, is, pressed,, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on.'}]","'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",When the power is turned on and the heating button is pressed the sparks are scattered and the power is no longer turned on,en,"[When, the, power, is, turned, on, and, the, heating, button, is, pressed, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on]","[when, the, power, is, turned, on, and, the, heating, button, is, pressed, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on]","[power, turned, heating, button, pressed, sparks, scattered, power, longer, turned]","[(power, NN), (turned, VBD), (heating, VBG), (button, NN), (pressed, VBN), (sparks, NNS), (scattered, VBD), (power, NN), (longer, RBR), (turned, VBD)]"


In [21]:
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively;
    every other tag falls back to ``wordnet.NOUN``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [22]:
# Swap each token's POS tag for the WordNet POS constant the lemmatizer expects.
all_en_dfs['wordnet_pos'] = all_en_dfs['pos_tags'].apply(
    lambda tagged: [(token, get_wordnet_pos(tag_name)) for token, tag_name in tagged]
)
all_en_dfs.head()

Unnamed: 0,comments,lang,label,totalwords,comments_no_contraction,comments_str,comments_no_punc,lang_detect,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos
3375,"""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. Isn't it necessary to write the continuous usage time?""}",en,1,195,"[""When, I, was, working, on, screw, tightening, for, about, an, hour,, smoke, came, out, and, it, became, useless., There, seems, to, be, no, protection, circuit., is not, it, necessary, to, write, the, continuous, usage, time?""}]","""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. is not it necessary to write the continuous usage time?""}",When I was working on screw tightening for about an hour smoke came out and it became useless There seems to be no protection circuit is not it necessary to write the continuous usage time,en,"[When, I, was, working, on, screw, tightening, for, about, an, hour, smoke, came, out, and, it, became, useless, There, seems, to, be, no, protection, circuit, is, not, it, necessary, to, write, the, continuous, usage, time]","[when, i, was, working, on, screw, tightening, for, about, an, hour, smoke, came, out, and, it, became, useless, there, seems, to, be, no, protection, circuit, is, not, it, necessary, to, write, the, continuous, usage, time]","[working, screw, tightening, hour, smoke, came, became, useless, seems, protection, circuit, necessary, write, continuous, usage, time]","[(working, VBG), (screw, VBD), (tightening, VBG), (hour, NN), (smoke, NN), (came, VBD), (became, VBD), (useless, JJ), (seems, VBZ), (protection, NN), (circuit, NN), (necessary, JJ), (write, RB), (continuous, JJ), (usage, NN), (time, NN)]","[(working, v), (screw, v), (tightening, v), (hour, n), (smoke, n), (came, v), (became, v), (useless, a), (seems, v), (protection, n), (circuit, n), (necessary, a), (write, r), (continuous, a), (usage, n), (time, n)]"
3376,'I had eczema on my body when I use it. You can now contact the shop to get a return.'},en,1,88,"['I, had, eczema, on, my, body, when, I, use, it., You, can, now, contact, the, shop, to, get, a, return.'}]",'I had eczema on my body when I use it. You can now contact the shop to get a return.'},I had eczema on my body when I use it You can now contact the shop to get a return,en,"[I, had, eczema, on, my, body, when, I, use, it, You, can, now, contact, the, shop, to, get, a, return]","[i, had, eczema, on, my, body, when, i, use, it, you, can, now, contact, the, shop, to, get, a, return]","[eczema, body, use, contact, shop, get, return]","[(eczema, NN), (body, NN), (use, NN), (contact, NN), (shop, NN), (get, VBP), (return, NN)]","[(eczema, n), (body, n), (use, n), (contact, n), (shop, n), (get, v), (return, n)]"
3377,'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},en,1,105,"['A, big, failure., Power, on, after, plugging, in., I, had, a, light, burn., I, made, a, return, procedure, immediately.'}]",'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},A big failure Power on after plugging in I had a light burn I made a return procedure immediately,en,"[A, big, failure, Power, on, after, plugging, in, I, had, a, light, burn, I, made, a, return, procedure, immediately]","[a, big, failure, power, on, after, plugging, in, i, had, a, light, burn, i, made, a, return, procedure, immediately]","[big, failure, power, plugging, light, burn, made, return, procedure, immediately]","[(big, JJ), (failure, NN), (power, NN), (plugging, VBG), (light, JJ), (burn, NN), (made, VBD), (return, JJ), (procedure, NN), (immediately, RB)]","[(big, a), (failure, n), (power, n), (plugging, v), (light, a), (burn, n), (made, v), (return, a), (procedure, n), (immediately, r)]"
3378,"""It has sparks and it smells burnt. I'm afraid I can't use it.""}",en,1,65,"[""It, has, sparks, and, it, smells, burnt., I am, afraid, I, can not, use, it.""}]","""It has sparks and it smells burnt. I am afraid I can not use it.""}",It has sparks and it smells burnt I am afraid I can not use it,en,"[It, has, sparks, and, it, smells, burnt, I, am, afraid, I, can, not, use, it]","[it, has, sparks, and, it, smells, burnt, i, am, afraid, i, can, not, use, it]","[sparks, smells, burnt, afraid, use]","[(sparks, NNS), (smells, VBZ), (burnt, NN), (afraid, NN), (use, NN)]","[(sparks, n), (smells, v), (burnt, n), (afraid, n), (use, n)]"
3379,"'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",en,1,129,"['When, the, power, is, turned, on, and, the, heating, button, is, pressed,, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on.'}]","'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",When the power is turned on and the heating button is pressed the sparks are scattered and the power is no longer turned on,en,"[When, the, power, is, turned, on, and, the, heating, button, is, pressed, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on]","[when, the, power, is, turned, on, and, the, heating, button, is, pressed, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on]","[power, turned, heating, button, pressed, sparks, scattered, power, longer, turned]","[(power, NN), (turned, VBD), (heating, VBG), (button, NN), (pressed, VBN), (sparks, NNS), (scattered, VBD), (power, NN), (longer, RBR), (turned, VBD)]","[(power, n), (turned, v), (heating, v), (button, n), (pressed, v), (sparks, n), (scattered, v), (power, n), (longer, r), (turned, v)]"


In [23]:
# Lemmatize every (token, WordNet POS) pair, then rebuild one
# space-separated string per comment.
wnl = WordNetLemmatizer()
all_en_dfs['lemmatized'] = all_en_dfs['wordnet_pos'].apply(
    lambda tagged: [wnl.lemmatize(token, wn_pos) for token, wn_pos in tagged]
)
all_en_dfs['lemmatized_comments'] = [
    ' '.join(str(lemma) for lemma in lemmas)
    for lemmas in all_en_dfs['lemmatized']
]
all_en_dfs.head()

Unnamed: 0,comments,lang,label,totalwords,comments_no_contraction,comments_str,comments_no_punc,lang_detect,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemmatized_comments
3375,"""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. Isn't it necessary to write the continuous usage time?""}",en,1,195,"[""When, I, was, working, on, screw, tightening, for, about, an, hour,, smoke, came, out, and, it, became, useless., There, seems, to, be, no, protection, circuit., is not, it, necessary, to, write, the, continuous, usage, time?""}]","""When I was working on screw tightening for about an hour, smoke came out and it became useless. There seems to be no protection circuit. is not it necessary to write the continuous usage time?""}",When I was working on screw tightening for about an hour smoke came out and it became useless There seems to be no protection circuit is not it necessary to write the continuous usage time,en,"[When, I, was, working, on, screw, tightening, for, about, an, hour, smoke, came, out, and, it, became, useless, There, seems, to, be, no, protection, circuit, is, not, it, necessary, to, write, the, continuous, usage, time]","[when, i, was, working, on, screw, tightening, for, about, an, hour, smoke, came, out, and, it, became, useless, there, seems, to, be, no, protection, circuit, is, not, it, necessary, to, write, the, continuous, usage, time]","[working, screw, tightening, hour, smoke, came, became, useless, seems, protection, circuit, necessary, write, continuous, usage, time]","[(working, VBG), (screw, VBD), (tightening, VBG), (hour, NN), (smoke, NN), (came, VBD), (became, VBD), (useless, JJ), (seems, VBZ), (protection, NN), (circuit, NN), (necessary, JJ), (write, RB), (continuous, JJ), (usage, NN), (time, NN)]","[(working, v), (screw, v), (tightening, v), (hour, n), (smoke, n), (came, v), (became, v), (useless, a), (seems, v), (protection, n), (circuit, n), (necessary, a), (write, r), (continuous, a), (usage, n), (time, n)]","[work, screw, tighten, hour, smoke, come, become, useless, seem, protection, circuit, necessary, write, 
continuous, usage, time]",work screw tighten hour smoke come become useless seem protection circuit necessary write continuous usage time
3376,'I had eczema on my body when I use it. You can now contact the shop to get a return.'},en,1,88,"['I, had, eczema, on, my, body, when, I, use, it., You, can, now, contact, the, shop, to, get, a, return.'}]",'I had eczema on my body when I use it. You can now contact the shop to get a return.'},I had eczema on my body when I use it You can now contact the shop to get a return,en,"[I, had, eczema, on, my, body, when, I, use, it, You, can, now, contact, the, shop, to, get, a, return]","[i, had, eczema, on, my, body, when, i, use, it, you, can, now, contact, the, shop, to, get, a, return]","[eczema, body, use, contact, shop, get, return]","[(eczema, NN), (body, NN), (use, NN), (contact, NN), (shop, NN), (get, VBP), (return, NN)]","[(eczema, n), (body, n), (use, n), (contact, n), (shop, n), (get, v), (return, n)]","[eczema, body, use, contact, shop, get, return]",eczema body use contact shop get return
3377,'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},en,1,105,"['A, big, failure., Power, on, after, plugging, in., I, had, a, light, burn., I, made, a, return, procedure, immediately.'}]",'A big failure. Power on after plugging in. I had a light burn. I made a return procedure immediately.'},A big failure Power on after plugging in I had a light burn I made a return procedure immediately,en,"[A, big, failure, Power, on, after, plugging, in, I, had, a, light, burn, I, made, a, return, procedure, immediately]","[a, big, failure, power, on, after, plugging, in, i, had, a, light, burn, i, made, a, return, procedure, immediately]","[big, failure, power, plugging, light, burn, made, return, procedure, immediately]","[(big, JJ), (failure, NN), (power, NN), (plugging, VBG), (light, JJ), (burn, NN), (made, VBD), (return, JJ), (procedure, NN), (immediately, RB)]","[(big, a), (failure, n), (power, n), (plugging, v), (light, a), (burn, n), (made, v), (return, a), (procedure, n), (immediately, r)]","[big, failure, power, plug, light, burn, make, return, procedure, immediately]",big failure power plug light burn make return procedure immediately
3378,"""It has sparks and it smells burnt. I'm afraid I can't use it.""}",en,1,65,"[""It, has, sparks, and, it, smells, burnt., I am, afraid, I, can not, use, it.""}]","""It has sparks and it smells burnt. I am afraid I can not use it.""}",It has sparks and it smells burnt I am afraid I can not use it,en,"[It, has, sparks, and, it, smells, burnt, I, am, afraid, I, can, not, use, it]","[it, has, sparks, and, it, smells, burnt, i, am, afraid, i, can, not, use, it]","[sparks, smells, burnt, afraid, use]","[(sparks, NNS), (smells, VBZ), (burnt, NN), (afraid, NN), (use, NN)]","[(sparks, n), (smells, v), (burnt, n), (afraid, n), (use, n)]","[spark, smell, burnt, afraid, use]",spark smell burnt afraid use
3379,"'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",en,1,129,"['When, the, power, is, turned, on, and, the, heating, button, is, pressed,, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on.'}]","'When the power is turned on and the heating button is pressed, the sparks are scattered and the power is no longer turned on.'}",When the power is turned on and the heating button is pressed the sparks are scattered and the power is no longer turned on,en,"[When, the, power, is, turned, on, and, the, heating, button, is, pressed, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on]","[when, the, power, is, turned, on, and, the, heating, button, is, pressed, the, sparks, are, scattered, and, the, power, is, no, longer, turned, on]","[power, turned, heating, button, pressed, sparks, scattered, power, longer, turned]","[(power, NN), (turned, VBD), (heating, VBG), (button, NN), (pressed, VBN), (sparks, NNS), (scattered, VBD), (power, NN), (longer, RBR), (turned, VBD)]","[(power, n), (turned, v), (heating, v), (button, n), (pressed, v), (sparks, n), (scattered, v), (power, n), (longer, r), (turned, v)]","[power, turn, heat, button, press, spark, scatter, power, longer, turn]",power turn heat button press spark scatter power longer turn


In [None]:
# Keep only the columns needed downstream: cleaned text, label and the
# original comment length (exposed as 'init_totalwords').
# The length column may still be called 'totalwords' or may already have been
# renamed to 'init_totalwords' by the earlier column-rename step; accept
# either so the cell runs on a fresh kernel and can be re-run safely.
length_col = 'totalwords' if 'totalwords' in all_en_dfs.columns else 'init_totalwords'
# Chained rename (no inplace=True) returns a new frame instead of mutating a
# slice of the original, which avoids SettingWithCopyWarning.
all_en_dfs = (
    all_en_dfs[['lemmatized_comments', 'label', length_col]]
    .rename(columns = {length_col: 'init_totalwords'})
)

In [27]:
# Persist the cleaned English comments frame as a pickle for later steps.
# The ./clean_data directory must already exist.
clean_pickle_path = './clean_data/en_comments_clean.pkl'
with open(clean_pickle_path, 'wb') as out_fh:
    pickle.dump(all_en_dfs, out_fh)