In [43]:
import pandas as pd
df = pd.read_csv('complaints_cleaned.csv')


In [44]:

df.head()

Unnamed: 0,product,consumer_complaint_narrative
0,debt collection,has claimed i owe them {$27.00} for years des...
1,consumer loan,due to inconsistencies in the amount owed that...
2,mortgage,in my wages that i earned at my job decreased...
3,mortgage,i have an open and current mortgage with chase...
4,mortgage,was submitted . at the time i submitted this c...


In [28]:
df.to_json('complaints_cleaned.json')

In [29]:
df.reset_index(drop=True, inplace=True)

In [30]:
df.head()

Unnamed: 0,consumer_complaint_narrative
0,has claimed i owe them {$27.00} for years des...
1,due to inconsistencies in the amount owed that...
2,in my wages that i earned at my job decreased...
3,i have an open and current mortgage with chase...
4,was submitted . at the time i submitted this c...


In [24]:

df = pd.read_csv('complaints_cleaned.csv')
df.head()

Unnamed: 0,consumer_complaint_narrative
0,has claimed i owe them {$27.00} for years des...
1,due to inconsistencies in the amount owed that...
2,in my wages that i earned at my job decreased...
3,i have an open and current mortgage with chase...
4,was submitted . at the time i submitted this c...


In [18]:
separated_data = [[sub_el for sub_el in el.strip(',') if ',' not in sub_el] 
                    for el in df['product']]
# separated_data is [['a', 'b'], ['b'], ['c', 'd'], ['a', 'c']]


# 2. (optional) find the set of keys contained in your dataframe,
#        if you don't already know that
keys = set([key for sublist in separated_data for key in sublist])
# keys is {'a', 'b', 'c', 'd'}


# 3. Create a dictionary, where the each character is a key and each value
#     is a list. The n-th value of the list says 1 if the character is
#     contained in the n-th row, 0 otherwise
columns = {key: [1 if key in sublist else 0 for sublist in separated_data] 
                for key in keys}
              
# columns is {'a': [1, 0, 0, 1], 'b': [1, 1, 0, 0], 'c': [0, 0, 1, 1], 'd': [0, 0, 1, 0]}


# 4. Your dataframe
onehot_dataframe = pd.DataFrame(columns)
onehot_dataframe.head()

Unnamed: 0,Unnamed: 1,a,b,c,d,e,f,g,h,i,...,m,n,o,p,r,s,t,u,v,y
0,1,0,1,1,1,1,0,0,0,1,...,0,1,1,0,0,0,1,0,0,0
1,1,1,0,1,0,1,0,0,0,0,...,1,1,1,0,1,1,0,1,0,0
2,0,1,0,0,0,1,0,1,0,0,...,1,0,1,0,1,0,1,0,0,0
3,0,1,0,0,0,1,0,1,0,0,...,1,0,1,0,1,0,1,0,0,0
4,0,1,0,0,0,1,0,1,0,0,...,1,0,1,0,1,0,1,0,0,0


In [45]:
# Get one hot encoding of columns B
one_hot = pd.get_dummies(df['product'])
# Drop column B as it is now encoded
df = df.drop('product',axis = 1)
# Join the encoded df
df1 = df.join(one_hot)
df1.head()  

Unnamed: 0,consumer_complaint_narrative,bank account or service,consumer loan,credit card,credit reporting,debt collection,money transfers,mortgage,other financial service,payday loan,prepaid card,student loan
0,has claimed i owe them {$27.00} for years des...,0,0,0,0,1,0,0,0,0,0,0
1,due to inconsistencies in the amount owed that...,0,1,0,0,0,0,0,0,0,0,0
2,in my wages that i earned at my job decreased...,0,0,0,0,0,0,1,0,0,0,0
3,i have an open and current mortgage with chase...,0,0,0,0,0,0,1,0,0,0,0
4,was submitted . at the time i submitted this c...,0,0,0,0,0,0,1,0,0,0,0


In [46]:
df1.to_csv('complaints_onehot.csv')

In [36]:
df2 = df1.tail(10000)
df1 = df1.head(56806)
df3 = df2.tail(3000)
df2 = df2.head(7000)


df1.to_csv('complaints_one_train.csv')
df2.to_csv('complaints_one_dev.csv')
df3.to_csv('complaints_one_test.csv')

In [21]:
from torchtext.data import Field
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
 
LABEL = Field(sequential=False, use_vocab=False)

In [38]:
from torchtext.data import TabularDataset
 
tv_datafields = [ # we won't be needing the id, so we pass in None as the field
                 ("consumer_complaint_narrative", TEXT), ("bank account or service", LABEL),
                 ("consumer loan", LABEL), ("credit card", LABEL),
                 ("credit reporting", LABEL), ("debt collection", LABEL),
                 ("money transfers", LABEL), ("mortgage", LABEL), ("other financial service", LABEL)
                , ("payday loan", LABEL), ("prepaid card", LABEL), ("student loan", LABEL)]
trn, vld = TabularDataset.splits(
               path="", # the root directory where the data lies
               train='complaints_one_train.csv', validation="complaints_one_dev.csv",
               format='csv',
               skip_header=True, 
               fields=tv_datafields)
 
tst_datafields = [ # we won't be needing the id, so we pass in None as the field
                  ("consumer_complaint_narrative", TEXT)]
tst = TabularDataset(
           path="complaints_one_test.csv", # the file path
           format='csv',
           skip_header=True, fields=tst_datafields)

In [39]:
trn[0].__dict__.keys()

dict_keys(['consumer_complaint_narrative', 'bank account or service', 'consumer loan', 'credit card', 'credit reporting', 'debt collection', 'money transfers', 'mortgage', 'other financial service', 'payday loan', 'prepaid card', 'student loan'])

In [40]:
TEXT.build_vocab(trn)

In [41]:
from torchtext.data import Iterator, BucketIterator
 
train_iter, val_iter = BucketIterator.splits(
 (trn, vld), # we pass in the datasets we want the iterator to draw data from
 batch_sizes=(64, 64),
 device=-1, # if you want to use the GPU, specify the GPU number here
 sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
 sort_within_batch=False,
 repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)
test_iter = Iterator(tst, batch_size=64, device=-1, sort=False, sort_within_batch=False, repeat=False)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [42]:
class BatchWrapper:
      def __init__(self, dl, x_var, y_vars):
            self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x &amp;amp;amp;amp;lt;g class="gr_ gr_3178 gr-alert gr_spell gr_inline_cards gr_disable_anim_appear ContextualSpelling ins-del" id="3178" data-gr-id="3178"&amp;amp;amp;amp;gt;and y&amp;amp;amp;amp;lt;/g&amp;amp;amp;amp;gt;
  
      def __iter__(self):
            for batch in self.dl:
                  x = getattr(batch, self.x_var) # we assume only one input in this wrapper
  
                  if self.y_vars is &amp;amp;amp;amp;lt;g class="gr_ gr_3177 gr-alert gr_gramm gr_inline_cards gr_disable_anim_appear Grammar replaceWithoutSep" id="3177" data-gr-id="3177"&amp;amp;amp;amp;gt;not&amp;amp;amp;amp;lt;/g&amp;amp;amp;amp;gt; None: # we will concatenate y into a single tensor
                        y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
                  else:
                        y = torch.zeros((1))
 
                  yield (x, y)
  
      def __len__(self):
            return len(self.dl)
train_dl = BatchWrapper(train_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
valid_dl = BatchWrapper(val_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
test_dl = BatchWrapper(test_iter, "comment_text", None)

SyntaxError: invalid syntax (<ipython-input-42-d7aaa2d25ab7>, line 9)

In [55]:
from io import StringIO
df = pd.read_csv('complaints_cleaned.csv')
col = ['product', 'consumer_complaint_narrative']
df = df[col]
df = df[pd.notnull(df['consumer_complaint_narrative'])]
df.columns = ['product', 'consumer_complaint_narrative']
df['category_id'] = df['product'].factorize()[0]
category_id_df = df[['product', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'product']].values)

df.head()

Unnamed: 0,product,consumer_complaint_narrative,category_id
0,debt collection,has claimed i owe them {$27.00} for years des...,0
1,consumer loan,due to inconsistencies in the amount owed that...,1
2,mortgage,in my wages that i earned at my job decreased...,2
3,mortgage,i have an open and current mortgage with chase...,2
4,mortgage,was submitted . at the time i submitted this c...,2


In [56]:
df.to_csv('complaints_cleaned_label.csv', index=False)

In [52]:
df['category_id'] += 1
df.head()

Unnamed: 0,product,consumer_complaint_narrative,category_id
0,debt collection,has claimed i owe them {$27.00} for years des...,1
1,consumer loan,due to inconsistencies in the amount owed that...,2
2,mortgage,in my wages that i earned at my job decreased...,3
3,mortgage,i have an open and current mortgage with chase...,3
4,mortgage,was submitted . at the time i submitted this c...,3


In [54]:
df.to_csv('complaints_cleaned_label.csv')