In [1]:
import numpy as np
import pandas as pd
import torch

# Load the annotation file into a DataFrame, supplying the column names explicitly.
df = pd.read_csv(
    "smile-annotations-final.csv",
    names=["id", "text", "category"],
)

# Preview the first rows.
df.head()

Unnamed: 0,id,text,category
0,611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
1,614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
2,614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
3,614877582664835073,@Sofabsports thank you for following me back. ...,happy
4,611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [2]:
# Count rows per raw category label to inspect the class balance.
df["category"].value_counts()

category
nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: count, dtype: int64

In [3]:
# Keep only rows whose category is one of the six single labels listed here;
# every other value (e.g. 'nocode' and the pipe-joined combinations) is dropped.
# .copy() detaches the filtered frame from the original so that the column
# assignments in later cells operate on an independent DataFrame instead of a
# slice (avoids SettingWithCopyWarning / ambiguous view-vs-copy writes).
df = df[
    df.category.isin(['happy', 'not-relevant', 'angry', 'surprise', 'sad', 'disgust'])
].copy()

In [4]:
# Re-check the per-category counts after filtering.
df["category"].value_counts()

category
happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: count, dtype: int64

In [5]:
# Preview the first ten rows of the filtered frame.
df.head(n=10)

Unnamed: 0,id,text,category
1,614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
2,614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
3,614877582664835073,@Sofabsports thank you for following me back. ...,happy
4,611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy
5,611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy
9,614499696015503361,Lucky @FitzMuseum_UK! Good luck @MirandaStearn...,happy
12,613601881441570816,Yr 9 art students are off to the @britishmuseu...,happy
15,613696526297210880,@RAMMuseum Please vote for us as @sainsbury #s...,not-relevant
16,610746718641102848,#AskTheGallery Have you got plans to privatise...,not-relevant
18,612648200588038144,@BarbyWT @britishmuseum so beautiful,happy


In [6]:
# Distinct category values present in the frame.
possible_labels = df["category"].unique()

In [7]:
# Assign each distinct label a consecutive integer id, following its position
# in possible_labels.
label_dict = {label: position for position, label in enumerate(possible_labels)}

In [8]:
# Show the category-name -> integer-id mapping.
label_dict

{'happy': 0,
 'not-relevant': 1,
 'angry': 2,
 'disgust': 3,
 'sad': 4,
 'surprise': 5}

In [9]:
# Replace each category string with its integer id from label_dict.
# Values that are not keys of label_dict (e.g. ids produced by a previous run of
# this cell) are passed through unchanged, so re-running the cell is a no-op
# rather than turning the whole column into NaN as a plain .map(label_dict) would.
df["category"] = df["category"].map(lambda label: label_dict.get(label, label))

In [10]:
# Preview the first ten rows after `category` has been mapped through label_dict.
df.head(n=10)

Unnamed: 0,id,text,category
1,614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,0
2,614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,0
3,614877582664835073,@Sofabsports thank you for following me back. ...,0
4,611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,0
5,611570404268883969,@NationalGallery @ThePoldarkian I have always ...,0
9,614499696015503361,Lucky @FitzMuseum_UK! Good luck @MirandaStearn...,0
12,613601881441570816,Yr 9 art students are off to the @britishmuseu...,0
15,613696526297210880,@RAMMuseum Please vote for us as @sainsbury #s...,1
16,610746718641102848,#AskTheGallery Have you got plans to privatise...,1
18,612648200588038144,@BarbyWT @britishmuseum so beautiful,0


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Split the row index labels (not the text itself) into train/validation parts.
# 15% of the rows go to validation; stratifying on the category keeps the class
# proportions the same in both parts, and the fixed random_state makes the split
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df["category"].values,
    test_size=0.15,
    random_state=42,
    stratify=df["category"].values,
)

In [13]:
# Add a placeholder split marker on every row; it is overwritten once the
# train/validation assignment is applied.
df["data_type"] = "not_set"

In [15]:
# Tag each row with the split it was assigned to, using the index labels
# returned by train_test_split.
for split_name, split_index in (("train", X_train), ("val", X_val)):
    df.loc[split_index, "data_type"] = split_name

# Row counts per (category, split) pair to verify the split.
df.groupby(["category", "data_type"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,text
category,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1
0,train,966,966
0,val,171,171
1,train,182,182
1,val,32,32
2,train,48,48
2,val,9,9
3,train,5,5
3,val,1,1
4,train,27,27
4,val,5,5


In [16]:
from transformers import BertTokenizer



In [17]:
# Load the pretrained tokenizer registered under the "bert-base-uncased" name.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [45]:
# Split a sample sentence into the tokenizer's tokens.
tokens = tokenizer.tokenize("I am joy chandra das.")

In [47]:
# Look up the vocabulary id of every token.
ids = tokenizer.convert_tokens_to_ids(tokens)

In [48]:
# Display the vocabulary ids computed for `tokens`.
ids

[1045, 2572, 6569, 16469, 8695, 1012]

In [58]:
# Encode a batch of strings into fixed-length PyTorch tensors
# (input ids, token type ids and attention mask).
# NOTE(review): `tokens` is a list of individual tokens, so each token is treated
# as its own sequence here (one output row per token). If the goal is to encode
# whole sentences, pass a list of raw sentences instead -- confirm intent.
# NOTE(review): the saved output of this cell shows ids that differ from the
# `ids` displayed earlier for `tokens`; re-run from a fresh kernel to refresh it.
encoded_tokens = tokenizer.batch_encode_plus(
    tokens,
    add_special_tokens=True,     # wrap each sequence in the tokenizer's special tokens
    return_attention_mask=True,  # also return the mask separating real tokens from padding
    max_length=5,
    padding="max_length",        # replaces the deprecated pad_to_max_length=True
    truncation=True,             # make truncation to max_length explicit (no implicit-truncation warning)
    return_tensors="pt",
)

In [59]:
# Print the encoded batch (input_ids, token_type_ids, attention_mask).
print(encoded_tokens)

{'input_ids': tensor([[ 101, 1045,  102,    0,    0],
        [ 101, 2293,  102,    0,    0],
        [ 101, 3019,  102,    0,    0],
        [ 101, 2653,  102,    0,    0],
        [ 101, 6364,  102,    0,    0],
        [ 101, 1012,  102,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0]])}
