In [5]:
import pandas as pd

In [6]:
df = pd.read_csv('arab_dataset/ar_dataset.csv')

In [7]:
df.head()

Unnamed: 0,HITId,tweet,sentiment,directness,annotator_sentiment,target,group
0,1,صلاة الفجر خير لك من ترديد بول البعير وسبي الن...,hateful_normal,indirect,shock,gender,individual
1,2,صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...,offensive,indirect,anger_confusion_sadness_indifference_disgust,other,other
2,3,طيب! هي متبرجة وعبايتها ملونه وطالعة من بيتهم ...,offensive,indirect,indifference,other,individual
3,4,@user @user انا اوافقك بخصوص السوريين و العراق...,normal,direct,indifference,origin,other
4,5,هذه السعودية التي شعبها شعب الخيم و بول البعير...,normal,indirect,indifference,origin,other


In [8]:
# label utils

# multi label
sentiment_dict ={
          'abusive':0 ,'hateful':1 ,'offensive':2 ,
          'disrespectful':3 ,'fearful':4 ,'normal':5}

group_dict ={'individual':0 ,'other':1 ,'women':2 ,'african_descent':3}

target_dict ={'other':0 ,'origin':1 ,'gender':2 ,'religion':3 }

# multi label
annotator_dict ={
              'disgust':0 ,'shock':1 , 'anger':2 ,
              'sadness':3 ,'fear':4,'confusion':5,'indifference':6}

directness_dict ={ 'indirect':0 , 'direct':1}

In [26]:
def prepare_datset( df ):

    # main categories for sentiment
    dataset = []
    
    # prepare the dataset for the sentiment analysis
    for i_tweet , i_sentiment , i_directness , i_annotator , i_target , i_group in zip( df.tweet , df.sentiment , df.directness , df.annotator_sentiment , df.target , df.group ) :
        
        # sentiment labels
        multi_sentiments =  i_sentiment.split("_")
        multi_annotator = i_annotator.split("_")
        
        if( i_target != 'disability' and ( i_group in list( group_dict.keys() ) )  ) :
        
            for i_sent in multi_sentiments :
                for i_anno  in multi_annotator :

                    i_sample = {
                          'tweet': i_tweet ,
                          'sentiment': i_sent ,
                          'sentiment_score': sentiment_dict[ i_sent ] ,
                          'annotator': i_anno ,
                          'annotator_score': annotator_dict[ i_anno ] ,
                          'directness': i_directness ,
                          'directness_score' :directness_dict[ i_directness ] ,
                          'group': i_group ,
                          'group_score' : group_dict[ i_group ] ,
                          'target': i_target ,
                          'target_score' : target_dict[ i_target ]
                            }
                    
                    # add the samples into the array
                    dataset.append( i_sample )

    # create the dataframe
    df = pd.DataFrame( dataset ) 

    return df

In [27]:
df_final = prepare_datset( df ) 

In [28]:
df_final

Unnamed: 0,tweet,sentiment,sentiment_score,annotator,annotator_score,directness,directness_score,group,group_score,target,target_score
0,صلاة الفجر خير لك من ترديد بول البعير وسبي الن...,hateful,1,shock,1,indirect,0,individual,0,gender,2
1,صلاة الفجر خير لك من ترديد بول البعير وسبي الن...,normal,5,shock,1,indirect,0,individual,0,gender,2
2,صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...,offensive,2,anger,2,indirect,0,other,1,other,0
3,صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...,offensive,2,confusion,5,indirect,0,other,1,other,0
4,صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...,offensive,2,sadness,3,indirect,0,other,1,other,0
...,...,...,...,...,...,...,...,...,...,...,...
5486,@user مهما حصل هندوس عليهم شويه الرويبضه بس نخ...,hateful,1,indifference,6,indirect,0,other,1,other,0
5487,الكلب لا يعض اذن اخوه كذابين خنازير @url,offensive,2,disgust,0,indirect,0,other,1,other,0
5488,الكلب لا يعض اذن اخوه كذابين خنازير @url,hateful,1,disgust,0,indirect,0,other,1,other,0
5489,@user لأنه صغير ويكتب في قناة خنازير فلن نقرأ ...,hateful,1,shock,1,indirect,0,other,1,other,0


In [35]:
# Creating the dataset and dataloader for the neural network
from sklearn.model_selection import train_test_split


train_dataset , valid_dataset = train_test_split( df_final  , test_size=0.2 , random_state=42  )

test_dataset , valid_dataset = train_test_split( valid_dataset , test_size=0.5 , random_state=42 , )


train_dataset = train_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)
valid_dataset = valid_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format( df_final.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALID Dataset: {}".format(valid_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (5491, 11)
TRAIN Dataset: (4392, 11)
VALID Dataset: (550, 11)
TEST Dataset: (549, 11)


In [36]:
train_dataset.to_csv( "arab_trainset.csv" , index=False ) 
valid_dataset.to_csv( "arab_validset.csv" , index=False ) 
test_dataset.to_csv( "arab_testset.csv" , index=False ) 

In [42]:
train_dataset.target_score.value_counts()

0    2290
1    1105
2     871
3     126
Name: target_score, dtype: int64

In [44]:
a = [1,2,4,5,6,7]
a[0:3]

[1, 2, 4]

In [2]:
list( {'abusive':0 ,'hateful':1 ,'offensive':2 ,'disrespectful':3 ,'fearful':4 ,'normal':5}.keys() )

['abusive', 'hateful', 'offensive', 'disrespectful', 'fearful', 'normal']